Diffstat (limited to '')
-rw-r--r--  doc/rados/api/index.rst  25
-rw-r--r--  doc/rados/api/libcephsqlite.rst  454
-rw-r--r--  doc/rados/api/librados-intro.rst  1051
-rw-r--r--  doc/rados/api/librados.rst  187
-rw-r--r--  doc/rados/api/libradospp.rst  9
-rw-r--r--  doc/rados/api/objclass-sdk.rst  39
-rw-r--r--  doc/rados/api/python.rst  428
-rw-r--r--  doc/rados/command/list-inconsistent-obj.json  237
-rw-r--r--  doc/rados/command/list-inconsistent-snap.json  86
-rw-r--r--  doc/rados/configuration/auth-config-ref.rst  379
-rw-r--r--  doc/rados/configuration/bluestore-config-ref.rst  552
-rw-r--r--  doc/rados/configuration/ceph-conf.rst  715
-rw-r--r--  doc/rados/configuration/common.rst  207
-rw-r--r--  doc/rados/configuration/demo-ceph.conf  31
-rw-r--r--  doc/rados/configuration/filestore-config-ref.rst  377
-rw-r--r--  doc/rados/configuration/general-config-ref.rst  19
-rw-r--r--  doc/rados/configuration/index.rst  53
-rw-r--r--  doc/rados/configuration/journal-ref.rst  39
-rw-r--r--  doc/rados/configuration/mclock-config-ref.rst  699
-rw-r--r--  doc/rados/configuration/mon-config-ref.rst  642
-rw-r--r--  doc/rados/configuration/mon-lookup-dns.rst  58
-rw-r--r--  doc/rados/configuration/mon-osd-interaction.rst  245
-rw-r--r--  doc/rados/configuration/msgr2.rst  257
-rw-r--r--  doc/rados/configuration/network-config-ref.rst  355
-rw-r--r--  doc/rados/configuration/osd-config-ref.rst  445
-rw-r--r--  doc/rados/configuration/pool-pg-config-ref.rst  46
-rw-r--r--  doc/rados/configuration/pool-pg.conf  21
-rw-r--r--  doc/rados/configuration/storage-devices.rst  93
-rw-r--r--  doc/rados/index.rst  81
-rw-r--r--  doc/rados/man/index.rst  32
-rw-r--r--  doc/rados/operations/add-or-rm-mons.rst  458
-rw-r--r--  doc/rados/operations/add-or-rm-osds.rst  419
-rw-r--r--  doc/rados/operations/balancer.rst  221
-rw-r--r--  doc/rados/operations/bluestore-migration.rst  357
-rw-r--r--  doc/rados/operations/cache-tiering.rst  557
-rw-r--r--  doc/rados/operations/change-mon-elections.rst  100
-rw-r--r--  doc/rados/operations/control.rst  665
-rw-r--r--  doc/rados/operations/crush-map-edits.rst  746
-rw-r--r--  doc/rados/operations/crush-map.rst  1147
-rw-r--r--  doc/rados/operations/data-placement.rst  47
-rw-r--r--  doc/rados/operations/devices.rst  227
-rw-r--r--  doc/rados/operations/erasure-code-clay.rst  240
-rw-r--r--  doc/rados/operations/erasure-code-isa.rst  107
-rw-r--r--  doc/rados/operations/erasure-code-jerasure.rst  123
-rw-r--r--  doc/rados/operations/erasure-code-lrc.rst  388
-rw-r--r--  doc/rados/operations/erasure-code-profile.rst  128
-rw-r--r--  doc/rados/operations/erasure-code-shec.rst  145
-rw-r--r--  doc/rados/operations/erasure-code.rst  272
-rw-r--r--  doc/rados/operations/health-checks.rst  1619
-rw-r--r--  doc/rados/operations/index.rst  99
-rw-r--r--  doc/rados/operations/monitoring-osd-pg.rst  556
-rw-r--r--  doc/rados/operations/monitoring.rst  644
-rw-r--r--  doc/rados/operations/operating.rst  174
-rw-r--r--  doc/rados/operations/pg-concepts.rst  104
-rw-r--r--  doc/rados/operations/pg-repair.rst  118
-rw-r--r--  doc/rados/operations/pg-states.rst  118
-rw-r--r--  doc/rados/operations/placement-groups.rst  897
-rw-r--r--  doc/rados/operations/pools.rst  751
-rw-r--r--  doc/rados/operations/read-balancer.rst  64
-rw-r--r--  doc/rados/operations/stretch-mode.rst  262
-rw-r--r--  doc/rados/operations/upmap.rst  113
-rw-r--r--  doc/rados/operations/user-management.rst  840
-rw-r--r--  doc/rados/troubleshooting/community.rst  37
-rw-r--r--  doc/rados/troubleshooting/cpu-profiling.rst  80
-rw-r--r--  doc/rados/troubleshooting/index.rst  19
-rw-r--r--  doc/rados/troubleshooting/log-and-debug.rst  430
-rw-r--r--  doc/rados/troubleshooting/memory-profiling.rst  203
-rw-r--r--  doc/rados/troubleshooting/troubleshooting-mon.rst  713
-rw-r--r--  doc/rados/troubleshooting/troubleshooting-osd.rst  787
-rw-r--r--  doc/rados/troubleshooting/troubleshooting-pg.rst  782
-rw-r--r--  doc/radosgw/STS.rst  297
-rw-r--r--  doc/radosgw/STSLite.rst  196
-rw-r--r--  doc/radosgw/admin.rst  715
-rw-r--r--  doc/radosgw/adminops.rst  2166
-rw-r--r--  doc/radosgw/api.rst  16
-rw-r--r--  doc/radosgw/archive-sync-module.rst  44
-rw-r--r--  doc/radosgw/barbican.rst  123
-rw-r--r--  doc/radosgw/bucketpolicy.rst  216
-rw-r--r--  doc/radosgw/cloud-sync-module.rst  244
-rw-r--r--  doc/radosgw/cloud-transition.rst  368
-rw-r--r--  doc/radosgw/compression.rst  91
-rw-r--r--  doc/radosgw/config-ref.rst  301
-rw-r--r--  doc/radosgw/d3n_datacache.rst  116
-rw-r--r--  doc/radosgw/dynamicresharding.rst  238
-rw-r--r--  doc/radosgw/elastic-sync-module.rst  181
-rw-r--r--  doc/radosgw/encryption.rst  96
-rw-r--r--  doc/radosgw/frontends.rst  163
-rw-r--r--  doc/radosgw/index.rst  87
-rw-r--r--  doc/radosgw/keycloak.rst  138
-rw-r--r--  doc/radosgw/keystone.rst  179
-rw-r--r--  doc/radosgw/kmip.rst  219
-rw-r--r--  doc/radosgw/layout.rst  208
-rw-r--r--  doc/radosgw/ldap-auth.rst  167
-rw-r--r--  doc/radosgw/lua-scripting.rst  570
-rw-r--r--  doc/radosgw/mfa.rst  102
-rw-r--r--  doc/radosgw/multisite-sync-policy.rst  716
-rw-r--r--  doc/radosgw/multisite.rst  1690
-rw-r--r--  doc/radosgw/multitenancy.rst  169
-rw-r--r--  doc/radosgw/nfs.rst  375
-rw-r--r--  doc/radosgw/notifications.rst  547
-rw-r--r--  doc/radosgw/oidc.rst  97
-rw-r--r--  doc/radosgw/opa.rst  72
-rw-r--r--  doc/radosgw/orphans.rst  117
-rw-r--r--  doc/radosgw/placement.rst  263
-rw-r--r--  doc/radosgw/pools.rst  57
-rw-r--r--  doc/radosgw/qat-accel.rst  155
-rw-r--r--  doc/radosgw/rgw-cache.rst  155
-rw-r--r--  doc/radosgw/role.rst  570
-rw-r--r--  doc/radosgw/s3-notification-compatibility.rst  149
-rw-r--r--  doc/radosgw/s3.rst  98
-rw-r--r--  doc/radosgw/s3/authentication.rst  235
-rw-r--r--  doc/radosgw/s3/bucketops.rst  706
-rw-r--r--  doc/radosgw/s3/commons.rst  113
-rw-r--r--  doc/radosgw/s3/cpp.rst  337
-rw-r--r--  doc/radosgw/s3/csharp.rst  199
-rw-r--r--  doc/radosgw/s3/java.rst  212
-rw-r--r--  doc/radosgw/s3/objectops.rst  558
-rw-r--r--  doc/radosgw/s3/perl.rst  192
-rw-r--r--  doc/radosgw/s3/php.rst  214
-rw-r--r--  doc/radosgw/s3/python.rst  197
-rw-r--r--  doc/radosgw/s3/ruby.rst  364
-rw-r--r--  doc/radosgw/s3/serviceops.rst  69
-rw-r--r--  doc/radosgw/s3select.rst  796
-rw-r--r--  doc/radosgw/session-tags.rst  427
-rw-r--r--  doc/radosgw/swift.rst  79
-rw-r--r--  doc/radosgw/swift/auth.rst  82
-rw-r--r--  doc/radosgw/swift/containerops.rst  341
-rw-r--r--  doc/radosgw/swift/java.rst  175
-rw-r--r--  doc/radosgw/swift/objectops.rst  271
-rw-r--r--  doc/radosgw/swift/python.rst  114
-rw-r--r--  doc/radosgw/swift/ruby.rst  119
-rw-r--r--  doc/radosgw/swift/serviceops.rst  76
-rw-r--r--  doc/radosgw/swift/tempurl.rst  102
-rw-r--r--  doc/radosgw/swift/tutorial.rst  62
-rw-r--r--  doc/radosgw/sync-modules.rst  97
-rw-r--r--  doc/radosgw/troubleshooting.rst  208
-rw-r--r--  doc/radosgw/vault.rst  442
137 files changed, 42877 insertions, 0 deletions
diff --git a/doc/rados/api/index.rst b/doc/rados/api/index.rst
new file mode 100644
index 000000000..5422ce871
--- /dev/null
+++ b/doc/rados/api/index.rst
@@ -0,0 +1,25 @@
+.. _rados api:
+
+===========================
+ Ceph Storage Cluster APIs
+===========================
+
+The :term:`Ceph Storage Cluster` has a messaging layer protocol that enables
+clients to interact with a :term:`Ceph Monitor` and a :term:`Ceph OSD Daemon`.
+``librados`` provides this functionality to :term:`Ceph Client`\s in the form of
+a library. All Ceph Clients either use ``librados`` or the same functionality
+encapsulated in ``librados`` to interact with the object store. For example,
+``librbd`` and ``libcephfs`` leverage this functionality. You may use
+``librados`` to interact with Ceph directly (e.g., an application that talks to
+Ceph, your own interface to Ceph, etc.).
+
+
+.. toctree::
+ :maxdepth: 2
+
+ Introduction to librados <librados-intro>
+ librados (C) <librados>
+ librados (C++) <libradospp>
+ librados (Python) <python>
+ libcephsqlite (SQLite) <libcephsqlite>
+ object class <objclass-sdk>
diff --git a/doc/rados/api/libcephsqlite.rst b/doc/rados/api/libcephsqlite.rst
new file mode 100644
index 000000000..beee4a466
--- /dev/null
+++ b/doc/rados/api/libcephsqlite.rst
@@ -0,0 +1,454 @@
+.. _libcephsqlite:
+
+================
+ Ceph SQLite VFS
+================
+
+This `SQLite VFS`_ may be used for storing and accessing a `SQLite`_ database
+backed by RADOS. This allows you to fully decentralize your database using
+Ceph's object store for improved availability, accessibility, and use of
+storage.
+
+Note what this is not: a distributed SQL engine. SQLite on RADOS can be thought
+of like RBD as compared to CephFS: RBD puts a disk image on RADOS for the
+purposes of exclusive access by a machine and generally does not allow parallel
+access by other machines; on the other hand, CephFS allows fully distributed
+access to a file system from many client mounts. SQLite on RADOS is meant to be
+accessed by a single SQLite client database connection at a given time. The
+database may be manipulated safely by multiple clients only in a serial fashion
+controlled by RADOS locks managed by the Ceph SQLite VFS.
+
+
+Usage
+^^^^^
+
+Normal unmodified applications (including the sqlite command-line toolset
+binary) may load the *ceph* VFS using the `SQLite Extension Loading API`_.
+
+.. code:: sql
+
+ .load libcephsqlite.so
+
+or during the invocation of ``sqlite3``
+
+.. code:: sh
+
+ sqlite3 -cmd '.load libcephsqlite.so'
+
+A database file is formatted as a SQLite URI::
+
+ file:///<"*"poolid|poolname>:[namespace]/<dbname>?vfs=ceph
+
+The RADOS ``namespace`` is optional. Note the triple ``///`` in the path. The URI
+authority must be empty or localhost in SQLite. Only the path part of the URI
+is parsed. For this reason, the URI will not parse properly if you only use two
+``//``.
+
+A complete example of (optionally) creating a database and opening:
+
+.. code:: sh
+
+ sqlite3 -cmd '.load libcephsqlite.so' -cmd '.open file:///foo:bar/baz.db?vfs=ceph'
+
+Note you cannot specify the database file as the normal positional argument to
+``sqlite3``. This is because the ``.load libcephsqlite.so`` command is applied
+after opening the database, but opening the database depends on the extension
+being loaded first.
+
+An example passing the pool integer id and no RADOS namespace:
+
+.. code:: sh
+
+ sqlite3 -cmd '.load libcephsqlite.so' -cmd '.open file:///*2:/baz.db?vfs=ceph'
+
+Like other Ceph tools, the *ceph* VFS looks at some environment variables that
+help with configuring which Ceph cluster to communicate with and which
+credential to use. Here would be a typical configuration:
+
+.. code:: sh
+
+ export CEPH_CONF=/path/to/ceph.conf
+ export CEPH_KEYRING=/path/to/ceph.keyring
+ export CEPH_ARGS='--id myclientid'
+ ./runmyapp
+ # or
+ sqlite3 -cmd '.load libcephsqlite.so' -cmd '.open file:///foo:bar/baz.db?vfs=ceph'
+
+By default, the VFS looks at the standard Ceph configuration file path and
+uses the ``client.admin`` user.
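+
+The VFS can also be used programmatically. Below is a minimal Python sketch
+(an illustration, not part of the upstream examples) that assumes
+``libcephsqlite.so`` is resolvable by the dynamic loader and that the pool
+``foo``, namespace ``bar``, and database ``baz.db`` from the examples above
+exist:
+
+.. code:: python
+
+ import sqlite3
+
+ # Loading the extension on any connection registers the "ceph" VFS
+ # process-wide, so a throwaway in-memory connection suffices.
+ boot = sqlite3.connect(":memory:")
+ boot.enable_load_extension(True)
+ boot.load_extension("libcephsqlite.so")
+ boot.enable_load_extension(False)
+
+ # Open the RADOS-backed database through the ceph VFS.
+ db = sqlite3.connect("file:///foo:bar/baz.db?vfs=ceph", uri=True)
+ db.execute("CREATE TABLE IF NOT EXISTS kv (k TEXT PRIMARY KEY, v BLOB)")
+ db.commit()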
+
+
+User
+^^^^
+
+The *ceph* VFS requires a user credential with read access to the monitors, the
+ability to blocklist dead clients of the database, and access to the OSDs
+hosting the database. Such a credential can be created as simply as:
+
+.. code:: sh
+
+ ceph auth get-or-create client.X mon 'allow r, allow command "osd blocklist" with blocklistop=add' osd 'allow rwx'
+
+.. note:: The terminology changed from ``blacklist`` to ``blocklist``; older clusters may require using the old terms.
+
+You may also simplify using the ``simple-rados-client-with-blocklist`` profile:
+
+.. code:: sh
+
+ ceph auth get-or-create client.X mon 'profile simple-rados-client-with-blocklist' osd 'allow rwx'
+
+To learn why blocklisting is necessary, see :ref:`libcephsqlite-corrupt`.
+
+
+Page Size
+^^^^^^^^^
+
+SQLite allows configuring the page size prior to creating a new database. It is
+advisable to increase this config to 65536 (64K) when using RADOS backed
+databases to reduce the number of OSD reads/writes and thereby improve
+throughput and latency.
+
+.. code:: sql
+
+ PRAGMA page_size = 65536
+
+You may also try other values according to your application needs, but note
+that 64K is the maximum imposed by SQLite.
+
+
+Cache
+^^^^^
+
+The ceph VFS does not do any caching of reads or buffering of writes. Instead,
+and more appropriately, the SQLite page cache is used. Its default size is too
+small for most workloads, so you should increase it significantly:
+
+
+.. code:: sql
+
+ PRAGMA cache_size = 4096
+
+This will cache 4096 pages, or 256 MB with a 64K ``page_size``.
+
+
+Journal Persistence
+^^^^^^^^^^^^^^^^^^^
+
+By default, SQLite deletes the journal for every transaction. This can be
+expensive as the *ceph* VFS must delete every object backing the journal for each
+transaction. For this reason, it is much faster and simpler to ask SQLite to
+**persist** the journal. In this mode, SQLite will invalidate the journal via a
+write to its header. This is done as:
+
+.. code:: sql
+
+ PRAGMA journal_mode = PERSIST
+
+The cost of this may be increased unused space according to the high-water size
+of the rollback journal (based on transaction type and size).
+
+
+Exclusive Lock Mode
+^^^^^^^^^^^^^^^^^^^
+
+SQLite operates in a ``NORMAL`` locking mode where each transaction requires
+locking the backing database file. This can add unnecessary overhead to
+transactions when you know there's only ever one user of the database at a
+given time. You can have SQLite lock the database once for the duration of the
+connection using:
+
+.. code:: sql
+
+ PRAGMA locking_mode = EXCLUSIVE
+
+This can more than **halve** the time taken to perform a transaction. Keep in
+mind this prevents other clients from accessing the database.
+
+In this locking mode, each write transaction to the database requires three
+synchronization events: one to write the journal, another to write the
+database file, and a final write to invalidate the journal header (in
+``PERSIST`` journaling mode).
+
+
+WAL Journal
+^^^^^^^^^^^
+
+The `WAL Journal Mode`_ is only available when SQLite is operating in exclusive
+lock mode. This is because it requires shared memory communication with other
+readers and writers when in the ``NORMAL`` locking mode.
+
+As with local disk databases, WAL mode may significantly reduce small
+transaction latency. Testing has shown it can provide more than 50% speedup
+over persisted rollback journals in exclusive locking mode. You can expect
+around 150-250 transactions per second depending on size.
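+
+Because WAL requires the exclusive locking mode, the pragmas must be issued in
+order: switch the locking mode first, then the journal mode. A short Python
+sketch (reusing the ``db`` connection from the earlier sketch; the
+``journal_mode`` pragma returns the mode actually in effect, which is worth
+checking):
+
+.. code:: python
+
+ # Exclusive locking must be in effect before WAL can be enabled.
+ db.execute("PRAGMA locking_mode = EXCLUSIVE")
+ (mode,) = db.execute("PRAGMA journal_mode = WAL").fetchone()
+ assert mode == "wal", f"journal_mode is {mode}, not wal"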
+
+
+Performance Notes
+^^^^^^^^^^^^^^^^^
+
+The filing backend for the database on RADOS is asynchronous as much as
+possible. Still, performance can be anywhere from 3x-10x slower than a local
+database on SSD. Latency can be a major factor. It is advisable to be familiar
+with SQL transactions and other strategies for efficient database updates.
+Depending on the performance of the underlying pool, you can expect small
+transactions to take up to 30 milliseconds to complete. If you use the
+``EXCLUSIVE`` locking mode, it can be reduced further to 15 milliseconds per
+transaction. A WAL journal in ``EXCLUSIVE`` locking mode can further reduce
+this as low as ~2-5 milliseconds (or the time to complete a RADOS write; you
+won't get better than that!).
+
+There is no limit to the size of a SQLite database on RADOS imposed by the Ceph
+VFS. There are standard `SQLite Limits`_ to be aware of, notably the maximum
+database size of 281 TB. Large databases may or may not be performant on Ceph.
+Experimentation for your own use-case is advised.
+
+Be aware that read-heavy queries could take significant amounts of time as
+reads are necessarily synchronous (due to the VFS API). No readahead is yet
+performed by the VFS.
+
+
+Recommended Use-Cases
+^^^^^^^^^^^^^^^^^^^^^
+
+The original purpose of this module was to support saving relational or large
+data in RADOS which needs to span multiple objects. Many current applications
+with trivial state try to use RADOS omap storage on a single object but this
+cannot scale without striping data across multiple objects. Unfortunately, it
+is non-trivial to design a store spanning multiple objects which is consistent
+and also simple to use. SQLite can be used to bridge that gap.
+
+
+Parallel Access
+^^^^^^^^^^^^^^^
+
+The VFS does not yet support concurrent readers. All database access is protected
+by a single exclusive lock.
+
+
+Export or Extract Database out of RADOS
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The database is striped on RADOS and can be extracted using the RADOS CLI toolset.
+
+.. code:: sh
+
+ rados --pool=foo --striper get bar.db local-bar.db
+ rados --pool=foo --striper get bar.db-journal local-bar.db-journal
+ sqlite3 local-bar.db ...
+
+Keep in mind the rollback journal is also striped and will need to be extracted
+as well if the database was in the middle of a transaction. If you're using
+WAL, that journal will need to be extracted as well.
+
+Keep in mind that extracting the database using the striper uses the same RADOS
+locks as those used by the *ceph* VFS. However, the journal file locks are not
+used by the *ceph* VFS (SQLite only locks the main database file) so there is a
+potential race with other SQLite clients when extracting both files. That could
+result in fetching a corrupt journal.
+
+Rather than manually extracting the files, it is more advisable to use the
+`SQLite Backup`_ mechanism.
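+
+For example, the backup API is exposed as ``Connection.backup`` in Python's
+``sqlite3`` module. A sketch (assuming the *ceph* VFS has already been loaded,
+as in the earlier Python sketch):
+
+.. code:: python
+
+ import sqlite3
+
+ # Source: the RADOS-backed database. Destination: a local file.
+ src = sqlite3.connect("file:///foo:bar/baz.db?vfs=ceph", uri=True)
+ dst = sqlite3.connect("local-baz.db")
+
+ # The backup API copies pages under the usual SQLite locks, avoiding
+ # the journal races described above.
+ with dst:
+     src.backup(dst)
+
+ dst.close()
+ src.close()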
+
+
+Temporary Tables
+^^^^^^^^^^^^^^^^
+
+Temporary tables backed by the ceph VFS are not supported. The main reason for
+this is that the VFS lacks context about where it should put the database, i.e.
+which RADOS pool. The persistent database associated with the temporary
+database is not communicated via the SQLite VFS API.
+
+Instead, it's suggested to attach a secondary local or `In-Memory Database`_
+and put the temporary tables there. Alternatively, you may set a connection
+pragma:
+
+.. code:: sql
+
+ PRAGMA temp_store=memory
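+
+For example, with a connection pragma and an attached in-memory database for
+explicit scratch tables (``scratch`` is an illustrative schema name):
+
+.. code:: python
+
+ # Keep temporary b-trees in memory rather than on the VFS.
+ db.execute("PRAGMA temp_store = MEMORY")
+
+ # Alternatively, attach an in-memory database and create scratch
+ # tables there explicitly.
+ db.execute("ATTACH DATABASE ':memory:' AS scratch")
+ db.execute("CREATE TABLE scratch.staging (k TEXT, v BLOB)")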
+
+
+.. _libcephsqlite-breaking-locks:
+
+Breaking Locks
+^^^^^^^^^^^^^^
+
+Access to the database file is protected by an exclusive lock on the first
+object stripe of the database. If the application fails without unlocking the
+database (e.g. a segmentation fault), the lock is not automatically released,
+even if the client connection is blocklisted afterward. Eventually, the lock
+will time out, subject to this configuration::
+
+ cephsqlite_lock_renewal_timeout = 30000
+
+The timeout is in milliseconds. Once the timeout is reached, the OSD will
+expire the lock and allow clients to relock. When this occurs, the database
+will be recovered by SQLite and the in-progress transaction rolled back. The
+new client recovering the database will also blocklist the old client to
+prevent potential database corruption from rogue writes.
+
+The holder of the exclusive lock on the database will periodically renew the
+lock so that it is not lost. This is necessary for large transactions or
+database connections operating in ``EXCLUSIVE`` locking mode. The lock renewal
+interval is adjustable via::
+
+ cephsqlite_lock_renewal_interval = 2000
+
+This configuration is also in units of milliseconds.
+
+It is possible to break the lock early if you know the client is gone for good
+(e.g. blocklisted). This allows restoring database access to clients
+immediately. For example:
+
+.. code:: sh
+
+ $ rados --pool=foo --namespace bar lock info baz.db.0000000000000000 striper.lock
+ {"name":"striper.lock","type":"exclusive","tag":"","lockers":[{"name":"client.4463","cookie":"555c7208-db39-48e8-a4d7-3ba92433a41a","description":"SimpleRADOSStriper","expiration":"0.000000","addr":"127.0.0.1:0/1831418345"}]}
+
+ $ rados --pool=foo --namespace bar lock break baz.db.0000000000000000 striper.lock client.4463 --lock-cookie 555c7208-db39-48e8-a4d7-3ba92433a41a
+
+.. _libcephsqlite-corrupt:
+
+How to Corrupt Your Database
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There is the usual reading on `How to Corrupt Your SQLite Database`_ that you
+should review before using this tool. To add to that, the most likely way you
+may corrupt your database is by a rogue process transiently losing network
+connectivity and then resuming its work. The exclusive RADOS lock it held will
+be lost but it cannot know that immediately. Any work it might do after
+regaining network connectivity could corrupt the database.
+
+The *ceph* VFS library defaults do not allow for this scenario to occur. The Ceph
+VFS will blocklist the last owner of the exclusive lock on the database if it
+detects incomplete cleanup.
+
+By blocklisting the old client, it's no longer possible for the old client to
+resume its work on the database when it returns (subject to blocklist
+expiration, 3600 seconds by default). To turn off blocklisting the prior
+client, change::
+
+ cephsqlite_blocklist_dead_locker = false
+
+Do NOT do this unless you know database corruption cannot result due to other
+guarantees. If this config is true (the default), the *ceph* VFS will cowardly
+fail if it cannot blocklist the prior instance (due to lack of authorization,
+for example).
+
+One example where out-of-band mechanisms exist to blocklist the last dead
+holder of the exclusive lock on the database is in the ``ceph-mgr``. The
+monitors are made aware of the RADOS connection used for the *ceph* VFS and will
+blocklist the instance during ``ceph-mgr`` failover. This prevents a zombie
+``ceph-mgr`` from continuing work and potentially corrupting the database. For
+this reason, it is not necessary for the *ceph* VFS to do the blocklist command
+in the new instance of the ``ceph-mgr`` (but it still does so, harmlessly).
+
+To blocklist the *ceph* VFS manually, you may see the instance address of the
+*ceph* VFS using the ``ceph_status`` SQL function:
+
+.. code:: sql
+
+ SELECT ceph_status();
+
+.. code::
+
+ {"id":788461300,"addr":"172.21.10.4:0/1472139388"}
+
+You may easily manipulate that information using the `JSON1 extension`_:
+
+.. code:: sql
+
+ SELECT json_extract(ceph_status(), '$.addr');
+
+.. code::
+
+ 172.21.10.4:0/3563721180
+
+This is the address you would pass to the ceph blocklist command:
+
+.. code:: sh
+
+ ceph osd blocklist add 172.21.10.4:0/3082314560
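+
+Tying the steps together, a Python sketch of recording the instance address
+and blocklisting it out-of-band (assumes authorization to run the blocklist
+command; the ``db`` connection is from the earlier sketches):
+
+.. code:: python
+
+ import json
+ import subprocess
+
+ # Record the instance address of the connection's ceph VFS so a dead
+ # instance can later be identified and blocklisted.
+ (status,) = db.execute("SELECT ceph_status()").fetchone()
+ addr = json.loads(status)["addr"]
+
+ # Blocklist that address, e.g. from a management host.
+ subprocess.run(["ceph", "osd", "blocklist", "add", addr], check=True)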
+
+
+Performance Statistics
+^^^^^^^^^^^^^^^^^^^^^^
+
+The *ceph* VFS provides a SQLite function, ``ceph_perf``, for querying the
+performance statistics of the VFS. The data is from "performance counters" as
+in other Ceph services normally queried via an admin socket.
+
+.. code:: sql
+
+ SELECT ceph_perf();
+
+.. code::
+
+ {"libcephsqlite_vfs":{"op_open":{"avgcount":2,"sum":0.150001291,"avgtime":0.075000645},"op_delete":{"avgcount":0,"sum":0.000000000,"avgtime":0.000000000},"op_access":{"avgcount":1,"sum":0.003000026,"avgtime":0.003000026},"op_fullpathname":{"avgcount":1,"sum":0.064000551,"avgtime":0.064000551},"op_currenttime":{"avgcount":0,"sum":0.000000000,"avgtime":0.000000000},"opf_close":{"avgcount":1,"sum":0.000000000,"avgtime":0.000000000},"opf_read":{"avgcount":3,"sum":0.036000310,"avgtime":0.012000103},"opf_write":{"avgcount":0,"sum":0.000000000,"avgtime":0.000000000},"opf_truncate":{"avgcount":0,"sum":0.000000000,"avgtime":0.000000000},"opf_sync":{"avgcount":0,"sum":0.000000000,"avgtime":0.000000000},"opf_filesize":{"avgcount":2,"sum":0.000000000,"avgtime":0.000000000},"opf_lock":{"avgcount":1,"sum":0.158001360,"avgtime":0.158001360},"opf_unlock":{"avgcount":1,"sum":0.101000871,"avgtime":0.101000871},"opf_checkreservedlock":{"avgcount":1,"sum":0.002000017,"avgtime":0.002000017},"opf_filecontrol":{"avgcount":4,"sum":0.000000000,"avgtime":0.000000000},"opf_sectorsize":{"avgcount":0,"sum":0.000000000,"avgtime":0.000000000},"opf_devicecharacteristics":{"avgcount":4,"sum":0.000000000,"avgtime":0.000000000}},"libcephsqlite_striper":{"update_metadata":0,"update_allocated":0,"update_size":0,"update_version":0,"shrink":0,"shrink_bytes":0,"lock":1,"unlock":1}}
+
+You may easily manipulate that information using the `JSON1 extension`_:
+
+.. code:: sql
+
+ SELECT json_extract(ceph_perf(), '$.libcephsqlite_vfs.opf_sync.avgcount');
+
+.. code::
+
+ 776
+
+That tells you the number of times SQLite has called the xSync method of the
+`SQLite IO Methods`_ of the VFS (for **all** open database connections in the
+process). You could analyze the performance stats before and after a number of
+queries to see the number of file system syncs required (this would just be
+proportional to the number of transactions). Alternatively, you may be more
+interested in the average latency to complete a write:
+
+.. code:: sql
+
+ SELECT json_extract(ceph_perf(), '$.libcephsqlite_vfs.opf_write');
+
+.. code::
+
+ {"avgcount":7873,"sum":0.675005797,"avgtime":0.000085736}
+
+This tells you there have been 7873 writes with an average time-to-complete of
+85 microseconds. That clearly shows the calls are executed asynchronously.
+Returning to sync:
+
+.. code:: sql
+
+ SELECT json_extract(ceph_perf(), '$.libcephsqlite_vfs.opf_sync');
+
+.. code::
+
+ {"avgcount":776,"sum":4.802041199,"avgtime":0.006188197}
+
+On average, 6 milliseconds were spent executing each sync call. This gathers
+all of the asynchronous writes as well as an asynchronous update to the size
+of the striped file.
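+
+A Python sketch of the before-and-after sampling described above (``db`` is a
+ceph-backed connection and the ``kv`` table is from the earlier sketch):
+
+.. code:: python
+
+ import json
+
+ def sync_count(db):
+     (perf,) = db.execute("SELECT ceph_perf()").fetchone()
+     return json.loads(perf)["libcephsqlite_vfs"]["opf_sync"]["avgcount"]
+
+ before = sync_count(db)
+ with db:  # one committed transaction
+     db.execute("INSERT OR REPLACE INTO kv VALUES ('x', 'y')")
+ after = sync_count(db)
+ print(f"syncs for one transaction: {after - before}")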
+
+
+Debugging
+^^^^^^^^^
+
+Debugging libcephsqlite can be turned on via::
+
+ debug_cephsqlite
+
+If running the ``sqlite3`` command-line tool, use:
+
+.. code:: sh
+
+ env CEPH_ARGS='--log_to_file true --log-file sqlite3.log --debug_cephsqlite 20 --debug_ms 1' sqlite3 ...
+
+This will save all the usual Ceph debugging to a file ``sqlite3.log`` for inspection.
+
+
+.. _SQLite: https://sqlite.org/index.html
+.. _SQLite VFS: https://www.sqlite.org/vfs.html
+.. _SQLite Backup: https://www.sqlite.org/backup.html
+.. _SQLite Limits: https://www.sqlite.org/limits.html
+.. _SQLite Extension Loading API: https://sqlite.org/c3ref/load_extension.html
+.. _In-Memory Database: https://www.sqlite.org/inmemorydb.html
+.. _WAL Journal Mode: https://sqlite.org/wal.html
+.. _How to Corrupt Your SQLite Database: https://www.sqlite.org/howtocorrupt.html
+.. _JSON1 Extension: https://www.sqlite.org/json1.html
+.. _SQLite IO Methods: https://www.sqlite.org/c3ref/io_methods.html
diff --git a/doc/rados/api/librados-intro.rst b/doc/rados/api/librados-intro.rst
new file mode 100644
index 000000000..5174188b4
--- /dev/null
+++ b/doc/rados/api/librados-intro.rst
@@ -0,0 +1,1051 @@
+==========================
+ Introduction to librados
+==========================
+
+The :term:`Ceph Storage Cluster` provides the basic storage service that allows
+:term:`Ceph` to uniquely deliver **object, block, and file storage** in one
+unified system. However, you are not limited to using the RESTful, block, or
+POSIX interfaces. Based upon :abbr:`RADOS (Reliable Autonomic Distributed Object
+Store)`, the ``librados`` API enables you to create your own interface to the
+Ceph Storage Cluster.
+
+The ``librados`` API enables you to interact with the two types of daemons in
+the Ceph Storage Cluster:
+
+- The :term:`Ceph Monitor`, which maintains a master copy of the cluster map.
+- The :term:`Ceph OSD Daemon` (OSD), which stores data as objects on a storage node.
+
+.. ditaa::
+ +---------------------------------+
+ | Ceph Storage Cluster Protocol |
+ | (librados) |
+ +---------------------------------+
+ +---------------+ +---------------+
+ | OSDs | | Monitors |
+ +---------------+ +---------------+
+
+This guide provides a high-level introduction to using ``librados``.
+Refer to :doc:`../../architecture` for additional details of the Ceph
+Storage Cluster. To use the API, you need a running Ceph Storage Cluster.
+See `Installation (Quick)`_ for details.
+
+
+Step 1: Getting librados
+========================
+
+Your client application must bind with ``librados`` to connect to the Ceph
+Storage Cluster. You must install ``librados`` and any required packages to
+write applications that use ``librados``. The ``librados`` API is written in
+C++, with additional bindings for C, Python, Java and PHP.
+
+
+Getting librados for C/C++
+--------------------------
+
+To install ``librados`` development support files for C/C++ on Debian/Ubuntu
+distributions, execute the following:
+
+.. prompt:: bash $
+
+ sudo apt-get install librados-dev
+
+To install ``librados`` development support files for C/C++ on RHEL/CentOS
+distributions, execute the following:
+
+.. prompt:: bash $
+
+ sudo yum install librados2-devel
+
+Once you install ``librados`` for developers, you can find the required
+headers for C/C++ under ``/usr/include/rados``:
+
+.. prompt:: bash $
+
+ ls /usr/include/rados
+
+
+Getting librados for Python
+---------------------------
+
+The ``rados`` module provides ``librados`` support to Python
+applications. You may install ``python3-rados`` for Debian, Ubuntu, SLE or
+openSUSE or the ``python-rados`` package for CentOS/RHEL.
+
+To install ``librados`` development support files for Python on Debian/Ubuntu
+distributions, execute the following:
+
+.. prompt:: bash $
+
+ sudo apt-get install python3-rados
+
+To install ``librados`` development support files for Python on RHEL/CentOS
+distributions, execute the following:
+
+.. prompt:: bash $
+
+ sudo yum install python-rados
+
+To install ``librados`` development support files for Python on SLE/openSUSE
+distributions, execute the following:
+
+.. prompt:: bash $
+
+ sudo zypper install python3-rados
+
+You can find the module under ``/usr/share/pyshared`` on Debian systems,
+or under ``/usr/lib/python*/site-packages`` on CentOS/RHEL systems.
+
+
+Getting librados for Java
+-------------------------
+
+To install ``librados`` for Java, you need to execute the following procedure:
+
+#. Install ``jna.jar``. For Debian/Ubuntu, execute:
+
+ .. prompt:: bash $
+
+ sudo apt-get install libjna-java
+
+ For CentOS/RHEL, execute:
+
+ .. prompt:: bash $
+
+ sudo yum install jna
+
+ The JAR files are located in ``/usr/share/java``.
+
+#. Clone the ``rados-java`` repository:
+
+ .. prompt:: bash $
+
+ git clone --recursive https://github.com/ceph/rados-java.git
+
+#. Build the ``rados-java`` repository:
+
+ .. prompt:: bash $
+
+ cd rados-java
+ ant
+
+ The JAR file is located under ``rados-java/target``.
+
+#. Copy the JAR for RADOS to a common location (e.g., ``/usr/share/java``) and
+ ensure that it and the JNA JAR are in your JVM's classpath. For example:
+
+ .. prompt:: bash $
+
+ sudo cp target/rados-0.1.3.jar /usr/share/java/rados-0.1.3.jar
+ sudo ln -s /usr/share/java/jna-3.2.7.jar /usr/lib/jvm/default-java/jre/lib/ext/jna-3.2.7.jar
+ sudo ln -s /usr/share/java/rados-0.1.3.jar /usr/lib/jvm/default-java/jre/lib/ext/rados-0.1.3.jar
+
+To build the documentation, execute the following:
+
+.. prompt:: bash $
+
+ ant docs
+
+
+Getting librados for PHP
+-------------------------
+
+To install the ``librados`` extension for PHP, you need to execute the following procedure:
+
+#. Install php-dev. For Debian/Ubuntu, execute:
+
+ .. prompt:: bash $
+
+ sudo apt-get install php5-dev build-essential
+
+ For CentOS/RHEL, execute:
+
+ .. prompt:: bash $
+
+ sudo yum install php-devel
+
+#. Clone the ``phprados`` repository:
+
+ .. prompt:: bash $
+
+ git clone https://github.com/ceph/phprados.git
+
+#. Build ``phprados``:
+
+ .. prompt:: bash $
+
+ cd phprados
+ phpize
+ ./configure
+ make
+ sudo make install
+
+#. Enable ``phprados`` by adding the following line to ``php.ini``::
+
+ extension=rados.so
+
+
+Step 2: Configuring a Cluster Handle
+====================================
+
+A :term:`Ceph Client`, via ``librados``, interacts directly with OSDs to store
+and retrieve data. To interact with OSDs, the client app must invoke
+``librados`` and connect to a Ceph Monitor. Once connected, ``librados``
+retrieves the :term:`Cluster Map` from the Ceph Monitor. When the client app
+wants to read or write data, it creates an I/O context and binds to a
+:term:`Pool`. The pool has an associated :term:`CRUSH rule` that defines how it
+will place data in the storage cluster. Via the I/O context, the client
+provides the object name to ``librados``, which takes the object name
+and the cluster map (i.e., the topology of the cluster) and `computes`_ the
+placement group and `OSD`_ for locating the data. Then the client application
+can read or write data. The client app doesn't need to learn about the topology
+of the cluster directly.
+
+.. ditaa::
+ +--------+ Retrieves +---------------+
+ | Client |------------>| Cluster Map |
+ +--------+ +---------------+
+ |
+ v Writes
+ /-----\
+ | obj |
+ \-----/
+ | To
+ v
+ +--------+ +---------------+
+ | Pool |---------->| CRUSH Rule |
+ +--------+ Selects +---------------+
+
+
+The Ceph Storage Cluster handle encapsulates the client configuration, including:
+
+- The `user ID`_ for ``rados_create()`` or user name for ``rados_create2()``
+ (preferred).
+- The :term:`cephx` authentication key
+- The monitor ID and IP address
+- Logging levels
+- Debugging levels
+
+Thus, the first steps in using the cluster from your app are to 1) create
+a cluster handle that your app will use to connect to the storage cluster,
+and then 2) use that handle to connect. To connect to the cluster, the
+app must supply a monitor address, a username and an authentication key
+(cephx is enabled by default).
+
+.. tip:: Talking to different Ceph Storage Clusters – or to the same cluster
+ with different users – requires different cluster handles.
+
+RADOS provides a number of ways for you to set the required values. For
+the monitor and encryption key settings, an easy way to handle them is to ensure
+that your Ceph configuration file contains a ``keyring`` path to a keyring file
+and at least one monitor address (e.g., ``mon_host``). For example::
+
+ [global]
+ mon_host = 192.168.1.1
+ keyring = /etc/ceph/ceph.client.admin.keyring
+
+Once you create the handle, you can read a Ceph configuration file to configure
+the handle. You can also pass arguments to your app and parse them with the
+function for parsing command line arguments (e.g., ``rados_conf_parse_argv()``),
+or parse Ceph environment variables (e.g., ``rados_conf_parse_env()``). Some
+wrappers may not implement convenience methods, so you may need to implement
+these capabilities. The following diagram provides a high-level flow for the
+initial connection.
+
+
+.. ditaa::
+ +---------+ +---------+
+ | Client | | Monitor |
+ +---------+ +---------+
+ | |
+ |-----+ create |
+ | | cluster |
+ |<----+ handle |
+ | |
+ |-----+ read |
+ | | config |
+ |<----+ file |
+ | |
+ | connect |
+ |-------------->|
+ | |
+ |<--------------|
+ | connected |
+ | |
+
+
+Once connected, your app can invoke functions that affect the whole cluster
+with only the cluster handle. For example, once you have a cluster handle,
+you can do the following (sketched in Python after this list):
+
+- Get cluster statistics
+- Use Pool Operation (exists, create, list, delete)
+- Get and set the configuration
+
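+A minimal sketch of these cluster-level calls using the Python binding
+(assuming a reachable cluster and a valid ``ceph.conf``; the pool name
+``hot`` is illustrative):
+
+.. code-block:: python
+
+ import rados
+
+ cluster = rados.Rados(conffile='')
+ cluster.connect()
+
+ # Get cluster statistics.
+ print(cluster.get_cluster_stats())
+
+ # Pool operations: exists, create, list, delete.
+ if not cluster.pool_exists('hot'):
+     cluster.create_pool('hot')
+ print(cluster.list_pools())
+ cluster.delete_pool('hot')
+
+ # Get a configuration value from the handle.
+ print(cluster.conf_get('mon_host'))
+
+ cluster.shutdown()
+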
+
+One of the powerful features of Ceph is the ability to bind to different pools.
+Each pool may have a different number of placement groups, object replicas and
+replication strategies. For example, a pool could be set up as a "hot" pool that
+uses SSDs for frequently used objects or a "cold" pool that uses erasure coding.
+
+The main difference in the various ``librados`` bindings is between C and
+the object-oriented bindings for C++, Java and Python. The object-oriented
+bindings use objects to represent cluster handles, IO Contexts, iterators,
+exceptions, etc.
+
+
+C Example
+---------
+
+For C, creating a simple cluster handle using the ``admin`` user, configuring
+it and connecting to the cluster might look something like this:
+
+.. code-block:: c
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <rados/librados.h>
+
+ int main (int argc, const char **argv)
+ {
+
+ /* Declare the cluster handle and required arguments. */
+ rados_t cluster;
+ char cluster_name[] = "ceph";
+ char user_name[] = "client.admin";
+ uint64_t flags = 0;
+
+ /* Initialize the cluster handle with the "ceph" cluster name and the "client.admin" user */
+ int err;
+ err = rados_create2(&cluster, cluster_name, user_name, flags);
+
+ if (err < 0) {
+ fprintf(stderr, "%s: Couldn't create the cluster handle! %s\n", argv[0], strerror(-err));
+ exit(EXIT_FAILURE);
+ } else {
+ printf("\nCreated a cluster handle.\n");
+ }
+
+
+ /* Read a Ceph configuration file to configure the cluster handle. */
+ err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf");
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot read config file: %s\n", argv[0], strerror(-err));
+ exit(EXIT_FAILURE);
+ } else {
+ printf("\nRead the config file.\n");
+ }
+
+ /* Read command line arguments */
+ err = rados_conf_parse_argv(cluster, argc, argv);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot parse command line arguments: %s\n", argv[0], strerror(-err));
+ exit(EXIT_FAILURE);
+ } else {
+ printf("\nRead the command line arguments.\n");
+ }
+
+ /* Connect to the cluster */
+ err = rados_connect(cluster);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot connect to cluster: %s\n", argv[0], strerror(-err));
+ exit(EXIT_FAILURE);
+ } else {
+ printf("\nConnected to the cluster.\n");
+ }
+
+ }
+
+Compile your client and link to ``librados`` using ``-lrados``. For example:
+
+.. prompt:: bash $
+
+ gcc ceph-client.c -lrados -o ceph-client
+
+
+C++ Example
+-----------
+
+The Ceph project provides a C++ example in the ``ceph/examples/librados``
+directory. For C++, a simple cluster handle using the ``admin`` user requires
+you to initialize a ``librados::Rados`` cluster handle object:
+
+.. code-block:: c++
+
+ #include <iostream>
+ #include <string>
+ #include <rados/librados.hpp>
+
+ int main(int argc, const char **argv)
+ {
+
+ int ret = 0;
+
+ /* Declare the cluster handle and required variables. */
+ librados::Rados cluster;
+ char cluster_name[] = "ceph";
+ char user_name[] = "client.admin";
+ uint64_t flags = 0;
+
+ /* Initialize the cluster handle with the "ceph" cluster name and "client.admin" user */
+ {
+ ret = cluster.init2(user_name, cluster_name, flags);
+ if (ret < 0) {
+ std::cerr << "Couldn't initialize the cluster handle! error " << ret << std::endl;
+ return EXIT_FAILURE;
+ } else {
+ std::cout << "Created a cluster handle." << std::endl;
+ }
+ }
+
+ /* Read a Ceph configuration file to configure the cluster handle. */
+ {
+ ret = cluster.conf_read_file("/etc/ceph/ceph.conf");
+ if (ret < 0) {
+ std::cerr << "Couldn't read the Ceph configuration file! error " << ret << std::endl;
+ return EXIT_FAILURE;
+ } else {
+ std::cout << "Read the Ceph configuration file." << std::endl;
+ }
+ }
+
+ /* Read command line arguments */
+ {
+ ret = cluster.conf_parse_argv(argc, argv);
+ if (ret < 0) {
+ std::cerr << "Couldn't parse command line options! error " << ret << std::endl;
+ return EXIT_FAILURE;
+ } else {
+ std::cout << "Parsed command line options." << std::endl;
+ }
+ }
+
+ /* Connect to the cluster */
+ {
+ ret = cluster.connect();
+ if (ret < 0) {
+ std::cerr << "Couldn't connect to cluster! error " << ret << std::endl;
+ return EXIT_FAILURE;
+ } else {
+ std::cout << "Connected to the cluster." << std::endl;
+ }
+ }
+
+ return 0;
+ }
+
+
+Compile the source; then, link ``librados`` using ``-lrados``.
+For example:
+
+.. prompt:: bash $
+
+ g++ -g -c ceph-client.cc -o ceph-client.o
+ g++ -g ceph-client.o -lrados -o ceph-client
+
+
+
+Python Example
+--------------
+
+Python uses the ``admin`` id and the ``ceph`` cluster name by default, and
+will read the standard ``ceph.conf`` file if the ``conffile`` parameter is
+set to the empty string. The Python binding converts C++ errors
+into exceptions.
+
+
+.. code-block:: python
+
+ import rados
+
+ try:
+ cluster = rados.Rados(conffile='')
+ except TypeError as e:
+ print('Argument validation error: {}'.format(e))
+ raise e
+
+ print("Created cluster handle.")
+
+ try:
+ cluster.connect()
+ except Exception as e:
+ print("connection error: {}".format(e))
+ raise e
+ finally:
+ print("Connected to the cluster.")
+
+
+Execute the example to verify that it connects to your cluster:
+
+.. prompt:: bash $
+
+ python ceph-client.py
+
+
+Java Example
+------------
+
+Java requires you to specify the user ID (``admin``) or user name
+(``client.admin``), and uses the ``ceph`` cluster name by default. The Java
+binding converts C++-based errors into exceptions.
+
+.. code-block:: java
+
+ import com.ceph.rados.Rados;
+ import com.ceph.rados.RadosException;
+
+ import java.io.File;
+
+ public class CephClient {
+ public static void main (String args[]){
+
+ try {
+ Rados cluster = new Rados("admin");
+ System.out.println("Created cluster handle.");
+
+ File f = new File("/etc/ceph/ceph.conf");
+ cluster.confReadFile(f);
+ System.out.println("Read the configuration file.");
+
+ cluster.connect();
+ System.out.println("Connected to the cluster.");
+
+ } catch (RadosException e) {
+ System.out.println(e.getMessage() + ": " + e.getReturnValue());
+ }
+ }
+ }
+
+
+Compile the source; then, run it. If you have copied the JAR to
+``/usr/share/java`` and symlinked from your ``ext`` directory, you won't need
+to specify the classpath. For example:
+
+.. prompt:: bash $
+
+ javac CephClient.java
+ java CephClient
+
+
+PHP Example
+------------
+
+With the RADOS extension enabled in PHP, you can create a new cluster handle very easily:
+
+.. code-block:: php
+
+ <?php
+
+ $r = rados_create();
+ rados_conf_read_file($r, '/etc/ceph/ceph.conf');
+ if (!rados_connect($r)) {
+ echo "Failed to connect to Ceph cluster";
+ } else {
+ echo "Successfully connected to Ceph cluster";
+ }
+
+
+Save this as ``rados.php`` and run the code:
+
+.. prompt:: bash $
+
+ php rados.php
+
+
+Step 3: Creating an I/O Context
+===============================
+
+Once your app has a cluster handle and a connection to a Ceph Storage Cluster,
+you may create an I/O Context and begin reading and writing data. An I/O Context
+binds the connection to a specific pool. The user must have appropriate
+`CAPS`_ permissions to access the specified pool. For example, a user with read
+access but not write access will only be able to read data. I/O Context
+functionality includes:
+
+- Write/read data and extended attributes
+- List and iterate over objects and extended attributes
+- Snapshot pools, list snapshots, etc.
+
+
+.. ditaa::
+ +---------+ +---------+ +---------+
+ | Client | | Monitor | | OSD |
+ +---------+ +---------+ +---------+
+ | | |
+ |-----+ create | |
+ | | I/O | |
+ |<----+ context | |
+ | | |
+ | write data | |
+ |---------------+-------------->|
+ | | |
+ | write ack | |
+ |<--------------+---------------|
+ | | |
+ | write xattr | |
+ |---------------+-------------->|
+ | | |
+ | xattr ack | |
+ |<--------------+---------------|
+ | | |
+ | read data | |
+ |---------------+-------------->|
+ | | |
+ | read ack | |
+ |<--------------+---------------|
+ | | |
+ | remove data | |
+ |---------------+-------------->|
+ | | |
+ | remove ack | |
+ |<--------------+---------------|
+
+
+
+RADOS enables you to interact both synchronously and asynchronously. Once your
+app has an I/O Context, read/write operations only require you to know the
+object/xattr name. The CRUSH algorithm encapsulated in ``librados`` uses the
+cluster map to identify the appropriate OSD. OSD daemons handle the replication,
+as described in `Smart Daemons Enable Hyperscale`_. The ``librados`` library also
+maps objects to placement groups, as described in `Calculating PG IDs`_.
+
+The following examples use the default ``data`` pool. However, you may also
+use the API to list pools, ensure they exist, or create and delete pools. For
+the write operations, the examples illustrate how to use synchronous mode. For
+the read operations, the examples illustrate how to use asynchronous mode.
+
+.. important:: Use caution when deleting pools with this API. If you delete
+ a pool, the pool and ALL DATA in the pool will be lost.
+
+
+C Example
+---------
+
+
+.. code-block:: c
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <rados/librados.h>
+
+ int main (int argc, const char **argv)
+ {
+ /*
+ * Continued from previous C example, where cluster handle and
+ * connection are established. First declare an I/O Context.
+ */
+
+ rados_ioctx_t io;
+ char *poolname = "data";
+
+ err = rados_ioctx_create(cluster, poolname, &io);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot open rados pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_shutdown(cluster);
+ exit(EXIT_FAILURE);
+ } else {
+ printf("\nCreated I/O context.\n");
+ }
+
+ /* Write data to the cluster synchronously. */
+ err = rados_write(io, "hw", "Hello World!", 12, 0);
+ if (err < 0) {
+ fprintf(stderr, "%s: Cannot write object \"hw\" to pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nWrote \"Hello World\" to object \"hw\".\n");
+ }
+
+ char xattr[] = "en_US";
+ err = rados_setxattr(io, "hw", "lang", xattr, 5);
+ if (err < 0) {
+ fprintf(stderr, "%s: Cannot write xattr to pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nWrote \"en_US\" to xattr \"lang\" for object \"hw\".\n");
+ }
+
+ /*
+ * Read data from the cluster asynchronously.
+ * First, set up asynchronous I/O completion.
+ */
+ rados_completion_t comp;
+ err = rados_aio_create_completion(NULL, NULL, NULL, &comp);
+ if (err < 0) {
+ fprintf(stderr, "%s: Could not create aio completion: %s\n", argv[0], strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nCreated AIO completion.\n");
+ }
+
+ /* Next, read data using rados_aio_read. */
+ char read_res[100];
+ err = rados_aio_read(io, "hw", comp, read_res, 12, 0);
+ if (err < 0) {
+ fprintf(stderr, "%s: Cannot read object. %s %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nRead object \"hw\". The contents are:\n %s \n", read_res);
+ }
+
+ /* Wait for the operation to complete */
+ rados_aio_wait_for_complete(comp);
+
+ /* Release the asynchronous I/O complete handle to avoid memory leaks. */
+ rados_aio_release(comp);
+
+
+ char xattr_res[100];
+ err = rados_getxattr(io, "hw", "lang", xattr_res, 5);
+ if (err < 0) {
+ fprintf(stderr, "%s: Cannot read xattr. %s %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nRead xattr \"lang\" for object \"hw\". The contents are:\n %s \n", xattr_res);
+ }
+
+ err = rados_rmxattr(io, "hw", "lang");
+ if (err < 0) {
+ fprintf(stderr, "%s: Cannot remove xattr. %s %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nRemoved xattr \"lang\" for object \"hw\".\n");
+ }
+
+ err = rados_remove(io, "hw");
+ if (err < 0) {
+ fprintf(stderr, "%s: Cannot remove object. %s %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ } else {
+ printf("\nRemoved object \"hw\".\n");
+ }
+
+ }
+
+
+
+C++ Example
+-----------
+
+
+.. code-block:: c++
+
+ #include <iostream>
+ #include <string>
+ #include <rados/librados.hpp>
+
+ int main(int argc, const char **argv)
+ {
+
+ /* Continued from previous C++ example, where cluster handle and
+ * connection are established. First declare an I/O Context.
+ */
+
+ librados::IoCtx io_ctx;
+ const char *pool_name = "data";
+
+ {
+ ret = cluster.ioctx_create(pool_name, io_ctx);
+ if (ret < 0) {
+ std::cerr << "Couldn't set up ioctx! error " << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Created an ioctx for the pool." << std::endl;
+ }
+ }
+
+
+ /* Write an object synchronously. */
+ {
+ librados::bufferlist bl;
+ bl.append("Hello World!");
+ ret = io_ctx.write_full("hw", bl);
+ if (ret < 0) {
+ std::cerr << "Couldn't write object! error " << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Wrote new object 'hw' " << std::endl;
+ }
+ }
+
+
+ /*
+ * Add an xattr to the object.
+ */
+ {
+ librados::bufferlist lang_bl;
+ lang_bl.append("en_US");
+ ret = io_ctx.setxattr("hw", "lang", lang_bl);
+ if (ret < 0) {
+ std::cerr << "failed to set xattr version entry! error "
+ << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Set the xattr 'lang' on our object!" << std::endl;
+ }
+ }
+
+
+ /*
+ * Read the object back asynchronously.
+ */
+ {
+ librados::bufferlist read_buf;
+ int read_len = 4194304;
+
+ //Create I/O Completion.
+ librados::AioCompletion *read_completion = librados::Rados::aio_create_completion();
+
+ //Send read request.
+ ret = io_ctx.aio_read("hw", read_completion, &read_buf, read_len, 0);
+ if (ret < 0) {
+ std::cerr << "Couldn't start read object! error " << ret << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // Wait for the request to complete, and check that it succeeded.
+ read_completion->wait_for_complete();
+ ret = read_completion->get_return_value();
+ if (ret < 0) {
+ std::cerr << "Couldn't read object! error " << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Read object hw asynchronously with contents.\n"
+ << read_buf.c_str() << std::endl;
+ }
+ }
+
+
+ /*
+ * Read the xattr.
+ */
+ {
+ librados::bufferlist lang_res;
+ ret = io_ctx.getxattr("hw", "lang", lang_res);
+ if (ret < 0) {
+ std::cerr << "failed to get xattr version entry! error "
+ << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Got the xattr 'lang' from object hw!"
+ << lang_res.c_str() << std::endl;
+ }
+ }
+
+
+ /*
+ * Remove the xattr.
+ */
+ {
+ ret = io_ctx.rmxattr("hw", "lang");
+ if (ret < 0) {
+ std::cerr << "Failed to remove xattr! error "
+ << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Removed the xattr 'lang' from our object!" << std::endl;
+ }
+ }
+
+ /*
+ * Remove the object.
+ */
+ {
+ ret = io_ctx.remove("hw");
+ if (ret < 0) {
+ std::cerr << "Couldn't remove object! error " << ret << std::endl;
+ exit(EXIT_FAILURE);
+ } else {
+ std::cout << "Removed object 'hw'." << std::endl;
+ }
+ }
+ }
+
+
+
+Python Example
+--------------
+
+.. code-block:: python
+
+ print("\n\nI/O Context and Object Operations")
+ print("=================================")
+
+ print("\nCreating a context for the 'data' pool")
+ if not cluster.pool_exists('data'):
+ raise RuntimeError('No data pool exists')
+ ioctx = cluster.open_ioctx('data')
+
+ print("\nWriting object 'hw' with contents 'Hello World!' to pool 'data'.")
+ ioctx.write("hw", b"Hello World!")
+ print("Writing XATTR 'lang' with value 'en_US' to object 'hw'")
+ ioctx.set_xattr("hw", "lang", b"en_US")
+
+
+ print("\nWriting object 'bm' with contents 'Bonjour tout le monde!' to pool
+ 'data'.")
+ ioctx.write("bm", b"Bonjour tout le monde!")
+ print("Writing XATTR 'lang' with value 'fr_FR' to object 'bm'")
+ ioctx.set_xattr("bm", "lang", b"fr_FR")
+
+ print("\nContents of object 'hw'\n------------------------")
+ print(ioctx.read("hw"))
+
+ print("\n\nGetting XATTR 'lang' from object 'hw'")
+ print(ioctx.get_xattr("hw", "lang"))
+
+ print("\nContents of object 'bm'\n------------------------")
+ print(ioctx.read("bm"))
+
+ print("\n\nGetting XATTR 'lang' from object 'bm'")
+ print(ioctx.get_xattr("bm", "lang"))
+
+
+ print("\nRemoving object 'hw'")
+ ioctx.remove_object("hw")
+
+ print("Removing object 'bm'")
+ ioctx.remove_object("bm")
+
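+The I/O context can also manage pool snapshots, as listed above. A short
+sketch of the snapshot calls in the Python binding (the snapshot name
+``snap1`` is illustrative):
+
+.. code-block:: python
+
+ print("\nCreating pool snapshot 'snap1'")
+ ioctx.create_snap('snap1')
+
+ print("Listing pool snapshots")
+ for snap in ioctx.list_snaps():
+     print(snap.name)
+
+ print("Removing pool snapshot 'snap1'")
+ ioctx.remove_snap('snap1')
+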
+
+Java Example
+------------
+
+.. code-block:: java
+
+ import com.ceph.rados.Rados;
+ import com.ceph.rados.RadosException;
+
+ import java.io.File;
+ import com.ceph.rados.IoCTX;
+
+ public class CephClient {
+ public static void main (String args[]){
+
+ try {
+ Rados cluster = new Rados("admin");
+ System.out.println("Created cluster handle.");
+
+ File f = new File("/etc/ceph/ceph.conf");
+ cluster.confReadFile(f);
+ System.out.println("Read the configuration file.");
+
+ cluster.connect();
+ System.out.println("Connected to the cluster.");
+
+ IoCTX io = cluster.ioCtxCreate("data");
+
+ String oidone = "hw";
+ String contentone = "Hello World!";
+ io.write(oidone, contentone);
+
+ String oidtwo = "bm";
+ String contenttwo = "Bonjour tout le monde!";
+ io.write(oidtwo, contenttwo);
+
+ String[] objects = io.listObjects();
+ for (String object: objects)
+ System.out.println(object);
+
+ io.remove(oidone);
+ io.remove(oidtwo);
+
+ cluster.ioCtxDestroy(io);
+
+ } catch (RadosException e) {
+ System.out.println(e.getMessage() + ": " + e.getReturnValue());
+ }
+ }
+ }
+
+
+PHP Example
+-----------
+
+.. code-block:: php
+
+ <?php
+
+ $io = rados_ioctx_create($r, "mypool");
+ rados_write_full($io, "oidOne", "mycontents");
+ rados_remove($io, "oidOne");
+ rados_ioctx_destroy($io);
+
+
+Step 4: Closing Sessions
+========================
+
+Once your app finishes with the I/O Context and cluster handle, the app should
+close the connection and shutdown the handle. For asynchronous I/O, the app
+should also ensure that pending asynchronous operations have completed.
+
+
+C Example
+---------
+
+.. code-block:: c
+
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+
+
+C++ Example
+-----------
+
+.. code-block:: c++
+
+ io_ctx.close();
+ cluster.shutdown();
+
+
+Java Example
+--------------
+
+.. code-block:: java
+
+ cluster.ioCtxDestroy(io);
+ cluster.shutDown();
+
+
+Python Example
+--------------
+
+.. code-block:: python
+
+ print("\nClosing the connection.")
+ ioctx.close()
+
+ print("Shutting down the handle.")
+ cluster.shutdown()
+
+PHP Example
+-----------
+
+.. code-block:: php
+
+ rados_shutdown($r);
+
+
+
+.. _user ID: ../../operations/user-management#command-line-usage
+.. _CAPS: ../../operations/user-management#authorization-capabilities
+.. _Installation (Quick): ../../../start
+.. _Smart Daemons Enable Hyperscale: ../../../architecture#smart-daemons-enable-hyperscale
+.. _Calculating PG IDs: ../../../architecture#calculating-pg-ids
+.. _computes: ../../../architecture#calculating-pg-ids
+.. _OSD: ../../../architecture#mapping-pgs-to-osds
diff --git a/doc/rados/api/librados.rst b/doc/rados/api/librados.rst
new file mode 100644
index 000000000..3e202bd4b
--- /dev/null
+++ b/doc/rados/api/librados.rst
@@ -0,0 +1,187 @@
+==============
+ Librados (C)
+==============
+
+.. highlight:: c
+
+`librados` provides low-level access to the RADOS service. For an
+overview of RADOS, see :doc:`../../architecture`.
+
+
+Example: connecting and writing an object
+=========================================
+
+To use `Librados`, you instantiate a :c:type:`rados_t` variable (a cluster handle) and
+call :c:func:`rados_create()` with a pointer to it::
+
+ int err;
+ rados_t cluster;
+
+ err = rados_create(&cluster, NULL);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot create a cluster handle: %s\n", argv[0], strerror(-err));
+ exit(1);
+ }
+
+Then you configure your :c:type:`rados_t` to connect to your cluster,
+either by setting individual values (:c:func:`rados_conf_set()`),
+using a configuration file (:c:func:`rados_conf_read_file()`), using
+command line options (:c:func:`rados_conf_parse_argv`), or an
+environment variable (:c:func:`rados_conf_parse_env()`)::
+
+ err = rados_conf_read_file(cluster, "/path/to/myceph.conf");
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot read config file: %s\n", argv[0], strerror(-err));
+ exit(1);
+ }
+
+Once the cluster handle is configured, you can connect to the cluster with :c:func:`rados_connect()`::
+
+ err = rados_connect(cluster);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot connect to cluster: %s\n", argv[0], strerror(-err));
+ exit(1);
+ }
+
+Then you open an "IO context", a :c:type:`rados_ioctx_t`, with :c:func:`rados_ioctx_create()`::
+
+ rados_ioctx_t io;
+ char *poolname = "mypool";
+
+ err = rados_ioctx_create(cluster, poolname, &io);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot open rados pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_shutdown(cluster);
+ exit(1);
+ }
+
+Note that the pool you try to access must exist.
+
+Then you can use the RADOS data manipulation functions, for example
+write into an object called ``greeting`` with
+:c:func:`rados_write_full()`::
+
+ err = rados_write_full(io, "greeting", "hello", 5);
+ if (err < 0) {
+ fprintf(stderr, "%s: cannot write pool %s: %s\n", argv[0], poolname, strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ }
+
+In the end, you will want to close your IO context and connection to RADOS with :c:func:`rados_ioctx_destroy()` and :c:func:`rados_shutdown()`::
+
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+
+
+Asynchronous IO
+===============
+
+When doing lots of IO, you often don't need to wait for one operation
+to complete before starting the next one. `Librados` provides
+asynchronous versions of several operations:
+
+* :c:func:`rados_aio_write`
+* :c:func:`rados_aio_append`
+* :c:func:`rados_aio_write_full`
+* :c:func:`rados_aio_read`
+
+For each operation, you must first create a
+:c:type:`rados_completion_t` that represents what to do when the
+operation is safe or complete by calling
+:c:func:`rados_aio_create_completion`. If you don't need anything
+special to happen, you can pass NULL::
+
+ rados_completion_t comp;
+ err = rados_aio_create_completion(NULL, NULL, NULL, &comp);
+ if (err < 0) {
+ fprintf(stderr, "%s: could not create aio completion: %s\n", argv[0], strerror(-err));
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ }
+
+Now you can issue one of the aio operations, and wait for it to
+be in memory or on disk on all replicas::
+
+ err = rados_aio_write(io, "foo", comp, "bar", 3, 0);
+ if (err < 0) {
+ fprintf(stderr, "%s: could not schedule aio write: %s\n", argv[0], strerror(-err));
+ rados_aio_release(comp);
+ rados_ioctx_destroy(io);
+ rados_shutdown(cluster);
+ exit(1);
+ }
+ rados_aio_wait_for_complete(comp); // in memory
+ rados_aio_wait_for_safe(comp); // on disk
+
+Finally, we need to free the memory used by the completion with :c:func:`rados_aio_release`::
+
+ rados_aio_release(comp);
+
+You can use the callbacks to tell your application when writes are
+durable, or when read buffers are full. For example, if you wanted to
+measure the latency of each operation when appending to several
+objects, you could schedule several writes and store the ack and
+commit time in the corresponding callback, then wait for all of them
+to complete using :c:func:`rados_aio_flush` before analyzing the
+latencies::
+
+ typedef struct {
+ struct timeval start;
+ struct timeval ack_end;
+ struct timeval commit_end;
+ } req_duration;
+
+ void ack_callback(rados_completion_t comp, void *arg) {
+ req_duration *dur = (req_duration *) arg;
+ gettimeofday(&dur->ack_end, NULL);
+ }
+
+ void commit_callback(rados_completion_t comp, void *arg) {
+ req_duration *dur = (req_duration *) arg;
+ gettimeofday(&dur->commit_end, NULL);
+ }
+
+ int output_append_latency(rados_ioctx_t io, const char *data, size_t len, size_t num_writes) {
+ req_duration times[num_writes];
+ rados_completion_t comps[num_writes];
+ for (size_t i = 0; i < num_writes; ++i) {
+ gettimeofday(&times[i].start, NULL);
+ int err = rados_aio_create_completion((void*) &times[i], ack_callback, commit_callback, &comps[i]);
+ if (err < 0) {
+ fprintf(stderr, "Error creating rados completion: %s\n", strerror(-err));
+ return err;
+ }
+ char obj_name[100];
+    snprintf(obj_name, sizeof(obj_name), "foo%lu", (unsigned long)i);
+ err = rados_aio_append(io, obj_name, comps[i], data, len);
+ if (err < 0) {
+ fprintf(stderr, "Error from rados_aio_append: %s", strerror(-err));
+ return err;
+ }
+ }
+ // wait until all requests finish *and* the callbacks complete
+ rados_aio_flush(io);
+ // the latencies can now be analyzed
+ printf("Request # | Ack latency (s) | Commit latency (s)\n");
+ for (size_t i = 0; i < num_writes; ++i) {
+ // don't forget to free the completions
+ rados_aio_release(comps[i]);
+ struct timeval ack_lat, commit_lat;
+ timersub(&times[i].ack_end, &times[i].start, &ack_lat);
+ timersub(&times[i].commit_end, &times[i].start, &commit_lat);
+ printf("%9ld | %8ld.%06ld | %10ld.%06ld\n", (unsigned long) i, ack_lat.tv_sec, ack_lat.tv_usec, commit_lat.tv_sec, commit_lat.tv_usec);
+ }
+ return 0;
+ }
+
+Note that all the :c:type:`rados_completion_t` must be freed with :c:func:`rados_aio_release` to avoid leaking memory.
+
+
+API calls
+=========
+
+ .. autodoxygenfile:: rados_types.h
+ .. autodoxygenfile:: librados.h
diff --git a/doc/rados/api/libradospp.rst b/doc/rados/api/libradospp.rst
new file mode 100644
index 000000000..08483c8d4
--- /dev/null
+++ b/doc/rados/api/libradospp.rst
@@ -0,0 +1,9 @@
+==================
+ LibradosPP (C++)
+==================
+
+.. note:: The librados C++ API is not guaranteed to be API+ABI stable
+ between major releases. All applications using the librados C++ API must
+ be recompiled and relinked against a specific Ceph release.
+
+.. todo:: write me!
diff --git a/doc/rados/api/objclass-sdk.rst b/doc/rados/api/objclass-sdk.rst
new file mode 100644
index 000000000..90b8eb018
--- /dev/null
+++ b/doc/rados/api/objclass-sdk.rst
@@ -0,0 +1,39 @@
+.. _`rados-objclass-api-sdk`:
+
+===========================
+SDK for Ceph Object Classes
+===========================
+
+`Ceph` can be extended by creating shared object classes called `Ceph Object
+Classes`. The existing framework for building these object classes depends on
+internal `Ceph` functionality, which restricts users to building object
+classes within the `Ceph` tree. The aim of this project is to provide an
+independent object class interface that can be used to build object classes
+outside the `Ceph` tree. This allows for two types of object classes: 1) those
+that have in-tree dependencies and reside in the tree, and 2) those that use
+the `Ceph Object Class SDK framework` and can be built outside of the `Ceph`
+tree because they do not depend on any internal implementation of `Ceph`. This
+project decouples object class development from `Ceph` and encourages the
+creation and distribution of object classes as packages.
+
+In order to demonstrate the use of this framework, we have provided an example
+called ``cls_sdk``, which is a very simple object class that makes use of the
+SDK framework. This object class resides in the ``src/cls`` directory.
+
+Installing objclass.h
+---------------------
+
+The object class interface that enables out-of-tree development of object
+classes resides in ``src/include/rados/`` and is installed as part of a `Ceph`
+installation. After running ``make install``, you should be able to see it
+in ``<prefix>/include/rados``. ::
+
+ ls /usr/local/include/rados
+
+Using the SDK example
+---------------------
+
+The ``cls_sdk`` object class resides in ``src/cls/sdk/``. It is built and
+loaded into Ceph as part of the Ceph build process. To test this class, you
+can run the ``ceph_test_cls_sdk`` unit test, which resides in
+``src/test/cls_sdk/``.
diff --git a/doc/rados/api/python.rst b/doc/rados/api/python.rst
new file mode 100644
index 000000000..346653a3d
--- /dev/null
+++ b/doc/rados/api/python.rst
@@ -0,0 +1,428 @@
+===================
+ Librados (Python)
+===================
+
+The ``rados`` module is a thin Python wrapper for ``librados``.
+
+Installation
+============
+
+To install Python libraries for Ceph, see `Getting librados for Python`_.
+
+
+Getting Started
+===============
+
+You can create your own Ceph client using Python. The following tutorial will
+show you how to import the Ceph Python module, connect to a Ceph cluster, and
+perform object operations as a ``client.admin`` user.
+
+.. note:: To use the Ceph Python bindings, you must have access to a
+ running Ceph cluster. To set one up quickly, see `Getting Started`_.
+
+First, create a Python source file for your Ceph client.
+
+.. prompt:: bash
+
+ vim client.py
+
+
+Import the Module
+-----------------
+
+To use the ``rados`` module, import it into your source file.
+
+.. code-block:: python
+ :linenos:
+
+ import rados
+
+
+Configure a Cluster Handle
+--------------------------
+
+Before connecting to the Ceph Storage Cluster, create a cluster handle. By
+default, the cluster handle assumes a cluster named ``ceph`` (the default used
+by deployment tools and by our Getting Started guides) and a ``client.admin``
+user name. You may change these defaults to suit your needs, as sketched
+below.
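+
+For example, to connect to a differently named cluster or as a different user,
+you can pass these values explicitly (a brief sketch; the names below are
+illustrative):
+
+.. code-block:: python
+
+   import rados
+
+   # 'admin' is shorthand for the client.admin user
+   cluster = rados.Rados(conffile='ceph.conf',
+                         clustername='ceph',
+                         rados_id='admin')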
+
+To connect to the Ceph Storage Cluster, your application needs to know where to
+find the Ceph Monitor. Provide this information to your application by
+specifying the path to your Ceph configuration file, which contains the location
+of the initial Ceph monitors.
+
+.. code-block:: python
+ :linenos:
+
+ import rados, sys
+
+ #Create Handle Examples.
+ cluster = rados.Rados(conffile='ceph.conf')
+ cluster = rados.Rados(conffile=sys.argv[1])
+ cluster = rados.Rados(conffile = 'ceph.conf', conf = dict (keyring = '/path/to/keyring'))
+
+Ensure that the ``conffile`` argument provides the path and file name of your
+Ceph configuration file. You may use the ``sys`` module to avoid hard-coding the
+Ceph configuration path and file name.
+
+Your Python client also requires a client keyring. For this example, we use the
+``client.admin`` key by default. If you would like to specify the keyring when
+creating the cluster handle, you may use the ``conf`` argument. Alternatively,
+you may specify the keyring path in your Ceph configuration file. For example,
+you may add something like the following line to your Ceph configuration file::
+
+ keyring = /path/to/ceph.client.admin.keyring
+
+For additional details on modifying your configuration via Python, see `Configuration`_.
+
+
+Connect to the Cluster
+----------------------
+
+Once you have a cluster handle configured, you may connect to the cluster.
+With a connection to the cluster, you may execute methods that return
+information about the cluster.
+
+.. code-block:: python
+ :linenos:
+ :emphasize-lines: 7
+
+ import rados, sys
+
+ cluster = rados.Rados(conffile='ceph.conf')
+ print("\nlibrados version: {}".format(str(cluster.version())))
+ print("Will attempt to connect to: {}".format(str(cluster.conf_get('mon host'))))
+
+ cluster.connect()
+ print("\nCluster ID: {}".format(cluster.get_fsid()))
+
+ print("\n\nCluster Statistics")
+ print("==================")
+ cluster_stats = cluster.get_cluster_stats()
+
+ for key, value in cluster_stats.items():
+ print(key, value)
+
+
+By default, Ceph authentication is ``on``. Your application will need to know
+the location of the keyring. The ``python-ceph`` module does not assume a
+default keyring location, so you need to specify the keyring path. The easiest
+way to specify the keyring is to add it to the Ceph configuration file. The
+following Ceph
+configuration file example uses the ``client.admin`` keyring.
+
+.. code-block:: ini
+ :linenos:
+
+ [global]
+ # ... elided configuration
+ keyring = /path/to/keyring/ceph.client.admin.keyring
+
+
+Manage Pools
+------------
+
+When connected to the cluster, the ``Rados`` API allows you to manage pools. You
+can list pools, check for the existence of a pool, create a pool and delete a
+pool.
+
+.. code-block:: python
+ :linenos:
+ :emphasize-lines: 6, 13, 18, 25
+
+ print("\n\nPool Operations")
+ print("===============")
+
+ print("\nAvailable Pools")
+ print("----------------")
+ pools = cluster.list_pools()
+
+ for pool in pools:
+ print(pool)
+
+ print("\nCreate 'test' Pool")
+ print("------------------")
+ cluster.create_pool('test')
+
+ print("\nPool named 'test' exists: {}".format(str(cluster.pool_exists('test'))))
+ print("\nVerify 'test' Pool Exists")
+ print("-------------------------")
+ pools = cluster.list_pools()
+
+ for pool in pools:
+ print(pool)
+
+ print("\nDelete 'test' Pool")
+ print("------------------")
+ cluster.delete_pool('test')
+ print("\nPool named 'test' exists: {}".format(str(cluster.pool_exists('test'))))
+
+
+Input/Output Context
+--------------------
+
+Reading from and writing to the Ceph Storage Cluster requires an input/output
+context (ioctx). You can create an ioctx with the ``open_ioctx()`` or
+``open_ioctx2()`` method of the ``Rados`` class. The ``ioctx_name`` parameter
+of ``open_ioctx()`` is the name of the pool, and the ``pool_id`` parameter of
+``open_ioctx2()`` is the ID of the pool you wish to use.
+
+.. code-block:: python
+ :linenos:
+
+ ioctx = cluster.open_ioctx('data')
+
+
+or
+
+.. code-block:: python
+ :linenos:
+
+ ioctx = cluster.open_ioctx2(pool_id)
+
+
+Once you have an I/O context, you can read/write objects, extended attributes,
+and perform a number of other operations. After you complete operations, ensure
+that you close the connection. For example:
+
+.. code-block:: python
+ :linenos:
+
+ print("\nClosing the connection.")
+ ioctx.close()
+
+
+Writing, Reading and Removing Objects
+-------------------------------------
+
+Once you create an I/O context, you can write objects to the cluster. If you
+write to an object that doesn't exist, Ceph creates it. If you write to an
+object that exists, Ceph overwrites it (except when you specify a range, and
+then it only overwrites the range). You may read objects (and object ranges)
+from the cluster. You may also remove objects from the cluster. For example:
+
+.. code-block:: python
+ :linenos:
+ :emphasize-lines: 2, 5, 8
+
+ print("\nWriting object 'hw' with contents 'Hello World!' to pool 'data'.")
+ ioctx.write_full("hw", "Hello World!")
+
+ print("\n\nContents of object 'hw'\n------------------------\n")
+ print(ioctx.read("hw"))
+
+ print("\nRemoving object 'hw'")
+ ioctx.remove_object("hw")
+
+
+Writing and Reading XATTRS
+--------------------------
+
+Once you create an object, you can write extended attributes (XATTRs) to
+the object and read XATTRs from the object. For example:
+
+.. code-block:: python
+ :linenos:
+ :emphasize-lines: 2, 5
+
+ print("\n\nWriting XATTR 'lang' with value 'en_US' to object 'hw'")
+ ioctx.set_xattr("hw", "lang", "en_US")
+
+ print("\n\nGetting XATTR 'lang' from object 'hw'\n")
+ print(ioctx.get_xattr("hw", "lang"))
+
+
+Listing Objects
+---------------
+
+If you want to examine the list of objects in a pool, you may
+retrieve the list of objects and iterate over them with the object iterator.
+For example:
+
+.. code-block:: python
+ :linenos:
+ :emphasize-lines: 1, 6, 7, 13
+
+ object_iterator = ioctx.list_objects()
+
+   while True:
+
+       try:
+           rados_object = next(object_iterator)
+           print("Object contents = {}".format(rados_object.read()))
+
+       except StopIteration:
+           break
+
+   # Or, more idiomatically:
+   for obj in ioctx.list_objects():
+       print("Object contents = {}".format(obj.read()))
+
+The ``Object`` class provides a file-like interface to an object, allowing
+you to read and write content and extended attributes. Object operations using
+the I/O context provide additional functionality and asynchronous capabilities.
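+
+As a brief sketch (reusing the ``ioctx`` from above and assuming the pool
+still contains the ``hw`` object written earlier), each object can be handled
+in a file-like way:
+
+.. code-block:: python
+
+   for obj in ioctx.list_objects():
+       # each Object exposes read/write and XATTR accessors
+       print("{}: {}".format(obj.key, obj.read()))
+       for name, value in obj.get_xattrs():
+           print("  xattr {} = {}".format(name, value))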
+
+
+Cluster Handle API
+==================
+
+The ``Rados`` class provides an interface into the Ceph Storage Cluster.
+
+
+Configuration
+-------------
+
+The ``Rados`` class provides methods for getting and setting configuration
+values, reading the Ceph configuration file, and parsing arguments. You do not
+need to be connected to the Ceph Storage Cluster to invoke the following
+methods. See `Storage Cluster Configuration`_ for details on settings. A short
+sketch follows the method list below.
+
+.. currentmodule:: rados
+.. automethod:: Rados.conf_get(option)
+.. automethod:: Rados.conf_set(option, val)
+.. automethod:: Rados.conf_read_file(path=None)
+.. automethod:: Rados.conf_parse_argv(args)
+.. automethod:: Rados.version()
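+
+For example (a brief sketch; the monitor address shown is illustrative, and no
+connection is required for these calls):
+
+.. code-block:: python
+
+   cluster = rados.Rados(conffile='ceph.conf')
+   cluster.conf_set('mon_host', '10.0.0.1')
+   print(cluster.conf_get('mon_host'))
+   print(cluster.version())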
+
+
+Connection Management
+---------------------
+
+Once you configure your cluster handle, you may connect to the cluster, check
+the cluster ``fsid``, retrieve cluster statistics, and disconnect (shutdown)
+from the cluster. You may also assert that the cluster handle is in a particular
+state (e.g., "configuring", "connecting", etc.).
+
+.. automethod:: Rados.connect(timeout=0)
+.. automethod:: Rados.shutdown()
+.. automethod:: Rados.get_fsid()
+.. automethod:: Rados.get_cluster_stats()
+
+.. documented manually because it raises warnings because of *args usage in the
+.. signature
+
+.. py:class:: Rados
+
+ .. py:method:: require_state(*args)
+
+ Checks if the Rados object is in a special state
+
+ :param args: Any number of states to check as separate arguments
+ :raises: :class:`RadosStateError`
+
+
+Pool Operations
+---------------
+
+To use pool operation methods, you must connect to the Ceph Storage Cluster
+first. You may list the available pools, create a pool, check to see if a pool
+exists, and delete a pool.
+
+.. automethod:: Rados.list_pools()
+.. automethod:: Rados.create_pool(pool_name, crush_rule=None)
+.. automethod:: Rados.pool_exists(pool_name)
+.. automethod:: Rados.delete_pool(pool_name)
+
+
+CLI Commands
+------------
+
+The Ceph CLI commands internally use the following librados Python binding
+methods.
+
+To send a command, choose the method that corresponds to the intended target
+(monitor, OSD, manager, or placement group), as in the sketch after the
+method list.
+
+.. automethod:: Rados.mon_command
+.. automethod:: Rados.osd_command
+.. automethod:: Rados.mgr_command
+.. automethod:: Rados.pg_command
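+
+For example, here is a brief sketch of sending the ``status`` command to the
+monitors (it reuses the connected ``cluster`` handle from above; ``outbuf``
+holds the JSON reply):
+
+.. code-block:: python
+
+   import json
+
+   cmd = json.dumps({"prefix": "status", "format": "json"})
+   ret, outbuf, outs = cluster.mon_command(cmd, b'')
+   if ret == 0:
+       print(json.loads(outbuf)["fsid"])
+   else:
+       print("mon_command failed: {}".format(outs))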
+
+
+Input/Output Context API
+========================
+
+To write data to and read data from the Ceph Object Store, you must create
+an Input/Output context (ioctx). The `Rados` class provides `open_ioctx()`
+and `open_ioctx2()` methods. The remaining ``ioctx`` operations involve
+invoking methods of the `Ioctx` and other classes.
+
+.. automethod:: Rados.open_ioctx(ioctx_name)
+.. automethod:: Ioctx.require_ioctx_open()
+.. automethod:: Ioctx.get_stats()
+.. automethod:: Ioctx.get_last_version()
+.. automethod:: Ioctx.close()
+
+
+.. Pool Snapshots
+.. --------------
+
+.. The Ceph Storage Cluster allows you to make a snapshot of a pool's state.
+.. Whereas, basic pool operations only require a connection to the cluster,
+.. snapshots require an I/O context.
+
+.. Ioctx.create_snap(self, snap_name)
+.. Ioctx.list_snaps(self)
+.. SnapIterator.next(self)
+.. Snap.get_timestamp(self)
+.. Ioctx.lookup_snap(self, snap_name)
+.. Ioctx.remove_snap(self, snap_name)
+
+.. not published. This doesn't seem ready yet.
+
+Object Operations
+-----------------
+
+The Ceph Storage Cluster stores data as objects. You can read and write objects
+synchronously or asynchronously. You can read and write from offsets. An object
+has a name (or key) and data. A sketch of an asynchronous write follows the
+method list below.
+
+
+.. automethod:: Ioctx.aio_write(object_name, to_write, offset=0, oncomplete=None, onsafe=None)
+.. automethod:: Ioctx.aio_write_full(object_name, to_write, oncomplete=None, onsafe=None)
+.. automethod:: Ioctx.aio_append(object_name, to_append, oncomplete=None, onsafe=None)
+.. automethod:: Ioctx.write(key, data, offset=0)
+.. automethod:: Ioctx.write_full(key, data)
+.. automethod:: Ioctx.aio_flush()
+.. automethod:: Ioctx.set_locator_key(loc_key)
+.. automethod:: Ioctx.aio_read(object_name, length, offset, oncomplete)
+.. automethod:: Ioctx.read(key, length=8192, offset=0)
+.. automethod:: Ioctx.stat(key)
+.. automethod:: Ioctx.trunc(key, size)
+.. automethod:: Ioctx.remove_object(key)
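+
+Below is a brief sketch of an asynchronous write (it reuses the ``ioctx`` from
+the tutorial above; the callback simply reports the result):
+
+.. code-block:: python
+
+   def on_complete(completion):
+       # invoked once the write is acknowledged by the cluster
+       print("aio_write_full returned {}".format(completion.get_return_value()))
+
+   completion = ioctx.aio_write_full("hw", b"Hello World!",
+                                     oncomplete=on_complete)
+   completion.wait_for_complete()
+   ioctx.aio_flush()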
+
+
+Object Extended Attributes
+--------------------------
+
+You may set extended attributes (XATTRs) on an object. You can also retrieve
+an object's XATTRs and iterate over them, as in the sketch after the method
+list.
+
+.. automethod:: Ioctx.set_xattr(key, xattr_name, xattr_value)
+.. automethod:: Ioctx.get_xattrs(oid)
+.. automethod:: XattrIterator.__next__()
+.. automethod:: Ioctx.get_xattr(key, xattr_name)
+.. automethod:: Ioctx.rm_xattr(key, xattr_name)
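+
+A brief sketch of iterating over an object's XATTRs (again reusing ``ioctx``
+and the ``hw`` object from the tutorial):
+
+.. code-block:: python
+
+   for name, value in ioctx.get_xattrs("hw"):
+       print("xattr {} = {}".format(name, value))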
+
+
+
+Object Interface
+================
+
+From an I/O context, you can retrieve a list of objects from a pool and iterate
+over them. The object interface makes each object look like a file, and you
+may perform synchronous operations on the objects. For asynchronous
+operations, use the I/O context methods.
+
+.. automethod:: Ioctx.list_objects()
+.. automethod:: ObjectIterator.__next__()
+.. automethod:: Object.read(length = 1024*1024)
+.. automethod:: Object.write(string_to_write)
+.. automethod:: Object.get_xattrs()
+.. automethod:: Object.get_xattr(xattr_name)
+.. automethod:: Object.set_xattr(xattr_name, xattr_value)
+.. automethod:: Object.rm_xattr(xattr_name)
+.. automethod:: Object.stat()
+.. automethod:: Object.remove()
+
+
+
+
+.. _Getting Started: ../../../start
+.. _Storage Cluster Configuration: ../../configuration
+.. _Getting librados for Python: ../librados-intro#getting-librados-for-python
diff --git a/doc/rados/command/list-inconsistent-obj.json b/doc/rados/command/list-inconsistent-obj.json
new file mode 100644
index 000000000..2bdc5f74c
--- /dev/null
+++ b/doc/rados/command/list-inconsistent-obj.json
@@ -0,0 +1,237 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "type": "object",
+ "properties": {
+ "epoch": {
+ "description": "Scrub epoch",
+ "type": "integer"
+ },
+ "inconsistents": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "object": {
+ "description": "Identify a Ceph object",
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "nspace": {
+ "type": "string"
+ },
+ "locator": {
+ "type": "string"
+ },
+ "version": {
+ "type": "integer",
+ "minimum": 0
+ },
+ "snap": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [ "head", "snapdir" ]
+ },
+ {
+ "type": "integer",
+ "minimum": 0
+ }
+ ]
+ }
+ },
+ "required": [
+ "name",
+ "nspace",
+ "locator",
+ "version",
+ "snap"
+ ]
+ },
+ "selected_object_info": {
+ "type": "object",
+ "description": "Selected object information",
+ "additionalProperties": true
+ },
+ "union_shard_errors": {
+ "description": "Union of all shard errors",
+ "type": "array",
+ "items": {
+ "enum": [
+ "missing",
+ "stat_error",
+ "read_error",
+ "data_digest_mismatch_info",
+ "omap_digest_mismatch_info",
+ "size_mismatch_info",
+ "ec_hash_error",
+ "ec_size_error",
+ "info_missing",
+ "info_corrupted",
+ "obj_size_info_mismatch",
+ "snapset_missing",
+ "snapset_corrupted",
+ "hinfo_missing",
+ "hinfo_corrupted"
+ ]
+ },
+ "minItems": 0,
+ "uniqueItems": true
+ },
+ "errors": {
+ "description": "Errors related to the analysis of this object",
+ "type": "array",
+ "items": {
+ "enum": [
+ "object_info_inconsistency",
+ "data_digest_mismatch",
+ "omap_digest_mismatch",
+ "size_mismatch",
+ "attr_value_mismatch",
+ "attr_name_mismatch",
+ "snapset_inconsistency",
+ "hinfo_inconsistency",
+ "size_too_large"
+ ]
+ },
+ "minItems": 0,
+ "uniqueItems": true
+ },
+ "shards": {
+ "description": "All found or expected shards",
+ "type": "array",
+ "items": {
+ "description": "Information about a particular shard of object",
+ "type": "object",
+ "properties": {
+ "object_info": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "description": "Object information",
+ "additionalProperties": true
+ }
+ ]
+ },
+ "snapset": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "description": "Snap set information",
+ "additionalProperties": true
+ }
+ ]
+ },
+ "hashinfo": {
+ "oneOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "object",
+ "description": "Erasure code hash information",
+ "additionalProperties": true
+ }
+ ]
+ },
+ "shard": {
+ "type": "integer"
+ },
+ "osd": {
+ "type": "integer"
+ },
+ "primary": {
+ "type": "boolean"
+ },
+ "size": {
+ "type": "integer"
+ },
+ "omap_digest": {
+ "description": "Hex representation (e.g. 0x1abd1234)",
+ "type": "string"
+ },
+ "data_digest": {
+ "description": "Hex representation (e.g. 0x1abd1234)",
+ "type": "string"
+ },
+ "errors": {
+ "description": "Errors with this shard",
+ "type": "array",
+ "items": {
+ "enum": [
+ "missing",
+ "stat_error",
+ "read_error",
+ "data_digest_mismatch_info",
+ "omap_digest_mismatch_info",
+ "size_mismatch_info",
+ "ec_hash_error",
+ "ec_size_error",
+ "info_missing",
+ "info_corrupted",
+ "obj_size_info_mismatch",
+ "snapset_missing",
+ "snapset_corrupted",
+ "hinfo_missing",
+ "hinfo_corrupted"
+ ]
+ },
+ "minItems": 0,
+ "uniqueItems": true
+ },
+ "attrs": {
+ "description": "If any shard's attr error is set then all attrs are here",
+ "type": "array",
+ "items": {
+ "description": "Information about a particular shard of object",
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "value": {
+ "type": "string"
+ },
+ "Base64": {
+ "type": "boolean"
+ }
+ },
+ "required": [
+ "name",
+ "value",
+ "Base64"
+ ],
+ "additionalProperties": false
+ }
+ }
+ },
+ "additionalProperties": false,
+ "required": [
+ "osd",
+ "primary",
+ "errors"
+ ]
+ }
+ }
+ },
+ "required": [
+ "object",
+ "union_shard_errors",
+ "errors",
+ "shards"
+ ]
+ }
+ }
+ },
+ "required": [
+ "epoch",
+ "inconsistents"
+ ]
+}
diff --git a/doc/rados/command/list-inconsistent-snap.json b/doc/rados/command/list-inconsistent-snap.json
new file mode 100644
index 000000000..55f1d53e9
--- /dev/null
+++ b/doc/rados/command/list-inconsistent-snap.json
@@ -0,0 +1,86 @@
+{
+ "$schema": "http://json-schema.org/draft-04/schema#",
+ "type": "object",
+ "properties": {
+ "epoch": {
+ "description": "Scrub epoch",
+ "type": "integer"
+ },
+ "inconsistents": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "nspace": {
+ "type": "string"
+ },
+ "locator": {
+ "type": "string"
+ },
+ "snap": {
+ "oneOf": [
+ {
+ "type": "string",
+ "enum": [
+ "head",
+ "snapdir"
+ ]
+ },
+ {
+ "type": "integer",
+ "minimum": 0
+ }
+ ]
+ },
+ "errors": {
+ "description": "Errors for this object's snap",
+ "type": "array",
+ "items": {
+ "enum": [
+ "snapset_missing",
+ "snapset_corrupted",
+ "info_missing",
+ "info_corrupted",
+ "snapset_error",
+ "headless",
+ "size_mismatch",
+ "extra_clones",
+ "clone_missing"
+ ]
+ },
+ "minItems": 0,
+ "uniqueItems": true
+ },
+ "missing": {
+ "description": "List of missing clones if clone_missing error set",
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ },
+ "extra_clones": {
+ "description": "List of extra clones if extra_clones error set",
+ "type": "array",
+ "items": {
+ "type": "integer"
+ }
+ }
+ },
+ "required": [
+ "name",
+ "nspace",
+ "locator",
+ "snap",
+ "errors"
+ ]
+ }
+ }
+ },
+ "required": [
+ "epoch",
+ "inconsistents"
+ ]
+}
diff --git a/doc/rados/configuration/auth-config-ref.rst b/doc/rados/configuration/auth-config-ref.rst
new file mode 100644
index 000000000..fc14f4ee6
--- /dev/null
+++ b/doc/rados/configuration/auth-config-ref.rst
@@ -0,0 +1,379 @@
+.. _rados-cephx-config-ref:
+
+========================
+ CephX Config Reference
+========================
+
+The CephX protocol is enabled by default. The cryptographic authentication that
+CephX provides has some computational costs, though they should generally be
+quite low. If the network environment connecting your client and server hosts
+is trusted and you cannot afford the cost of authentication, you can disable
+it. **Disabling authentication is not generally recommended**.
+
+.. note:: If you disable authentication, you will be at risk of a
+ man-in-the-middle attack that alters your client/server messages, which
+ could have disastrous security effects.
+
+For information about creating users, see `User Management`_. For details on
+the architecture of CephX, see `Architecture - High Availability
+Authentication`_.
+
+
+Deployment Scenarios
+====================
+
+How you initially configure CephX depends on your scenario. There are two
+common strategies for deploying a Ceph cluster. If you are a first-time Ceph
+user, you should probably take the easiest approach: using ``cephadm`` to
+deploy a cluster. But if your cluster uses other deployment tools (for example,
+Ansible, Chef, Juju, or Puppet), you will need either to use the manual
+deployment procedures or to configure your deployment tool so that it will
+bootstrap your monitor(s).
+
+Manual Deployment
+-----------------
+
+When you deploy a cluster manually, it is necessary to bootstrap the monitors
+manually and to create the ``client.admin`` user and keyring. To bootstrap
+monitors, follow the steps in `Monitor Bootstrapping`_. Follow these steps when
+using third-party deployment tools (for example, Chef, Puppet, and Juju).
+
+
+Enabling/Disabling CephX
+========================
+
+Enabling CephX is possible only if the keys for your monitors, OSDs, and
+metadata servers have already been deployed. If you are simply toggling CephX
+on or off, it is not necessary to repeat the bootstrapping procedures.
+
+Enabling CephX
+--------------
+
+When CephX is enabled, Ceph will look for the keyring in the default search
+path: this path includes ``/etc/ceph/$cluster.$name.keyring``. It is possible
+to override this search-path location by adding a ``keyring`` option in the
+``[global]`` section of your `Ceph configuration`_ file, but this is not
+recommended.
+
+To enable CephX on a cluster for which authentication has been disabled, carry
+out the following procedure. If you (or your deployment utility) have already
+generated the keys, you may skip the steps related to generating keys.
+
+#. Create a ``client.admin`` key, and save a copy of the key for your client
+ host:
+
+ .. prompt:: bash $
+
+ ceph auth get-or-create client.admin mon 'allow *' mds 'allow *' mgr 'allow *' osd 'allow *' -o /etc/ceph/ceph.client.admin.keyring
+
+ **Warning:** This step will clobber any existing
+   ``/etc/ceph/ceph.client.admin.keyring`` file. Do not perform this step if a
+ deployment tool has already generated a keyring file for you. Be careful!
+
+#. Create a monitor keyring and generate a monitor secret key:
+
+ .. prompt:: bash $
+
+ ceph-authtool --create-keyring /tmp/ceph.mon.keyring --gen-key -n mon. --cap mon 'allow *'
+
+#. For each monitor, copy the monitor keyring into a ``ceph.mon.keyring`` file
+ in the monitor's ``mon data`` directory. For example, to copy the monitor
+ keyring to ``mon.a`` in a cluster called ``ceph``, run the following
+ command:
+
+ .. prompt:: bash $
+
+ cp /tmp/ceph.mon.keyring /var/lib/ceph/mon/ceph-a/keyring
+
+#. Generate a secret key for every MGR, where ``{$id}`` is the MGR letter:
+
+ .. prompt:: bash $
+
+ ceph auth get-or-create mgr.{$id} mon 'allow profile mgr' mds 'allow *' osd 'allow *' -o /var/lib/ceph/mgr/ceph-{$id}/keyring
+
+#. Generate a secret key for every OSD, where ``{$id}`` is the OSD number:
+
+ .. prompt:: bash $
+
+ ceph auth get-or-create osd.{$id} mon 'allow rwx' osd 'allow *' -o /var/lib/ceph/osd/ceph-{$id}/keyring
+
+#. Generate a secret key for every MDS, where ``{$id}`` is the MDS letter:
+
+ .. prompt:: bash $
+
+ ceph auth get-or-create mds.{$id} mon 'allow rwx' osd 'allow *' mds 'allow *' mgr 'allow profile mds' -o /var/lib/ceph/mds/ceph-{$id}/keyring
+
+#. Enable CephX authentication by setting the following options in the
+ ``[global]`` section of your `Ceph configuration`_ file:
+
+ .. code-block:: ini
+
+ auth_cluster_required = cephx
+ auth_service_required = cephx
+ auth_client_required = cephx
+
+#. Start or restart the Ceph cluster. For details, see `Operating a Cluster`_.
+
+For details on bootstrapping a monitor manually, see `Manual Deployment`_.
+
+
+
+Disabling CephX
+---------------
+
+The following procedure describes how to disable CephX. If your cluster
+environment is safe, you might want to disable CephX in order to offset the
+computational expense of running authentication. **We do not recommend doing
+so.** However, setup and troubleshooting might be easier if authentication is
+temporarily disabled and subsequently re-enabled.
+
+#. Disable CephX authentication by setting the following options in the
+ ``[global]`` section of your `Ceph configuration`_ file:
+
+ .. code-block:: ini
+
+ auth_cluster_required = none
+ auth_service_required = none
+ auth_client_required = none
+
+#. Start or restart the Ceph cluster. For details, see `Operating a Cluster`_.
+
+
+Configuration Settings
+======================
+
+Enablement
+----------
+
+
+``auth_cluster_required``
+
+:Description: If this configuration setting is enabled, the Ceph Storage
+ Cluster daemons (that is, ``ceph-mon``, ``ceph-osd``,
+ ``ceph-mds``, and ``ceph-mgr``) are required to authenticate with
+ each other. Valid settings are ``cephx`` or ``none``.
+
+:Type: String
+:Required: No
+:Default: ``cephx``.
+
+
+``auth_service_required``
+
+:Description: If this configuration setting is enabled, then Ceph clients can
+ access Ceph services only if those clients authenticate with the
+ Ceph Storage Cluster. Valid settings are ``cephx`` or ``none``.
+
+:Type: String
+:Required: No
+:Default: ``cephx``.
+
+
+``auth_client_required``
+
+:Description: If this configuration setting is enabled, then communication
+ between the Ceph client and Ceph Storage Cluster can be
+ established only if the Ceph Storage Cluster authenticates
+ against the Ceph client. Valid settings are ``cephx`` or
+ ``none``.
+
+:Type: String
+:Required: No
+:Default: ``cephx``.
+
+
+.. index:: keys; keyring
+
+Keys
+----
+
+When Ceph is run with authentication enabled, ``ceph`` administrative commands
+and Ceph clients can access the Ceph Storage Cluster only if they use
+authentication keys.
+
+The most common way to make these keys available to ``ceph`` administrative
+commands and Ceph clients is to include a Ceph keyring under the ``/etc/ceph``
+directory. For Octopus and later releases that use ``cephadm``, the filename is
+usually ``ceph.client.admin.keyring``. If the keyring is included in the
+``/etc/ceph`` directory, then it is unnecessary to specify a ``keyring`` entry
+in the Ceph configuration file.
+
+Because the Ceph Storage Cluster's keyring file contains the ``client.admin``
+key, we recommend copying the keyring file to nodes from which you run
+administrative commands.
+
+To perform this step manually, run the following command:
+
+.. prompt:: bash $
+
+ sudo scp {user}@{ceph-cluster-host}:/etc/ceph/ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring
+
+.. tip:: Make sure that the ``ceph.client.admin.keyring`` file has appropriate
+   permissions (for example, ``chmod 644``) set on your client machine.
+
+You can specify the key itself by using the ``key`` setting in the Ceph
+configuration file (this approach is not recommended), or instead specify a
+path to a keyfile by using the ``keyfile`` setting in the Ceph configuration
+file.
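+
+For example, a client section of the Ceph configuration file might reference
+a keyring or a keyfile as follows (an illustrative sketch; the paths are
+placeholders):
+
+.. code-block:: ini
+
+    [client.admin]
+    keyring = /etc/ceph/ceph.client.admin.keyring
+    # or, alternatively, a file that contains only the key:
+    # keyfile = /etc/ceph/client.admin.secret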
+
+``keyring``
+
+:Description: The path to the keyring file.
+:Type: String
+:Required: No
+:Default: ``/etc/ceph/$cluster.$name.keyring,/etc/ceph/$cluster.keyring,/etc/ceph/keyring,/etc/ceph/keyring.bin``
+
+
+``keyfile``
+
+:Description: The path to a keyfile (that is, a file containing only the key).
+:Type: String
+:Required: No
+:Default: None
+
+
+``key``
+
+:Description: The key (that is, the text string of the key itself). We do not
+ recommend that you use this setting unless you know what you're
+ doing.
+:Type: String
+:Required: No
+:Default: None
+
+
+Daemon Keyrings
+---------------
+
+Administrative users or deployment tools (for example, ``cephadm``) generate
+daemon keyrings in the same way that they generate user keyrings. By default,
+Ceph stores the keyring of a daemon inside that daemon's data directory. The
+default keyring locations and the capabilities that are necessary for the
+daemon to function are shown below.
+
+``ceph-mon``
+
+:Location: ``$mon_data/keyring``
+:Capabilities: ``mon 'allow *'``
+
+``ceph-osd``
+
+:Location: ``$osd_data/keyring``
+:Capabilities: ``mgr 'allow profile osd' mon 'allow profile osd' osd 'allow *'``
+
+``ceph-mds``
+
+:Location: ``$mds_data/keyring``
+:Capabilities: ``mds 'allow' mgr 'allow profile mds' mon 'allow profile mds' osd 'allow rwx'``
+
+``ceph-mgr``
+
+:Location: ``$mgr_data/keyring``
+:Capabilities: ``mon 'allow profile mgr' mds 'allow *' osd 'allow *'``
+
+``radosgw``
+
+:Location: ``$rgw_data/keyring``
+:Capabilities: ``mon 'allow rwx' osd 'allow rwx'``
+
+
+.. note:: The monitor keyring (that is, ``mon.``) contains a key but no
+ capabilities, and this keyring is not part of the cluster ``auth`` database.
+
+The daemon's data-directory locations default to directories of the form::
+
+ /var/lib/ceph/$type/$cluster-$id
+
+For example, ``osd.12`` would have the following data directory::
+
+ /var/lib/ceph/osd/ceph-12
+
+It is possible to override these locations, but it is not recommended.
+
+
+.. index:: signatures
+
+Signatures
+----------
+
+Ceph performs a signature check that provides some limited protection against
+messages being tampered with in flight (for example, by a "man in the middle"
+attack).
+
+As with other parts of Ceph authentication, signatures admit of fine-grained
+control. You can enable or disable signatures for service messages between
+clients and Ceph, and for messages between Ceph daemons.
+
+Note that even when signatures are enabled, data is not encrypted in flight.
+
+``cephx_require_signatures``
+
+:Description: If this configuration setting is set to ``true``, Ceph requires
+ signatures on all message traffic between the Ceph client and the
+ Ceph Storage Cluster, and between daemons within the Ceph Storage
+ Cluster.
+
+.. note::
+ **ANTIQUATED NOTE:**
+
+ Neither Ceph Argonaut nor Linux kernel versions prior to 3.19
+ support signatures; if one of these clients is in use, ``cephx_require_signatures``
+ can be disabled in order to allow the client to connect.
+
+
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``cephx_cluster_require_signatures``
+
+:Description: If this configuration setting is set to ``true``, Ceph requires
+ signatures on all message traffic between Ceph daemons within the
+ Ceph Storage Cluster.
+
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``cephx_service_require_signatures``
+
+:Description: If this configuration setting is set to ``true``, Ceph requires
+ signatures on all message traffic between Ceph clients and the
+ Ceph Storage Cluster.
+
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``cephx_sign_messages``
+
+:Description: If this configuration setting is set to ``true``, and if the Ceph
+ version supports message signing, then Ceph will sign all
+ messages so that they are more difficult to spoof.
+
+:Type: Boolean
+:Default: ``true``
+
+
+Time to Live
+------------
+
+``auth_service_ticket_ttl``
+
+:Description: When the Ceph Storage Cluster sends a ticket for authentication
+ to a Ceph client, the Ceph Storage Cluster assigns that ticket a
+ Time To Live (TTL).
+
+:Type: Double
+:Default: ``60*60``
+
+
+.. _Monitor Bootstrapping: ../../../install/manual-deployment#monitor-bootstrapping
+.. _Operating a Cluster: ../../operations/operating
+.. _Manual Deployment: ../../../install/manual-deployment
+.. _Ceph configuration: ../ceph-conf
+.. _Architecture - High Availability Authentication: ../../../architecture#high-availability-authentication
+.. _User Management: ../../operations/user-management
diff --git a/doc/rados/configuration/bluestore-config-ref.rst b/doc/rados/configuration/bluestore-config-ref.rst
new file mode 100644
index 000000000..3707be1aa
--- /dev/null
+++ b/doc/rados/configuration/bluestore-config-ref.rst
@@ -0,0 +1,552 @@
+==================================
+ BlueStore Configuration Reference
+==================================
+
+Devices
+=======
+
+BlueStore manages either one, two, or in certain cases three storage devices.
+These *devices* are "devices" in the Linux/Unix sense. This means that they are
+assets listed under ``/dev`` or ``/devices``. Each of these devices may be an
+entire storage drive, or a partition of a storage drive, or a logical volume.
+BlueStore does not create or mount a conventional file system on devices that
+it uses; BlueStore reads and writes to the devices directly in a "raw" fashion.
+
+In the simplest case, BlueStore consumes all of a single storage device. This
+device is known as the *primary device*. The primary device is identified by
+the ``block`` symlink in the data directory.
+
+The data directory is a ``tmpfs`` mount. It is populated, at boot time or when
+``ceph-volume`` activates it, with metadata files and links that hold
+information about the OSD: for example, the OSD's identifier, the name of the
+cluster that the OSD belongs to, and the OSD's private keyring.
+
+In more complicated cases, BlueStore is deployed across one or two additional
+devices:
+
+* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data
+ directory) can be used to separate out BlueStore's internal journal or
+ write-ahead log. Using a WAL device is advantageous only if the WAL device
+ is faster than the primary device (for example, if the WAL device is an SSD
+ and the primary device is an HDD).
+* A *DB device* (identified as ``block.db`` in the data directory) can be used
+ to store BlueStore's internal metadata. BlueStore (or more precisely, the
+ embedded RocksDB) will put as much metadata as it can on the DB device in
+ order to improve performance. If the DB device becomes full, metadata will
+ spill back onto the primary device (where it would have been located in the
+ absence of the DB device). Again, it is advantageous to provision a DB device
+ only if it is faster than the primary device.
+
+If there is only a small amount of fast storage available (for example, less
+than a gigabyte), we recommend using the available space as a WAL device. But
+if more fast storage is available, it makes more sense to provision a DB
+device. Because the BlueStore journal is always placed on the fastest device
+available, using a DB device provides the same benefit that using a WAL device
+would, while *also* allowing additional metadata to be stored off the primary
+device (provided that it fits). DB devices make this possible because whenever
+a DB device is specified but an explicit WAL device is not, the WAL will be
+implicitly colocated with the DB on the faster device.
+
+To provision a single-device (colocated) BlueStore OSD, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph-volume lvm prepare --bluestore --data <device>
+
+To specify a WAL device or DB device, run the following command:
+
+.. prompt:: bash $
+
+ ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
+
+.. note:: The option ``--data`` can take as its argument any of the
+ following devices: logical volumes specified using *vg/lv* notation,
+ existing logical volumes, and GPT partitions.
+
+
+
+Provisioning strategies
+-----------------------
+
+BlueStore differs from Filestore in that there are several ways to deploy a
+BlueStore OSD. However, the overall deployment strategy for BlueStore can be
+clarified by examining just these two common arrangements:
+
+.. _bluestore-single-type-device-config:
+
+**block (data) only**
+^^^^^^^^^^^^^^^^^^^^^
+If all devices are of the same type (for example, they are all HDDs), and if
+there are no fast devices available for the storage of metadata, then it makes
+sense to specify the block device only and to leave ``block.db`` and
+``block.wal`` unseparated. The :ref:`ceph-volume-lvm` command for a single
+``/dev/sda`` device is as follows:
+
+.. prompt:: bash $
+
+ ceph-volume lvm create --bluestore --data /dev/sda
+
+If the devices to be used for a BlueStore OSD are pre-created logical volumes,
+then the :ref:`ceph-volume-lvm` call for a logical volume named
+``ceph-vg/block-lv`` is as follows:
+
+.. prompt:: bash $
+
+ ceph-volume lvm create --bluestore --data ceph-vg/block-lv
+
+.. _bluestore-mixed-device-config:
+
+**block and block.db**
+^^^^^^^^^^^^^^^^^^^^^^
+
+If you have a mix of fast and slow devices (for example, SSDs and HDDs), then
+we recommend placing ``block.db`` on the faster device while ``block`` (that
+is, the data) is stored on the slower device (that is, the rotational drive).
+
+You must create these volume groups and logical volumes manually, as the
+``ceph-volume`` tool is currently unable to create them automatically.
+
+The following procedure illustrates the manual creation of volume groups and
+logical volumes. For this example, we shall assume four rotational drives
+(``sda``, ``sdb``, ``sdc``, and ``sdd``) and one (fast) SSD (``sdx``). First,
+to create the volume groups, run the following commands:
+
+.. prompt:: bash $
+
+ vgcreate ceph-block-0 /dev/sda
+ vgcreate ceph-block-1 /dev/sdb
+ vgcreate ceph-block-2 /dev/sdc
+ vgcreate ceph-block-3 /dev/sdd
+
+Next, to create the logical volumes for ``block``, run the following commands:
+
+.. prompt:: bash $
+
+ lvcreate -l 100%FREE -n block-0 ceph-block-0
+ lvcreate -l 100%FREE -n block-1 ceph-block-1
+ lvcreate -l 100%FREE -n block-2 ceph-block-2
+ lvcreate -l 100%FREE -n block-3 ceph-block-3
+
+Because there are four HDDs, there will be four OSDs. Supposing that there is a
+200GB SSD in ``/dev/sdx``, we can create four 50GB logical volumes by running
+the following commands:
+
+.. prompt:: bash $
+
+ vgcreate ceph-db-0 /dev/sdx
+ lvcreate -L 50GB -n db-0 ceph-db-0
+ lvcreate -L 50GB -n db-1 ceph-db-0
+ lvcreate -L 50GB -n db-2 ceph-db-0
+ lvcreate -L 50GB -n db-3 ceph-db-0
+
+Finally, to create the four OSDs, run the following commands:
+
+.. prompt:: bash $
+
+ ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
+ ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
+ ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
+ ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
+
+After this procedure is finished, there should be four OSDs, ``block`` should
+be on the four HDDs, and each OSD should have a 50GB logical volume
+(specifically, a DB device) on the shared SSD.
+
+Sizing
+======
+
+When using a :ref:`mixed spinning-and-solid-drive setup
+<bluestore-mixed-device-config>`, it is important to make a large enough
+``block.db`` logical volume for BlueStore. The logical volumes associated with
+``block.db`` should be *as large as possible*.
+
+It is generally recommended that the size of ``block.db`` be somewhere between
+1% and 4% of the size of ``block``. For RGW workloads, it is recommended that
+the ``block.db`` be at least 4% of the ``block`` size, because RGW makes heavy
+use of ``block.db`` to store metadata (in particular, omap keys). For example,
+if the ``block`` size is 1TB, then ``block.db`` should have a size of at least
+40GB. For RBD workloads, however, ``block.db`` usually needs no more than 1% to
+2% of the ``block`` size.
+
+In older releases, internal level sizes are such that the DB can fully utilize
+only those specific partition / logical volume sizes that correspond to sums of
+L0, L0+L1, L1+L2, and so on--that is, given default settings, sizes of roughly
+3GB, 30GB, 300GB, and so on. Most deployments do not substantially benefit from
+sizing that accommodates L3 and higher, though DB compaction can be facilitated
+by doubling these figures to 6GB, 60GB, and 600GB.
+
+Improvements in Nautilus 14.2.12, Octopus 15.2.6, and subsequent releases allow
+for better utilization of arbitrarily-sized DB devices. Moreover, the Pacific
+release brings experimental dynamic-level support. Because of these advances,
+users of older releases might want to plan ahead by provisioning larger DB
+devices today so that the benefits of scale can be realized when upgrades are
+made in the future.
+
+When *not* using a mix of fast and slow devices, there is no requirement to
+create separate logical volumes for ``block.db`` or ``block.wal``. BlueStore
+will automatically colocate these devices within the space of ``block``.
+
+Automatic Cache Sizing
+======================
+
+BlueStore can be configured to automatically resize its caches, provided that
+certain conditions are met: TCMalloc must be configured as the memory allocator
+and the ``bluestore_cache_autotune`` configuration option must be enabled (note
+that it is currently enabled by default). When automatic cache sizing is in
+effect, BlueStore attempts to keep OSD heap-memory usage under a certain target
+size (as determined by ``osd_memory_target``). This approach makes use of a
+best-effort algorithm and caches do not shrink smaller than the size defined by
+the value of ``osd_memory_cache_min``. Cache ratios are selected in accordance
+with a hierarchy of priorities. But if priority information is not available,
+the values specified in the ``bluestore_cache_meta_ratio`` and
+``bluestore_cache_kv_ratio`` options are used as fallback cache ratios.
+
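+For example, the memory target can be adjusted centrally at runtime (a brief
+sketch; the 6 GiB value below is only illustrative):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_memory_target 6442450944
+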
+.. confval:: bluestore_cache_autotune
+.. confval:: osd_memory_target
+.. confval:: bluestore_cache_autotune_interval
+.. confval:: osd_memory_base
+.. confval:: osd_memory_expected_fragmentation
+.. confval:: osd_memory_cache_min
+.. confval:: osd_memory_cache_resize_interval
+
+
+Manual Cache Sizing
+===================
+
+The amount of memory consumed by each OSD to be used for its BlueStore cache is
+determined by the ``bluestore_cache_size`` configuration option. If that option
+has not been specified (that is, if it remains at 0), then Ceph uses a
+different configuration option to determine the default memory budget:
+``bluestore_cache_size_hdd`` if the primary device is an HDD, or
+``bluestore_cache_size_ssd`` if the primary device is an SSD.
+
+BlueStore and the rest of the Ceph OSD daemon make every effort to work within
+this memory budget. Note that in addition to the configured cache size, there
+is also memory consumed by the OSD itself. There is additional utilization due
+to memory fragmentation and other allocator overhead.
+
+The configured cache-memory budget can be used to store the following types of
+things:
+
+* Key/Value metadata (that is, RocksDB's internal cache)
+* BlueStore metadata
+* BlueStore data (that is, recently read or recently written object data)
+
+Cache memory usage is governed by the configuration options
+``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``. The fraction
+of the cache that is reserved for data is governed by both the effective
+BlueStore cache size (which depends on the relevant
+``bluestore_cache_size[_ssd|_hdd]`` option and the device class of the primary
+device) and the "meta" and "kv" ratios. This data fraction can be calculated
+with the following formula: ``<effective_cache_size> * (1 -
+bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``.
+
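+For example, assuming an effective cache size of 1 GiB and hypothetical
+ratios ``bluestore_cache_meta_ratio = 0.45`` and
+``bluestore_cache_kv_ratio = 0.45``, the fraction of the cache reserved
+for data would be ``1 GiB * (1 - 0.45 - 0.45) = 0.1 GiB``.
+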
+.. confval:: bluestore_cache_size
+.. confval:: bluestore_cache_size_hdd
+.. confval:: bluestore_cache_size_ssd
+.. confval:: bluestore_cache_meta_ratio
+.. confval:: bluestore_cache_kv_ratio
+
+Checksums
+=========
+
+BlueStore checksums all metadata and all data written to disk. Metadata
+checksumming is handled by RocksDB and uses the `crc32c` algorithm. By
+contrast, data checksumming is handled by BlueStore and can use either
+`crc32c`, `xxhash32`, or `xxhash64`. Nonetheless, `crc32c` is the default
+checksum algorithm and it is suitable for most purposes.
+
+Full data checksumming increases the amount of metadata that BlueStore must
+store and manage. Whenever possible (for example, when clients hint that data
+is written and read sequentially), BlueStore will checksum larger blocks. In
+many cases, however, it must store a checksum value (usually 4 bytes) for every
+4 KB block of data.
+
+It is possible to obtain a smaller checksum value by truncating the checksum to
+one or two bytes and reducing the metadata overhead. A drawback of this
+approach is that it increases the probability of a random error going
+undetected: about one in four billion given a 32-bit (4 byte) checksum, 1 in
+65,536 given a 16-bit (2 byte) checksum, and 1 in 256 given an 8-bit (1 byte)
+checksum. To use the smaller checksum values, select `crc32c_16` or `crc32c_8`
+as the checksum algorithm.
+
+The *checksum algorithm* can be specified either via a per-pool ``csum_type``
+configuration option or via the global configuration option. For example:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> csum_type <algorithm>
+
+.. confval:: bluestore_csum_type
+
+Inline Compression
+==================
+
+BlueStore supports inline compression using `snappy`, `zlib`, `lz4`, or `zstd`.
+
+Whether data in BlueStore is compressed is determined by two factors: (1) the
+*compression mode* and (2) any client hints associated with a write operation.
+The compression modes are as follows:
+
+* **none**: Never compress data.
+* **passive**: Do not compress data unless the write operation has a
+ *compressible* hint set.
+* **aggressive**: Do compress data unless the write operation has an
+ *incompressible* hint set.
+* **force**: Try to compress data no matter what.
+
+For more information about the *compressible* and *incompressible* I/O hints,
+see :c:func:`rados_set_alloc_hint`.
+
+Note that data in Bluestore will be compressed only if the data chunk will be
+sufficiently reduced in size (as determined by the ``bluestore compression
+required ratio`` setting). No matter which compression modes have been used, if
+the data chunk is too big, then it will be discarded and the original
+(uncompressed) data will be stored instead. For example, if ``bluestore
+compression required ratio`` is set to ``.7``, then data compression will take
+place only if the size of the compressed data is no more than 70% of the size
+of the original data.
+
+The *compression mode*, *compression algorithm*, *compression required ratio*,
+*min blob size*, and *max blob size* settings can be specified either via a
+per-pool property or via a global config option. To specify pool properties,
+run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> compression_algorithm <algorithm>
+ ceph osd pool set <pool-name> compression_mode <mode>
+ ceph osd pool set <pool-name> compression_required_ratio <ratio>
+ ceph osd pool set <pool-name> compression_min_blob_size <size>
+ ceph osd pool set <pool-name> compression_max_blob_size <size>
+
+.. confval:: bluestore_compression_algorithm
+.. confval:: bluestore_compression_mode
+.. confval:: bluestore_compression_required_ratio
+.. confval:: bluestore_compression_min_blob_size
+.. confval:: bluestore_compression_min_blob_size_hdd
+.. confval:: bluestore_compression_min_blob_size_ssd
+.. confval:: bluestore_compression_max_blob_size
+.. confval:: bluestore_compression_max_blob_size_hdd
+.. confval:: bluestore_compression_max_blob_size_ssd
+
+.. _bluestore-rocksdb-sharding:
+
+RocksDB Sharding
+================
+
+BlueStore maintains several types of internal key-value data, all of which are
+stored in RocksDB. Each data type in BlueStore is assigned a unique prefix.
+Prior to the Pacific release, all key-value data was stored in a single RocksDB
+column family: 'default'. In Pacific and later releases, however, BlueStore can
+divide key-value data into several RocksDB column families. BlueStore achieves
+better caching and more precise compaction when keys are similar: specifically,
+when keys have similar access frequency, similar modification frequency, and a
+similar lifetime. Under such conditions, performance is improved and less disk
+space is required during compaction (because each column family is smaller and
+is able to compact independently of the others).
+
+OSDs deployed in Pacific or later releases use RocksDB sharding by default.
+However, if Ceph has been upgraded to Pacific or a later version from a
+previous version, sharding is disabled on any OSDs that were created before
+Pacific.
+
+To enable sharding and apply the Pacific defaults to a specific OSD, stop the
+OSD and run the following command:
+
+ .. prompt:: bash #
+
+ ceph-bluestore-tool \
+ --path <data path> \
+ --sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
+ reshard
+
+.. confval:: bluestore_rocksdb_cf
+.. confval:: bluestore_rocksdb_cfs
+
+Throttling
+==========
+
+.. confval:: bluestore_throttle_bytes
+.. confval:: bluestore_throttle_deferred_bytes
+.. confval:: bluestore_throttle_cost_per_io
+.. confval:: bluestore_throttle_cost_per_io_hdd
+.. confval:: bluestore_throttle_cost_per_io_ssd
+
+SPDK Usage
+==========
+
+To use the SPDK driver for NVMe devices, you must first prepare your system.
+See `SPDK document`__.
+
+.. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples
+
+SPDK offers a script that will configure the device automatically. Run this
+script with root permissions:
+
+.. prompt:: bash $
+
+ sudo src/spdk/scripts/setup.sh
+
+You will need to specify the NVMe device's device selector, prefixed with
+``spdk:``, as the value of ``bluestore_block_path``.
+
+In the following example, you first find the device selector of an Intel NVMe
+SSD by running the following command:
+
+.. prompt:: bash $
+
+   lspci -mm -n -D -d 8086:0953
+
+The form of the device selector is either ``DDDD:BB:DD.FF`` or
+``DDDD.BB.DD.FF``.
+
+Next, supposing that ``0000:01:00.0`` is the device selector found in the
+output of the ``lspci`` command, you can specify the device selector by running
+the following command::
+
+ bluestore_block_path = "spdk:trtype:pcie traddr:0000:01:00.0"
+
+You may also specify a remote NVMeoF target over the TCP transport, as in the
+following example::
+
+ bluestore_block_path = "spdk:trtype:tcp traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
+
+To run multiple SPDK instances per node, you must make sure each instance uses
+its own DPDK memory by specifying for each instance the amount of DPDK memory
+(in MB) that the instance will use.
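+
+For example, this can be done in the Ceph configuration file, using the
+``bluestore_spdk_mem`` option (the amount of DPDK memory, in MB); the value
+below is only illustrative::
+
+    bluestore_spdk_mem = 512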
+
+In most cases, a single device can be used for data, DB, and WAL. We describe
+this strategy as *colocating* these components. Be sure to enter the settings
+below to ensure that all I/Os are issued through SPDK::
+
+ bluestore_block_db_path = ""
+ bluestore_block_db_size = 0
+ bluestore_block_wal_path = ""
+ bluestore_block_wal_size = 0
+
+If these settings are not entered, then the current implementation will
+populate the SPDK map files with kernel file system symbols and will use the
+kernel driver to issue DB/WAL I/Os.
+
+Minimum Allocation Size
+=======================
+
+There is a configured minimum amount of storage that BlueStore allocates on an
+underlying storage device. In practice, this is the least amount of capacity
+that even a tiny RADOS object can consume on each OSD's primary device. The
+configuration option in question--:confval:`bluestore_min_alloc_size`--derives
+its value from the value of either :confval:`bluestore_min_alloc_size_hdd` or
+:confval:`bluestore_min_alloc_size_ssd`, depending on the OSD's ``rotational``
+attribute. Thus if an OSD is created on an HDD, BlueStore is initialized with
+the current value of :confval:`bluestore_min_alloc_size_hdd`; but with SSD OSDs
+(including NVMe devices), Bluestore is initialized with the current value of
+:confval:`bluestore_min_alloc_size_ssd`.
+
+In Mimic and earlier releases, the default values were 64KB for rotational
+media (HDD) and 16KB for non-rotational media (SSD). The Octopus release
+changed the default value for non-rotational media (SSD) to 4KB, and the
+Pacific release changed the default value for rotational media (HDD) to 4KB.
+
+These changes were driven by space amplification that was experienced by Ceph
+RADOS GateWay (RGW) deployments that hosted large numbers of small files
+(S3/Swift objects).
+
+For example, when an RGW client stores a 1 KB S3 object, that object is written
+to a single RADOS object. In accordance with the default
+:confval:`min_alloc_size` value, 4 KB of underlying drive space is allocated.
+This means that roughly 3 KB (that is, 4 KB minus 1 KB) is allocated but never
+used: this corresponds to 300% overhead or 25% efficiency. Similarly, a 5 KB
+user object will be stored as two RADOS objects, a 4 KB RADOS object and a 1 KB
+RADOS object, with the result that 3 KB (that is, 8 KB allocated minus 5 KB
+used) of device capacity is stranded. In this case, however, the overhead
+percentage is much smaller. Think of this in terms
+of the remainder from a modulus operation. The overhead *percentage* thus
+decreases rapidly as object size increases.
+
+There is an additional subtlety that is easily missed: the amplification
+phenomenon just described takes place for *each* replica. For example, when
+using the default of three copies of data (3R), a 1 KB S3 object actually
+strands roughly 9 KB of storage device capacity. If erasure coding (EC) is used
+instead of replication, the amplification might be even higher: for a ``k=4,
+m=2`` pool, our 1 KB S3 object allocates 24 KB (that is, 4 KB multiplied by 6)
+of device capacity.
+
+When an RGW bucket pool contains many relatively large user objects, the effect
+of this phenomenon is often negligible. However, with deployments that can
+expect a significant fraction of relatively small user objects, the effect
+should be taken into consideration.
+
+The 4KB default value aligns well with conventional HDD and SSD devices.
+However, certain novel coarse-IU (Indirection Unit) QLC SSDs perform and wear
+best when :confval:`bluestore_min_alloc_size_ssd` is specified at OSD creation
+to match the device's IU: this might be 8KB, 16KB, or even 64KB. These novel
+storage drives can achieve read performance that is competitive with that of
+conventional TLC SSDs and write performance that is faster than that of HDDs,
+with higher density and lower cost than TLC SSDs.
+
+Note that when creating OSDs on these novel devices, one must be careful to
+apply the non-default value only to appropriate devices, and not to
+conventional HDD and SSD devices. Errors can be avoided through careful
+ordering of OSD creation, with custom OSD device classes, and especially by
+the use of central configuration *masks*, as shown in the example below.
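+
+For example, if the coarse-IU drives have been given a custom device class
+(here the hypothetical class ``qlc``), a central configuration mask can
+restrict the non-default value to those devices before their OSDs are created:
+
+.. prompt:: bash #
+
+   ceph config set osd/class:qlc bluestore_min_alloc_size_ssd 16384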
+
+In Quincy and later releases, you can use the
+:confval:`bluestore_use_optimal_io_size_for_min_alloc_size` option to allow
+automatic discovery of the correct value as each OSD is created. Note that the
+use of ``bcache``, ``OpenCAS``, ``dmcrypt``, ``ATA over Ethernet``, ``iSCSI``, or
+other device-layering and abstraction technologies might confound the
+determination of correct values. Moreover, OSDs deployed on top of VMware
+storage have sometimes been found to report a ``rotational`` attribute that
+does not match the underlying hardware.
+
+We suggest inspecting such OSDs at startup via logs and admin sockets in order
+to ensure that their behavior is correct. Be aware that this kind of inspection
+might not work as expected with older kernels. To check for this issue,
+examine the presence and value of ``/sys/block/<drive>/queue/optimal_io_size``.
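+
+For example, to check the value reported for a given drive (substitute the
+drive name in question for ``sdb``):
+
+.. prompt:: bash #
+
+   cat /sys/block/sdb/queue/optimal_io_size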
+
+.. note:: When running Reef or a later Ceph release, the ``min_alloc_size``
+ baked into each OSD is conveniently reported by ``ceph osd metadata``.
+
+To inspect a specific OSD, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd metadata osd.1701 | egrep rotational\|alloc
+
+This space amplification might manifest as an unusually high ratio of raw to
+stored data as reported by ``ceph df``. There might also be ``%USE`` / ``VAR``
+values reported by ``ceph osd df`` that are unusually high in comparison to
+other, ostensibly identical, OSDs. Finally, there might be unexpected balancer
+behavior in pools that use OSDs that have mismatched ``min_alloc_size`` values.
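+
+To look for these symptoms, compare the overall raw-versus-stored ratio and
+the per-OSD utilization figures:
+
+.. prompt:: bash #
+
+   ceph df
+   ceph osd df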
+
+This BlueStore attribute takes effect *only* at OSD creation; if the attribute
+is changed later, a specific OSD's behavior will not change unless and until
+the OSD is destroyed and redeployed with the appropriate option value(s).
+Upgrading to a later Ceph release will *not* change the value used by OSDs that
+were deployed under older releases or with other settings.
+
+.. confval:: bluestore_min_alloc_size
+.. confval:: bluestore_min_alloc_size_hdd
+.. confval:: bluestore_min_alloc_size_ssd
+.. confval:: bluestore_use_optimal_io_size_for_min_alloc_size
+
+DSA (Data Streaming Accelerator) Usage
+======================================
+
+If you want to use the DML library to drive the DSA device for offloading
+read/write operations on persistent memory (PMEM) in BlueStore, you need to
+install `DML`_ and the `idxd-config`_ library. This will work only on machines
+that have a SPR (Sapphire Rapids) CPU.
+
+.. _dml: https://github.com/intel/dml
+.. _idxd-config: https://github.com/intel/idxd-config
+
+After installing the DML software, configure the shared work queues (WQs) with
+reference to the following WQ configuration example:
+
+.. prompt:: bash $
+
+ accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="myapp1" --priority=10 --block-on-fault=1 dsa0/wq0.1
+ accel-config config-engine dsa0/engine0.1 --group-id=1
+ accel-config enable-device dsa0
+ accel-config enable-wq dsa0/wq0.1
diff --git a/doc/rados/configuration/ceph-conf.rst b/doc/rados/configuration/ceph-conf.rst
new file mode 100644
index 000000000..d8d5c9d03
--- /dev/null
+++ b/doc/rados/configuration/ceph-conf.rst
@@ -0,0 +1,715 @@
+.. _configuring-ceph:
+
+==================
+ Configuring Ceph
+==================
+
+When Ceph services start, the initialization process activates a set of
+daemons that run in the background. A :term:`Ceph Storage Cluster` runs at
+least three types of daemons:
+
+- :term:`Ceph Monitor` (``ceph-mon``)
+- :term:`Ceph Manager` (``ceph-mgr``)
+- :term:`Ceph OSD Daemon` (``ceph-osd``)
+
+Any Ceph Storage Cluster that supports the :term:`Ceph File System` also runs
+at least one :term:`Ceph Metadata Server` (``ceph-mds``). Any Cluster that
+supports :term:`Ceph Object Storage` runs Ceph RADOS Gateway daemons
+(``radosgw``).
+
+Each daemon has a number of configuration options, and each of those options
+has a default value. Adjust the behavior of the system by changing these
+configuration options. Make sure to understand the consequences before
+overriding the default values, as it is possible to significantly degrade the
+performance and stability of your cluster. Remember that default values
+sometimes change between releases. For this reason, it is best to review the
+version of this documentation that applies to your Ceph release.
+
+Option names
+============
+
+Each of the Ceph configuration options has a unique name that consists of words
+formed with lowercase characters and connected with underscore characters
+(``_``).
+
+When option names are specified on the command line, underscore (``_``) and
+dash (``-``) characters can be used interchangeably (for example,
+``--mon-host`` is equivalent to ``--mon_host``).
+
+When option names appear in configuration files, spaces can also be used in
+place of underscores or dashes. However, for the sake of clarity and
+convenience, we suggest that you consistently use underscores, as we do
+throughout this documentation.
+
+Config sources
+==============
+
+Each Ceph daemon, process, and library pulls its configuration from one or more
+of the several sources listed below. Sources that occur later in the list
+override those that occur earlier in the list (when both are present).
+
+- the compiled-in default value
+- the monitor cluster's centralized configuration database
+- a configuration file stored on the local host
+- environment variables
+- command-line arguments
+- runtime overrides that are set by an administrator
+
+One of the first things a Ceph process does on startup is parse the
+configuration options provided via the command line, via the environment, and
+via the local configuration file. Next, the process contacts the monitor
+cluster to retrieve centrally-stored configuration for the entire cluster.
+After a complete view of the configuration is available, the startup of the
+daemon or process will commence.
+
+.. _bootstrap-options:
+
+Bootstrap options
+-----------------
+
+Bootstrap options are configuration options that affect the process's ability
+to contact the monitors, to authenticate, and to retrieve the cluster-stored
+configuration. For this reason, these options might need to be stored locally
+on the node, and set by means of a local configuration file. These options
+include the following:
+
+.. confval:: mon_host
+.. confval:: mon_host_override
+
+- :confval:`mon_dns_srv_name`
+- :confval:`mon_data`, :confval:`osd_data`, :confval:`mds_data`,
+ :confval:`mgr_data`, and similar options that define which local directory
+ the daemon stores its data in.
+- :confval:`keyring`, :confval:`keyfile`, and/or :confval:`key`, which can be
+ used to specify the authentication credential to use to authenticate with the
+ monitor. Note that in most cases the default keyring location is in the data
+ directory specified above.
+
+In most cases, there is no reason to modify the default values of these
+options. However, there is one exception: the :confval:`mon_host` option that
+identifies the addresses of the cluster's monitors. When :ref:`DNS is used to
+identify monitors<mon-dns-lookup>`, a local Ceph configuration file can be
+avoided entirely.
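+
+A minimal local configuration file therefore often contains nothing more than
+the monitor addresses (the addresses below are placeholders):
+
+.. code-block:: ini
+
+    [global]
+    mon_host = 10.0.0.1,10.0.0.2,10.0.0.3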
+
+
+Skipping monitor config
+-----------------------
+
+The option ``--no-mon-config`` can be passed to any command in order to skip
+the step that retrieves configuration information from the cluster's monitors.
+Skipping this retrieval step can be useful in cases where configuration is
+managed entirely via configuration files, or when maintenance activity needs to
+be done but the monitor cluster is down.
+
+.. _ceph-conf-file:
+
+Configuration sections
+======================
+
+Each of the configuration options associated with a single process or daemon
+has a single value. However, the values for a configuration option can vary
+across daemon types, and can vary even across different daemons of the same
+type. Ceph options that are stored in the monitor configuration database or in
+local configuration files are grouped into sections |---| so-called "configuration
+sections" |---| to indicate which daemons or clients they apply to.
+
+
+These sections include the following:
+
+.. confsec:: global
+
+ Settings under ``global`` affect all daemons and clients
+ in a Ceph Storage Cluster.
+
+ :example: ``log_file = /var/log/ceph/$cluster-$type.$id.log``
+
+.. confsec:: mon
+
+ Settings under ``mon`` affect all ``ceph-mon`` daemons in
+ the Ceph Storage Cluster, and override the same setting in
+ ``global``.
+
+ :example: ``mon_cluster_log_to_syslog = true``
+
+.. confsec:: mgr
+
+ Settings in the ``mgr`` section affect all ``ceph-mgr`` daemons in
+ the Ceph Storage Cluster, and override the same setting in
+ ``global``.
+
+ :example: ``mgr_stats_period = 10``
+
+.. confsec:: osd
+
+ Settings under ``osd`` affect all ``ceph-osd`` daemons in
+ the Ceph Storage Cluster, and override the same setting in
+ ``global``.
+
+ :example: ``osd_op_queue = wpq``
+
+.. confsec:: mds
+
+ Settings in the ``mds`` section affect all ``ceph-mds`` daemons in
+ the Ceph Storage Cluster, and override the same setting in
+ ``global``.
+
+ :example: ``mds_cache_memory_limit = 10G``
+
+.. confsec:: client
+
+ Settings under ``client`` affect all Ceph clients
+ (for example, mounted Ceph File Systems, mounted Ceph Block Devices)
+ as well as RADOS Gateway (RGW) daemons.
+
+ :example: ``objecter_inflight_ops = 512``
+
+
+Configuration sections can also specify an individual daemon or client name. For example,
+``mon.foo``, ``osd.123``, and ``client.smith`` are all valid section names.
+
+
+Any given daemon will draw its settings from the global section, the daemon- or
+client-type section, and the section sharing its name. Settings in the
+most-specific section take precedence: for example, if the same option is
+specified in :confsec:`global`, :confsec:`mon`, and ``mon.foo`` in the same
+source (that is, in the same configuration file), the ``mon.foo`` setting will
+be used.
+
+If multiple values of the same configuration option are specified in the same
+section, the last value specified takes precedence.
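+
+For example, in the following snippet the effective value of ``debug_ms`` is
+``10``:
+
+.. code-block:: ini
+
+    [mon]
+    debug_ms = 1
+    debug_ms = 10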
+
+Note that values from the local configuration file always take precedence over
+values from the monitor configuration database, regardless of the section in
+which they appear.
+
+.. _ceph-metavariables:
+
+Metavariables
+=============
+
+Metavariables dramatically simplify Ceph storage cluster configuration. When a
+metavariable is set in a configuration value, Ceph expands the metavariable at
+the time the configuration value is used. In this way, Ceph metavariables
+behave similarly to the way that variable expansion works in the Bash shell.
+
+Ceph supports the following metavariables:
+
+.. describe:: $cluster
+
+ Expands to the Ceph Storage Cluster name. Useful when running
+ multiple Ceph Storage Clusters on the same hardware.
+
+ :example: ``/etc/ceph/$cluster.keyring``
+ :default: ``ceph``
+
+.. describe:: $type
+
+ Expands to a daemon or process type (for example, ``mds``, ``osd``, or ``mon``)
+
+ :example: ``/var/lib/ceph/$type``
+
+.. describe:: $id
+
+ Expands to the daemon or client identifier. For
+ ``osd.0``, this would be ``0``; for ``mds.a``, it would
+ be ``a``.
+
+ :example: ``/var/lib/ceph/$type/$cluster-$id``
+
+.. describe:: $host
+
+ Expands to the host name where the process is running.
+
+.. describe:: $name
+
+ Expands to ``$type.$id``.
+
+ :example: ``/var/run/ceph/$cluster-$name.asok``
+
+.. describe:: $pid
+
+ Expands to daemon pid.
+
+ :example: ``/var/run/ceph/$cluster-$name-$pid.asok``
+
+
+Ceph configuration file
+=======================
+
+On startup, Ceph processes search for a configuration file in the
+following locations:
+
+#. ``$CEPH_CONF`` (that is, the path following the ``$CEPH_CONF``
+ environment variable)
+#. ``-c path/path`` (that is, the ``-c`` command line argument)
+#. ``/etc/ceph/$cluster.conf``
+#. ``~/.ceph/$cluster.conf``
+#. ``./$cluster.conf`` (that is, in the current working directory)
+#. On FreeBSD systems only, ``/usr/local/etc/ceph/$cluster.conf``
+
+Here ``$cluster`` is the cluster's name (default: ``ceph``).
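+
+For example, a non-default configuration file can be selected either through
+the environment or on the command line (the path below is only illustrative):
+
+.. prompt:: bash $
+
+   CEPH_CONF=/opt/ceph/ceph.conf ceph status
+   ceph -c /opt/ceph/ceph.conf status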
+
+The Ceph configuration file uses an ``ini`` style syntax. You can add comment
+text after a number sign (#) or a semicolon (;). For example:
+
+.. code-block:: ini
+
+    # <--A number sign (#) precedes a comment.
+    ; A comment may be anything.
+    # Comments always follow a semicolon (;) or a number sign (#) on each line.
+ # The end of the line terminates a comment.
+ # We recommend that you provide comments in your configuration file(s).
+
+
+.. _ceph-conf-settings:
+
+Config file section names
+-------------------------
+
+The configuration file is divided into sections. Each section must begin with a
+valid configuration section name (see `Configuration sections`_, above) that is
+surrounded by square brackets. For example:
+
+.. code-block:: ini
+
+ [global]
+ debug_ms = 0
+
+ [osd]
+ debug_ms = 1
+
+ [osd.1]
+ debug_ms = 10
+
+ [osd.2]
+ debug_ms = 10
+
+Config file option values
+-------------------------
+
+The value of a configuration option is a string. If the string is too long to
+fit on a single line, you can put a backslash (``\``) at the end of the line
+and the backslash will act as a line continuation marker. In such a case, the
+value of the option will be the string after ``=`` in the current line,
+combined with the string in the next line. Here is an example::
+
+ [global]
+ foo = long long ago\
+ long ago
+
+In this example, the value of the "``foo``" option is "``long long ago long
+ago``".
+
+An option value typically ends with either a newline or a comment. For
+example:
+
+.. code-block:: ini
+
+ [global]
+ obscure_one = difficult to explain # I will try harder in next release
+ simpler_one = nothing to explain
+
+In this example, the value of the ``obscure_one`` option is "``difficult to
+explain``" and the value of the ``simpler_one`` option is "``nothing to
+explain``".
+
+When an option value contains spaces, it can be enclosed within single quotes
+or double quotes in order to make its scope clear and in order to make sure
+that the first space in the value is not interpreted as the end of the value.
+For example:
+
+.. code-block:: ini
+
+ [global]
+ line = "to be, or not to be"
+
+In option values, there are four characters that are treated as escape
+characters: ``=``, ``#``, ``;`` and ``[``. They are permitted to occur in an
+option value only if they are immediately preceded by the backslash character
+(``\``). For example:
+
+.. code-block:: ini
+
+ [global]
+ secret = "i love \# and \["
+
+Each configuration option falls under one of the following types:
+
+.. describe:: int
+
+ 64-bit signed integer. Some SI suffixes are supported, such as "K", "M",
+ "G", "T", "P", and "E" (meaning, respectively, 10\ :sup:`3`, 10\ :sup:`6`,
+ 10\ :sup:`9`, etc.). "B" is the only supported unit string. Thus "1K", "1M",
+ "128B" and "-1" are all valid option values. When a negative value is
+ assigned to a threshold option, this can indicate that the option is
+ "unlimited" -- that is, that there is no threshold or limit in effect.
+
+ :example: ``42``, ``-1``
+
+.. describe:: uint
+
+ This differs from ``integer`` only in that negative values are not
+ permitted.
+
+ :example: ``256``, ``0``
+
+.. describe:: str
+
+ A string encoded in UTF-8. Certain characters are not permitted. Reference
+ the above notes for the details.
+
+ :example: ``"hello world"``, ``"i love \#"``, ``yet-another-name``
+
+.. describe:: boolean
+
+ Typically either of the two values ``true`` or ``false``. However, any
+ integer is permitted: "0" implies ``false``, and any non-zero value implies
+ ``true``.
+
+ :example: ``true``, ``false``, ``1``, ``0``
+
+.. describe:: addr
+
+ A single address, optionally prefixed with ``v1``, ``v2`` or ``any`` for the
+ messenger protocol. If no prefix is specified, the ``v2`` protocol is used.
+ For more details, see :ref:`address_formats`.
+
+ :example: ``v1:1.2.3.4:567``, ``v2:1.2.3.4:567``, ``1.2.3.4:567``, ``2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::567``, ``[::1]:6789``
+
+.. describe:: addrvec
+
+ A set of addresses separated by ",". The addresses can be optionally quoted
+ with ``[`` and ``]``.
+
+   :example: ``[v1:1.2.3.4:567,v2:1.2.3.4:568]``, ``v1:1.2.3.4:567,v1:1.2.3.14:567``, ``[2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::567], [2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::568]``
+
+.. describe:: uuid
+
+ The string format of a uuid defined by `RFC4122
+ <https://www.ietf.org/rfc/rfc4122.txt>`_. Certain variants are also
+ supported: for more details, see `Boost document
+ <https://www.boost.org/doc/libs/1_74_0/libs/uuid/doc/uuid.html#String%20Generator>`_.
+
+ :example: ``f81d4fae-7dec-11d0-a765-00a0c91e6bf6``
+
+.. describe:: size
+
+ 64-bit unsigned integer. Both SI prefixes and IEC prefixes are supported.
+ "B" is the only supported unit string. Negative values are not permitted.
+
+ :example: ``1Ki``, ``1K``, ``1KiB`` and ``1B``.
+
+.. describe:: secs
+
+ Denotes a duration of time. The default unit of time is the second.
+ The following units of time are supported:
+
+ * second: ``s``, ``sec``, ``second``, ``seconds``
+ * minute: ``m``, ``min``, ``minute``, ``minutes``
+ * hour: ``hs``, ``hr``, ``hour``, ``hours``
+ * day: ``d``, ``day``, ``days``
+ * week: ``w``, ``wk``, ``week``, ``weeks``
+ * month: ``mo``, ``month``, ``months``
+ * year: ``y``, ``yr``, ``year``, ``years``
+
+ :example: ``1 m``, ``1m`` and ``1 week``
+
+.. _ceph-conf-database:
+
+Monitor configuration database
+==============================
+
+The monitor cluster manages a database of configuration options that can be
+consumed by the entire cluster. This allows for streamlined central
+configuration management of the entire system. For ease of administration and
+transparency, the vast majority of configuration options can and should be
+stored in this database.
+
+Some settings might need to be stored in local configuration files because they
+affect the ability of the process to connect to the monitors, to authenticate,
+and to fetch configuration information. In most cases this applies only to the
+``mon_host`` option. This issue can be avoided by using :ref:`DNS SRV
+records<mon-dns-lookup>`.
+
+Sections and masks
+------------------
+
+Configuration options stored by the monitor can be stored in a global section,
+in a daemon-type section, or in a specific daemon section. In this respect,
+they are no different from the options in a configuration file.
+
+In addition, options may have a *mask* associated with them to further restrict
+which daemons or clients the option applies to. Masks take two forms:
+
+#. ``type:location`` where ``type`` is a CRUSH property like ``rack`` or
+ ``host``, and ``location`` is a value for that property. For example,
+ ``host:foo`` would limit the option only to daemons or clients
+ running on a particular host.
+#. ``class:device-class`` where ``device-class`` is the name of a CRUSH
+ device class (for example, ``hdd`` or ``ssd``). For example,
+ ``class:ssd`` would limit the option only to OSDs backed by SSDs.
+ (This mask has no effect on non-OSD daemons or clients.)
+
+In commands that specify a configuration option, the argument of the option (in
+the following examples, this is the "who" string) may be a section name, a
+mask, or a combination of both separated by a slash character (``/``). For
+example, ``osd/rack:foo`` would refer to all OSD daemons in the ``foo`` rack.
+
+When configuration options are shown, the section name and mask are presented
+in separate fields or columns to make them more readable.
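+
+For example, the following commands apply an option to all OSDs on a given
+host and to all SSD-backed OSDs, respectively (``foo`` is a placeholder host
+name):
+
+.. prompt:: bash $
+
+   ceph config set osd/host:foo debug_ms 1
+   ceph config set osd/class:ssd debug_ms 1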
+
+Commands
+--------
+
+The following CLI commands are used to configure the cluster:
+
+* ``ceph config dump`` dumps the entire monitor configuration
+ database for the cluster.
+
+* ``ceph config get <who>`` dumps the configuration options stored in
+ the monitor configuration database for a specific daemon or client
+ (for example, ``mds.a``).
+
+* ``ceph config get <who> <option>`` shows either a configuration value
+ stored in the monitor configuration database for a specific daemon or client
+ (for example, ``mds.a``), or, if that value is not present in the monitor
+ configuration database, the compiled-in default value.
+
+* ``ceph config set <who> <option> <value>`` specifies a configuration
+ option in the monitor configuration database.
+
+* ``ceph config show <who>`` shows the configuration for a running daemon.
+ These settings might differ from those stored by the monitors if there are
+ also local configuration files in use or if options have been overridden on
+ the command line or at run time. The source of the values of the options is
+ displayed in the output.
+
+* ``ceph config assimilate-conf -i <input file> -o <output file>`` ingests a
+ configuration file from *input file* and moves any valid options into the
+ monitor configuration database. Any settings that are unrecognized, are
+ invalid, or cannot be controlled by the monitor will be returned in an
+ abbreviated configuration file stored in *output file*. This command is
+ useful for transitioning from legacy configuration files to centralized
+ monitor-based configuration.
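+
+  For example, a legacy configuration file might be ingested as follows (the
+  output file path is only illustrative):
+
+  .. prompt:: bash $
+
+     ceph config assimilate-conf -i /etc/ceph/ceph.conf -o /etc/ceph/ceph.conf.new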
+
+Note that ``ceph config set <who> <option> <value>`` and ``ceph config get
+<who> <option>`` will not necessarily return the same values. The latter
+command will show compiled-in default values. In order to determine whether a
+configuration option is present in the monitor configuration database, run
+``ceph config dump``.
+
+Help
+====
+
+To get help for a particular option, run the following command:
+
+.. prompt:: bash $
+
+ ceph config help <option>
+
+For example:
+
+.. prompt:: bash $
+
+ ceph config help log_file
+
+::
+
+ log_file - path to log file
+ (std::string, basic)
+ Default (non-daemon):
+ Default (daemon): /var/log/ceph/$cluster-$name.log
+ Can update at runtime: false
+ See also: [log_to_stderr,err_to_stderr,log_to_syslog,err_to_syslog]
+
+or:
+
+.. prompt:: bash $
+
+ ceph config help log_file -f json-pretty
+
+::
+
+ {
+ "name": "log_file",
+ "type": "std::string",
+ "level": "basic",
+ "desc": "path to log file",
+ "long_desc": "",
+ "default": "",
+ "daemon_default": "/var/log/ceph/$cluster-$name.log",
+ "tags": [],
+ "services": [],
+ "see_also": [
+ "log_to_stderr",
+ "err_to_stderr",
+ "log_to_syslog",
+ "err_to_syslog"
+ ],
+ "enum_values": [],
+ "min": "",
+ "max": "",
+ "can_update_at_runtime": false
+ }
+
+The ``level`` property can be ``basic``, ``advanced``, or ``dev``. The ``dev``
+options are intended for use by developers, generally for testing purposes, and
+are not recommended for use by operators.
+
+.. note:: This command uses the configuration schema that is compiled into the
+ running monitors. If you have a mixed-version cluster (as might exist, for
+ example, during an upgrade), you might want to query the option schema from
+ a specific running daemon by running a command of the following form:
+
+   .. prompt:: bash $
+
+      ceph daemon <name> config help [option]
+
+Runtime Changes
+===============
+
+In most cases, Ceph permits changes to the configuration of a daemon at
+run time. This can be used for increasing or decreasing the amount of logging
+output, for enabling or disabling debug settings, and for runtime optimization.
+
+Use the ``ceph config set`` command to update configuration options. For
+example, to enable the most verbose debug log level on a specific OSD, run a
+command of the following form:
+
+.. prompt:: bash $
+
+ ceph config set osd.123 debug_ms 20
+
+.. note:: If an option has been customized in a local configuration file, the
+ `central config
+ <https://ceph.io/en/news/blog/2018/new-mimic-centralized-configuration-management/>`_
+ setting will be ignored because it has a lower priority than the local
+ configuration file.
+
+.. note:: Log levels range from 0 to 20.
+
+Override values
+---------------
+
+Options can be set temporarily by using the ``tell`` or ``daemon`` interfaces
+of the Ceph CLI. These *override* values are ephemeral, which means
+that they affect only the current instance of the daemon and revert to
+persistently configured values when the daemon restarts.
+
+Override values can be set in two ways:
+
+#. From any host, send a message to a daemon with a command of the following
+ form:
+
+ .. prompt:: bash $
+
+ ceph tell <name> config set <option> <value>
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph tell osd.123 config set debug_osd 20
+
+ The ``tell`` command can also accept a wildcard as the daemon identifier.
+ For example, to adjust the debug level on all OSD daemons, run a command of
+ the following form:
+
+ .. prompt:: bash $
+
+ ceph tell osd.* config set debug_osd 20
+
+#. On the host where the daemon is running, connect to the daemon via a socket
+ in ``/var/run/ceph`` by running a command of the following form:
+
+ .. prompt:: bash $
+
+ ceph daemon <name> config set <option> <value>
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph daemon osd.4 config set debug_osd 20
+
+.. note:: In the output of the ``ceph config show`` command, these temporary
+ values are shown to have a source of ``override``.
+
+
+Viewing runtime settings
+========================
+
+You can see the current settings specified for a running daemon with the ``ceph
+config show`` command. For example, to see the (non-default) settings for the
+daemon ``osd.0``, run the following command:
+
+.. prompt:: bash $
+
+ ceph config show osd.0
+
+To see a specific setting, run the following command:
+
+.. prompt:: bash $
+
+ ceph config show osd.0 debug_osd
+
+To see all settings (including those with default values), run the following
+command:
+
+.. prompt:: bash $
+
+ ceph config show-with-defaults osd.0
+
+You can see all settings for a daemon that is currently running by connecting
+to it on the local host via the admin socket. For example, to dump all
+current settings, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.0 config show
+
+To see non-default settings and to see where each value came from (for example,
+a config file, the monitor, or an override), run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.0 config diff
+
+To see the value of a single setting, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.0 config get debug_osd
+
+
+Changes introduced in Octopus
+=============================
+
+The Octopus release changed the way the configuration file is parsed.
+These changes are as follows:
+
+- Repeated configuration options are allowed, and no warnings will be
+ displayed. This means that the setting that comes last in the file is the one
+ that takes effect. Prior to this change, Ceph displayed warning messages when
+ lines containing duplicate options were encountered, such as::
+
+      warning line 42: 'foo' in section 'bar' redefined
+
+- Prior to Octopus, options containing invalid UTF-8 characters were ignored
+ with warning messages. But in Octopus, they are treated as fatal errors.
+- The backslash character ``\`` is used as the line-continuation marker that
+ combines the next line with the current one. Prior to Octopus, there was a
+ requirement that any end-of-line backslash be followed by a non-empty line.
+ But in Octopus, an empty line following a backslash is allowed.
+- In the configuration file, each line specifies an individual configuration
+ option. The option's name and its value are separated with ``=``, and the
+ value may be enclosed within single or double quotes. If an invalid
+ configuration is specified, we will treat it as an invalid configuration
+ file::
+
+      bad option ==== bad value
+
+- Prior to Octopus, if no section name was specified in the configuration file,
+ all options would be set as though they were within the :confsec:`global`
+ section. This approach is discouraged. Since Octopus, any configuration
+ file that has no section name must contain only a single option.
+
+.. |---| unicode:: U+2014 .. EM DASH :trim:
diff --git a/doc/rados/configuration/common.rst b/doc/rados/configuration/common.rst
new file mode 100644
index 000000000..0b373f469
--- /dev/null
+++ b/doc/rados/configuration/common.rst
@@ -0,0 +1,207 @@
+.. _ceph-conf-common-settings:
+
+Common Settings
+===============
+
+The `Hardware Recommendations`_ section provides some hardware guidelines for
+configuring a Ceph Storage Cluster. It is possible for a single :term:`Ceph
+Node` to run multiple daemons. For example, a single node with multiple drives
+usually runs one ``ceph-osd`` for each drive. Ideally, each node will be
+assigned to a particular type of process. For example, some nodes might run
+``ceph-osd`` daemons, other nodes might run ``ceph-mds`` daemons, and still
+other nodes might run ``ceph-mon`` daemons.
+
+Each node has a name. The name of a node can be found in its ``host`` setting.
+Monitors also specify a network address and port (that is, a domain name or IP
+address) that can be found in the ``addr`` setting. A basic configuration file
+typically specifies only minimal settings for each instance of monitor daemons.
+For example:
+
+
+.. code-block:: ini
+
+ [global]
+ mon_initial_members = ceph1
+ mon_host = 10.0.0.1
+
+.. important:: The ``host`` setting's value is the short name of the node. It
+ is not an FQDN. It is **NOT** an IP address. To retrieve the name of the
+ node, enter ``hostname -s`` on the command line. Unless you are deploying
+ Ceph manually, do not use ``host`` settings for anything other than initial
+ monitor setup. **DO NOT** specify the ``host`` setting under individual
+ daemons when using deployment tools like ``chef`` or ``cephadm``. Such tools
+ are designed to enter the appropriate values for you in the cluster map.
+
+
+.. _ceph-network-config:
+
+Networks
+========
+
+For more about configuring a network for use with Ceph, see the `Network
+Configuration Reference`_ .
+
+
+Monitors
+========
+
+Ceph production clusters typically provision at least three :term:`Ceph
+Monitor` daemons to ensure availability in the event of a monitor instance
+crash. A minimum of three :term:`Ceph Monitor` daemons ensures that the Paxos
+algorithm is able to determine which version of the :term:`Ceph Cluster Map` is
+the most recent. It makes this determination by consulting a majority of Ceph
+Monitors in the quorum.
+
+.. note:: You may deploy Ceph with a single monitor, but if the instance fails,
+ the lack of other monitors might interrupt data-service availability.
+
+Ceph Monitors normally listen on port ``3300`` for the new v2 protocol, and on
+port ``6789`` for the old v1 protocol.
+
+By default, Ceph expects to store monitor data on the following path::
+
+ /var/lib/ceph/mon/$cluster-$id
+
+You or a deployment tool (for example, ``cephadm``) must create the
+corresponding directory. With metavariables fully expressed and a cluster named
+"ceph", the path specified in the above example evaluates to::
+
+ /var/lib/ceph/mon/ceph-a
+
+For additional details, see the `Monitor Config Reference`_.
+
+.. _Monitor Config Reference: ../mon-config-ref
+
+
+.. _ceph-osd-config:
+
+Authentication
+==============
+
+.. versionadded:: Bobtail 0.56
+
+Authentication is explicitly enabled or disabled in the ``[global]`` section of
+the Ceph configuration file, as shown here:
+
+.. code-block:: ini
+
+    [global]
+    auth_cluster_required = cephx
+    auth_service_required = cephx
+    auth_client_required = cephx
+
+In addition, you should enable message signing. For details, see `Cephx Config
+Reference`_.
+
+.. _Cephx Config Reference: ../auth-config-ref
+
+
+.. _ceph-monitor-config:
+
+
+OSDs
+====
+
+By default, Ceph expects to store a Ceph OSD Daemon's data on the following
+path::
+
+ /var/lib/ceph/osd/$cluster-$id
+
+You or a deployment tool (for example, ``cephadm``) must create the
+corresponding directory. With metavariables fully expressed and a cluster named
+"ceph", the path specified in the above example evaluates to::
+
+ /var/lib/ceph/osd/ceph-0
+
+You can override this path using the ``osd_data`` setting. We recommend that
+you do not change the default location. To create the default directory on your
+OSD host, run the following commands:
+
+.. prompt:: bash $
+
+ ssh {osd-host}
+ sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
+
+The ``osd_data`` path ought to lead to a mount point backed by a device that
+is distinct from the device that contains the operating system and the
+daemons. To prepare such a device for use with Ceph and mount it on the
+directory you just created, run the following commands:
+
+.. prompt:: bash $
+
+ ssh {new-osd-host}
+ sudo mkfs -t {fstype} /dev/{disk}
+ sudo mount -o user_xattr /dev/{disk} /var/lib/ceph/osd/ceph-{osd-number}
+
+We recommend using the ``xfs`` file system when running :command:`mkfs`. (The
+``btrfs`` and ``ext4`` file systems are not recommended and are no longer
+tested.)
+
+For additional configuration details, see `OSD Config Reference`_.
+
+
+Heartbeats
+==========
+
+During runtime operations, Ceph OSD Daemons check up on other Ceph OSD Daemons
+and report their findings to the Ceph Monitor. This process does not require
+you to provide any settings. However, if you have network latency issues, you
+might want to modify the default settings.
+
+For additional details, see `Configuring Monitor/OSD Interaction`_.
+
+
+.. _ceph-logging-and-debugging:
+
+Logs / Debugging
+================
+
+You might sometimes encounter issues with Ceph that require you to use Ceph's
+logging and debugging features. For details on log rotation, see `Debugging and
+Logging`_.
+
+.. _Debugging and Logging: ../../troubleshooting/log-and-debug
+
+
+Example ceph.conf
+=================
+
+.. literalinclude:: demo-ceph.conf
+ :language: ini
+
+.. _ceph-runtime-config:
+
+
+
+Naming Clusters (deprecated)
+============================
+
+Each Ceph cluster has an internal name. This internal name is used as part of
+configuration, log file names, directory names, and mountpoint names. This
+name defaults to "ceph". Previous releases of Ceph allowed one to specify a
+custom name instead, for example
+"ceph2". This option was intended to facilitate the running of multiple logical
+clusters on the same physical hardware, but in practice it was rarely
+exploited. Custom cluster names should no longer be attempted. Old
+documentation might lead readers to wrongly think that unique cluster names are
+required to use ``rbd-mirror``. They are not required.
+
+Custom cluster names are now considered deprecated and the ability to deploy
+them has already been removed from some tools, although existing custom-name
+deployments continue to operate. The ability to run and manage clusters with
+custom names might be progressively removed by future Ceph releases, so **it is
+strongly recommended to deploy all new clusters with the default name "ceph"**.
+
+Some Ceph CLI commands accept a ``--cluster`` (cluster name) option. This
+option is present only for the sake of backward compatibility. New tools and
+deployments cannot be relied upon to accommodate this option.
+
+If you need to allow multiple clusters to exist on the same host, use
+:ref:`cephadm`, which uses containers to fully isolate each cluster.
+
+.. _Hardware Recommendations: ../../../start/hardware-recommendations
+.. _Network Configuration Reference: ../network-config-ref
+.. _OSD Config Reference: ../osd-config-ref
+.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
diff --git a/doc/rados/configuration/demo-ceph.conf b/doc/rados/configuration/demo-ceph.conf
new file mode 100644
index 000000000..8ba285a42
--- /dev/null
+++ b/doc/rados/configuration/demo-ceph.conf
@@ -0,0 +1,31 @@
+[global]
+fsid = {cluster-id}
+mon_initial_members = {hostname}[, {hostname}]
+mon_host = {ip-address}[, {ip-address}]
+
+#All clusters have a front-side public network.
+#If you have two network interfaces, you can configure a private / cluster
+#network for RADOS object replication, heartbeats, backfill,
+#recovery, etc.
+public_network = {network}[, {network}]
+#cluster_network = {network}[, {network}]
+
+#Clusters require authentication by default.
+auth_cluster_required = cephx
+auth_service_required = cephx
+auth_client_required = cephx
+
+#Choose reasonable number of replicas and placement groups.
+osd_journal_size = {n}
+osd_pool_default_size = {n} # Write an object n times.
+osd_pool_default_min_size = {n} # Allow writing n copies in a degraded state.
+osd_pool_default_pg_autoscale_mode = {mode} # on, off, or warn
+# Only used if autoscaling is off or warn:
+osd_pool_default_pg_num = {n}
+
+#Choose a reasonable crush leaf type.
+#0 for a 1-node cluster.
+#1 for a multi node cluster in a single rack
+#2 for a multi node, multi chassis cluster with multiple hosts in a chassis
+#3 for a multi node cluster with hosts across racks, etc.
+osd_crush_chooseleaf_type = {n}
diff --git a/doc/rados/configuration/filestore-config-ref.rst b/doc/rados/configuration/filestore-config-ref.rst
new file mode 100644
index 000000000..7aefe26b3
--- /dev/null
+++ b/doc/rados/configuration/filestore-config-ref.rst
@@ -0,0 +1,377 @@
+============================
+ Filestore Config Reference
+============================
+
+.. note:: Since the Luminous release of Ceph, Filestore has not been Ceph's
+ default storage back end. Since the Luminous release of Ceph, BlueStore has
+ been Ceph's default storage back end. However, Filestore OSDs are still
+ supported up to Quincy. Filestore OSDs are not supported in Reef. See
+ :ref:`OSD Back Ends <rados_config_storage_devices_osd_backends>`. See
+ :ref:`BlueStore Migration <rados_operations_bluestore_migration>` for
+ instructions explaining how to replace an existing Filestore back end with a
+ BlueStore back end.
+
+
+``filestore_debug_omap_check``
+
+:Description: Debugging check on synchronization. Expensive. For debugging only.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+.. index:: filestore; extended attributes
+
+Extended Attributes
+===================
+
+Extended Attributes (XATTRs) are important for Filestore OSDs. However, certain
+disadvantages can occur when the underlying file system is used for the storage
+of XATTRs: some file systems have limits on the number of bytes that can be
+stored in XATTRs, and your file system might in some cases therefore run slower
+than would an alternative method of storing XATTRs. For this reason, a method
+of storing XATTRs extrinsic to the underlying file system might improve
+performance. To implement such an extrinsic method, refer to the following
+settings.
+
+If the underlying file system has no size limit, then Ceph XATTRs are stored as
+``inline xattr``, using the XATTRs provided by the file system. But if there is
+a size limit (for example, ext4 imposes a limit of 4 KB total), then some Ceph
+XATTRs will be stored in a key/value database when the limit is reached. More
+precisely, this begins to occur when either the
+``filestore_max_inline_xattr_size`` or ``filestore_max_inline_xattrs``
+threshold is reached.
+
+
+``filestore_max_inline_xattr_size``
+
+:Description: Defines the maximum size per object of an XATTR that can be
+ stored in the file system (for example, XFS, Btrfs, ext4). The
+ specified size should not be larger than the file system can
+ handle. Using the default value of 0 instructs Filestore to use
+ the value specific to the file system.
+:Type: Unsigned 32-bit Integer
+:Required: No
+:Default: ``0``
+
+
+``filestore_max_inline_xattr_size_xfs``
+
+:Description: Defines the maximum size of an XATTR that can be stored in the
+ XFS file system. This setting is used only if
+ ``filestore_max_inline_xattr_size`` == 0.
+:Type: Unsigned 32-bit Integer
+:Required: No
+:Default: ``65536``
+
+
+``filestore_max_inline_xattr_size_btrfs``
+
+:Description: Defines the maximum size of an XATTR that can be stored in the
+ Btrfs file system. This setting is used only if
+ ``filestore_max_inline_xattr_size`` == 0.
+:Type: Unsigned 32-bit Integer
+:Required: No
+:Default: ``2048``
+
+
+``filestore_max_inline_xattr_size_other``
+
+:Description: Defines the maximum size of an XATTR that can be stored in other file systems.
+ This setting is used only if ``filestore_max_inline_xattr_size`` == 0.
+:Type: Unsigned 32-bit Integer
+:Required: No
+:Default: ``512``
+
+
+``filestore_max_inline_xattrs``
+
+:Description: Defines the maximum number of XATTRs per object that can be stored in the file system.
+ Using the default value of 0 instructs Filestore to use the value specific to the file system.
+:Type: 32-bit Integer
+:Required: No
+:Default: ``0``
+
+
+``filestore_max_inline_xattrs_xfs``
+
+:Description: Defines the maximum number of XATTRs per object that can be stored in the XFS file system.
+ This setting is used only if ``filestore_max_inline_xattrs`` == 0.
+:Type: 32-bit Integer
+:Required: No
+:Default: ``10``
+
+
+``filestore_max_inline_xattrs_btrfs``
+
+:Description: Defines the maximum number of XATTRs per object that can be stored in the Btrfs file system.
+ This setting is used only if ``filestore_max_inline_xattrs`` == 0.
+:Type: 32-bit Integer
+:Required: No
+:Default: ``10``
+
+
+``filestore_max_inline_xattrs_other``
+
+:Description: Defines the maximum number of XATTRs per object that can be stored in other file systems.
+ This setting is used only if ``filestore_max_inline_xattrs`` == 0.
+:Type: 32-bit Integer
+:Required: No
+:Default: ``2``
+
+.. index:: filestore; synchronization
+
+Synchronization Intervals
+=========================
+
+Filestore must periodically quiesce writes and synchronize the file system.
+Each synchronization creates a consistent commit point. When the commit point
+is created, Filestore is able to free all journal entries up to that point.
+More-frequent synchronization tends to reduce both synchronization time and
+the amount of data that needs to remain in the journal. Less-frequent
+synchronization allows the backing file system to coalesce small writes and
+metadata updates, potentially increasing synchronization
+efficiency but also potentially increasing tail latency.
+
+
+``filestore_max_sync_interval``
+
+:Description: Defines the maximum interval (in seconds) for synchronizing Filestore.
+:Type: Double
+:Required: No
+:Default: ``5``
+
+
+``filestore_min_sync_interval``
+
+:Description: Defines the minimum interval (in seconds) for synchronizing Filestore.
+:Type: Double
+:Required: No
+:Default: ``.01``
+
+
+.. index:: filestore; flusher
+
+Flusher
+=======
+
+The Filestore flusher forces data from large writes to be written out using
+``sync_file_range`` prior to the synchronization.
+Ideally, this action reduces the cost of the eventual synchronization. In
+practice, however, disabling ``filestore_flusher`` seems in some cases to
+improve performance.
+
+
+``filestore_flusher``
+
+:Description: Enables the Filestore flusher.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+.. deprecated:: v.65
+
+``filestore_flusher_max_fds``
+
+:Description: Defines the maximum number of file descriptors for the flusher.
+:Type: Integer
+:Required: No
+:Default: ``512``
+
+.. deprecated:: v.65
+
+``filestore_sync_flush``
+
+:Description: Enables the synchronization flusher.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+.. deprecated:: v.65
+
+``filestore_fsync_flushes_journal_data``
+
+:Description: Flushes journal data during file-system synchronization.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+.. index:: filestore; queue
+
+Queue
+=====
+
+The following settings define limits on the size of the Filestore queue:
+
+``filestore_queue_max_ops``
+
+:Description: Defines the maximum number of in-progress operations that Filestore accepts before it blocks the queueing of any new operations.
+:Type: Integer
+:Required: No. Minimal impact on performance.
+:Default: ``50``
+
+
+``filestore_queue_max_bytes``
+
+:Description: Defines the maximum number of bytes permitted per operation.
+:Type: Integer
+:Required: No
+:Default: ``100 << 20``
+
+
+.. index:: filestore; timeouts
+
+Timeouts
+========
+
+``filestore_op_threads``
+
+:Description: Defines the number of file-system operation threads that execute in parallel.
+:Type: Integer
+:Required: No
+:Default: ``2``
+
+
+``filestore_op_thread_timeout``
+
+:Description: Defines the timeout (in seconds) for a file-system operation thread.
+:Type: Integer
+:Required: No
+:Default: ``60``
+
+
+``filestore_op_thread_suicide_timeout``
+
+:Description: Defines the timeout (in seconds) for a commit operation before the commit is cancelled.
+:Type: Integer
+:Required: No
+:Default: ``180``
+
+
+.. index:: filestore; btrfs
+
+B-Tree Filesystem
+=================
+
+
+``filestore_btrfs_snap``
+
+:Description: Enables snapshots for a ``btrfs`` Filestore.
+:Type: Boolean
+:Required: No. Used only for ``btrfs``.
+:Default: ``true``
+
+
+``filestore_btrfs_clone_range``
+
+:Description: Enables cloning ranges for a ``btrfs`` Filestore.
+:Type: Boolean
+:Required: No. Used only for ``btrfs``.
+:Default: ``true``
+
+
+.. index:: filestore; journal
+
+Journal
+=======
+
+
+``filestore_journal_parallel``
+
+:Description: Enables parallel journaling, default for ``btrfs``.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``filestore_journal_writeahead``
+
+:Description: Enables write-ahead journaling, default for XFS.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``filestore_journal_trailing``
+
+:Description: Deprecated. **Never use.**
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+Misc
+====
+
+
+``filestore_merge_threshold``
+
+:Description: Defines the minimum number of files permitted in a subdirectory before the subdirectory is merged into its parent directory.
+ NOTE: A negative value means that subdirectory merging is disabled.
+:Type: Integer
+:Required: No
+:Default: ``-10``
+
+
+``filestore_split_multiple``
+
+:Description: ``(filestore_split_multiple * abs(filestore_merge_threshold) + (rand() % filestore_split_rand_factor)) * 16``
+ is the maximum number of files permitted in a subdirectory
+ before the subdirectory is split into child directories.
+
+:Type: Integer
+:Required: No
+:Default: ``2``
+
+
+``filestore_split_rand_factor``
+
+:Description: A random factor added to the split threshold to avoid
+ too many (expensive) Filestore splits occurring at the same time.
+ For details, see ``filestore_split_multiple``.
+ To change this setting for an existing OSD, it is necessary to take the OSD
+ offline before running the ``ceph-objectstore-tool apply-layout-settings`` command.
+
+:Type: Unsigned 32-bit Integer
+:Required: No
+:Default: ``20``
+
+
+``filestore_update_to``
+
+:Description: Limits automatic upgrades to a specified version of Filestore. Useful in cases in which you want to avoid upgrading to a specific version.
+:Type: Integer
+:Required: No
+:Default: ``1000``
+
+
+``filestore_blackhole``
+
+:Description: Drops any new transactions on the floor, similar to redirecting to NULL.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``filestore_dump_file``
+
+:Description: Defines the file that transaction dumps are stored on.
+:Type: Boolean
+:Required: No
+:Default: ``false``
+
+
+``filestore_kill_at``
+
+:Description: Injects a failure at the *n*\th opportunity.
+:Type: String
+:Required: No
+:Default: ``false``
+
+
+``filestore_fail_eio``
+
+:Description: Fail/Crash on EIO.
+:Type: Boolean
+:Required: No
+:Default: ``true``
diff --git a/doc/rados/configuration/general-config-ref.rst b/doc/rados/configuration/general-config-ref.rst
new file mode 100644
index 000000000..f4613456a
--- /dev/null
+++ b/doc/rados/configuration/general-config-ref.rst
@@ -0,0 +1,19 @@
+==========================
+ General Config Reference
+==========================
+
+.. confval:: admin_socket
+ :default: /var/run/ceph/$cluster-$name.asok
+.. confval:: pid_file
+.. confval:: chdir
+.. confval:: fatal_signal_handlers
+.. describe:: max_open_files
+
+ If set, when the :term:`Ceph Storage Cluster` starts, Ceph sets
+ the max open FDs at the OS level (i.e., the max # of file
+ descriptors). A suitably large value prevents Ceph Daemons from running out
+ of file descriptors.
+
+ :Type: 64-bit Integer
+ :Required: No
+ :Default: ``0``
diff --git a/doc/rados/configuration/index.rst b/doc/rados/configuration/index.rst
new file mode 100644
index 000000000..715b999d1
--- /dev/null
+++ b/doc/rados/configuration/index.rst
@@ -0,0 +1,53 @@
+===============
+ Configuration
+===============
+
+Each Ceph process, daemon, or utility draws its configuration from several
+sources on startup. Such sources can include (1) a local configuration, (2) the
+monitors, (3) the command line, and (4) environment variables.
+
+Configuration options can be set globally so that they apply (1) to all
+daemons, (2) to all daemons or services of a particular type, or (3) to only a
+specific daemon, process, or client.
+
+.. raw:: html
+
+ <table cellpadding="10"><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>Configuring the Object Store</h3>
+
+For general object store configuration, refer to the following:
+
+.. toctree::
+ :maxdepth: 1
+
+ Storage devices <storage-devices>
+ ceph-conf
+
+
+.. raw:: html
+
+ </td><td><h3>Reference</h3>
+
+To optimize the performance of your cluster, refer to the following:
+
+.. toctree::
+ :maxdepth: 1
+
+ Common Settings <common>
+ Network Settings <network-config-ref>
+ Messenger v2 protocol <msgr2>
+ Auth Settings <auth-config-ref>
+ Monitor Settings <mon-config-ref>
+ mon-lookup-dns
+ Heartbeat Settings <mon-osd-interaction>
+ OSD Settings <osd-config-ref>
+ DmClock Settings <mclock-config-ref>
+ BlueStore Settings <bluestore-config-ref>
+ FileStore Settings <filestore-config-ref>
+ Journal Settings <journal-ref>
+ Pool, PG & CRUSH Settings <pool-pg-config-ref.rst>
+ General Settings <general-config-ref>
+
+
+.. raw:: html
+
+ </td></tr></tbody></table>
diff --git a/doc/rados/configuration/journal-ref.rst b/doc/rados/configuration/journal-ref.rst
new file mode 100644
index 000000000..5ce5a5e2d
--- /dev/null
+++ b/doc/rados/configuration/journal-ref.rst
@@ -0,0 +1,39 @@
+==========================
+ Journal Config Reference
+==========================
+.. warning:: Filestore has been deprecated in the Reef release and is no longer supported.
+.. index:: journal; journal configuration
+
+Filestore OSDs use a journal for two reasons: speed and consistency. Note
+that since Luminous, the BlueStore OSD back end has been preferred and default.
+This information is provided for pre-existing OSDs and for rare situations where
+Filestore is preferred for new deployments.
+
+- **Speed:** The journal enables the Ceph OSD Daemon to commit small writes
+ quickly. Ceph writes small, random i/o to the journal sequentially, which
+ tends to speed up bursty workloads by allowing the backing file system more
+ time to coalesce writes. The Ceph OSD Daemon's journal, however, can lead
+ to spiky performance with short spurts of high-speed writes followed by
+ periods without any write progress as the file system catches up to the
+ journal.
+
+- **Consistency:** Ceph OSD Daemons require a file system interface that
+ guarantees atomic compound operations. Ceph OSD Daemons write a description
+ of the operation to the journal and apply the operation to the file system.
+ This enables atomic updates to an object (for example, placement group
+ metadata). Every few seconds--between ``filestore max sync interval`` and
+ ``filestore min sync interval``--the Ceph OSD Daemon stops writes and
+ synchronizes the journal with the file system, allowing Ceph OSD Daemons to
+ trim operations from the journal and reuse the space. On failure, Ceph
+ OSD Daemons replay the journal starting after the last synchronization
+ operation.
+
+Ceph OSD Daemons recognize the following journal settings:
+
+.. confval:: journal_dio
+.. confval:: journal_aio
+.. confval:: journal_block_align
+.. confval:: journal_max_write_bytes
+.. confval:: journal_max_write_entries
+.. confval:: journal_align_min_size
+.. confval:: journal_zero_on_create
diff --git a/doc/rados/configuration/mclock-config-ref.rst b/doc/rados/configuration/mclock-config-ref.rst
new file mode 100644
index 000000000..a338aa6da
--- /dev/null
+++ b/doc/rados/configuration/mclock-config-ref.rst
@@ -0,0 +1,699 @@
+========================
+ mClock Config Reference
+========================
+
+.. index:: mclock; configuration
+
+QoS support in Ceph is implemented using a queuing scheduler based on `the
+dmClock algorithm`_. See :ref:`dmclock-qos` section for more details.
+
+To make the usage of mclock more user-friendly and intuitive, mclock config
+profiles are introduced. The mclock profiles mask the low level details from
+users, making it easier to configure and use mclock.
+
+The following input parameters are required for a mclock profile to configure
+the QoS related parameters:
+
+* total capacity (IOPS) of each OSD (determined automatically -
+ See `OSD Capacity Determination (Automated)`_)
+
+* the max sequential bandwidth capacity (MiB/s) of each OSD -
+ See *osd_mclock_max_sequential_bandwidth_[hdd|ssd]* option
+
+* an mclock profile type to enable
+
+Using the settings in the specified profile, an OSD determines and applies the
+lower-level mclock and Ceph parameters. The parameters applied by the mclock
+profile make it possible to tune the QoS between client I/O and background
+operations in the OSD.
+
+
+.. index:: mclock; mclock clients
+
+mClock Client Types
+===================
+
+The mclock scheduler handles requests from different types of Ceph services.
+Each service can be considered a type of client from mclock's perspective.
+Depending on the type of requests handled, mclock clients are classified into
+the buckets shown in the table below:
+
++------------------------+--------------------------------------------------------------+
+| Client Type | Request Types |
++========================+==============================================================+
+| Client | I/O requests issued by external clients of Ceph |
++------------------------+--------------------------------------------------------------+
+| Background recovery | Internal recovery requests |
++------------------------+--------------------------------------------------------------+
+| Background best-effort | Internal backfill, scrub, snap trim and PG deletion requests |
++------------------------+--------------------------------------------------------------+
+
+The mclock profiles allocate parameters like reservation, weight and limit
+(see :ref:`dmclock-qos`) differently for each client type. The next sections
+describe the mclock profiles in greater detail.
+
+
+.. index:: mclock; profile definition
+
+mClock Profiles - Definition and Purpose
+========================================
+
+A mclock profile is *“a configuration setting that when applied on a running
+Ceph cluster enables the throttling of the operations(IOPS) belonging to
+different client classes (background recovery, scrub, snaptrim, client op,
+osd subop)”*.
+
+The mclock profile uses the capacity limits and the mclock profile type selected
+by the user to determine the low-level mclock resource control configuration
+parameters and apply them transparently. Additionally, other Ceph configuration
+parameters are also applied. Please see sections below for more information.
+
+The low-level mclock resource control parameters are the *reservation*,
+*limit*, and *weight* that provide control of the resource shares, as
+described in the :ref:`dmclock-qos` section.
+
+
+.. index:: mclock; profile types
+
+mClock Profile Types
+====================
+
+mclock profiles can be broadly classified into *built-in* and *custom* profiles.
+
+Built-in Profiles
+-----------------
+Users can choose between the following built-in profile types:
+
+.. note:: The values mentioned in the tables below represent the proportion
+ of the total IOPS capacity of the OSD allocated for the service type.
+
+* balanced (default)
+* high_client_ops
+* high_recovery_ops
+
+balanced (*default*)
+^^^^^^^^^^^^^^^^^^^^
+The *balanced* profile is the default mClock profile. This profile allocates
+equal reservation/priority to client operations and background recovery
+operations. Background best-effort ops are given a lower reservation and
+therefore take longer to complete when there are competing operations. This
+profile helps meet the normal/steady-state requirements of the cluster: the
+case when external client performance requirements are not critical and there
+are other background operations that still need attention within the OSD.
+
+But there might be instances that necessitate giving higher allocations to
+either client ops or recovery ops. To deal with such situations, the
+alternate built-in profiles may be enabled by following the steps mentioned
+in the next sections.
+
++------------------------+-------------+--------+-------+
+| Service Type | Reservation | Weight | Limit |
++========================+=============+========+=======+
+| client | 50% | 1 | MAX |
++------------------------+-------------+--------+-------+
+| background recovery | 50% | 1 | MAX |
++------------------------+-------------+--------+-------+
+| background best-effort | MIN | 1 | 90% |
++------------------------+-------------+--------+-------+
+
+high_client_ops
+^^^^^^^^^^^^^^^
+This profile optimizes client performance over background activities by
+allocating more reservation and limit to client operations as compared to
+background operations in the OSD. This profile, for example, may be enabled
+to provide the needed performance for I/O intensive applications for a
+sustained period of time at the cost of slower recoveries. The table shows
+the resource control parameters set by the profile:
+
++------------------------+-------------+--------+-------+
+| Service Type | Reservation | Weight | Limit |
++========================+=============+========+=======+
+| client | 60% | 2 | MAX |
++------------------------+-------------+--------+-------+
+| background recovery | 40% | 1 | MAX |
++------------------------+-------------+--------+-------+
+| background best-effort | MIN | 1 | 70% |
++------------------------+-------------+--------+-------+
+
+high_recovery_ops
+^^^^^^^^^^^^^^^^^
+This profile optimizes background recovery performance as compared to external
+clients and other background operations within the OSD. This profile, for
+example, may be enabled by an administrator temporarily to speed-up background
+recoveries during non-peak hours. The table shows the resource control
+parameters set by the profile:
+
++------------------------+-------------+--------+-------+
+| Service Type | Reservation | Weight | Limit |
++========================+=============+========+=======+
+| client | 30% | 1 | MAX |
++------------------------+-------------+--------+-------+
+| background recovery | 70% | 2 | MAX |
++------------------------+-------------+--------+-------+
+| background best-effort | MIN | 1 | MAX |
++------------------------+-------------+--------+-------+
+
+.. note:: Across the built-in profiles, internal background best-effort clients
+ of mclock include "backfill", "scrub", "snap trim", and "pg deletion"
+ operations.
+
+
+Custom Profile
+--------------
+This profile gives users complete control over all the mclock configuration
+parameters. This profile should be used with caution and is meant for advanced
+users, who understand mclock and Ceph related configuration options.
+
+
+.. index:: mclock; built-in profiles
+
+mClock Built-in Profiles - Locked Config Options
+=================================================
+The sections below describe the config options that are locked to certain
+values in order to ensure that the mClock scheduler is able to provide
+predictable QoS.
+
+mClock Config Options
+---------------------
+.. important:: These defaults cannot be changed using any of the config
+ subsystem commands like *config set* or via the *config daemon* or *config
+ tell* interfaces. Although the above command(s) report success, the mclock
+ QoS parameters are reverted to their respective built-in profile defaults.
+
+When a built-in profile is enabled, the mClock scheduler calculates the low
+level mclock parameters [*reservation*, *weight*, *limit*] based on the profile
+enabled for each client type. The mclock parameters are calculated based on
+the max OSD capacity provided beforehand. As a result, the following mclock
+config parameters cannot be modified when using any of the built-in profiles:
+
+- :confval:`osd_mclock_scheduler_client_res`
+- :confval:`osd_mclock_scheduler_client_wgt`
+- :confval:`osd_mclock_scheduler_client_lim`
+- :confval:`osd_mclock_scheduler_background_recovery_res`
+- :confval:`osd_mclock_scheduler_background_recovery_wgt`
+- :confval:`osd_mclock_scheduler_background_recovery_lim`
+- :confval:`osd_mclock_scheduler_background_best_effort_res`
+- :confval:`osd_mclock_scheduler_background_best_effort_wgt`
+- :confval:`osd_mclock_scheduler_background_best_effort_lim`
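+
+As a hypothetical illustration of this behavior (the OSD id and the value used
+below are assumptions), an attempt to override one of these options while a
+built-in profile is active appears to succeed, but the value is reverted to
+the built-in profile default, which *config show* confirms:
+
+.. prompt:: bash #
+
+   ceph config set osd.0 osd_mclock_scheduler_client_res 0.5
+
+.. prompt:: bash #
+
+   ceph config show osd.0 osd_mclock_scheduler_client_res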
+
+Recovery/Backfill Options
+-------------------------
+.. warning:: The recommendation is to not change these options as the built-in
+ profiles are optimized based on them. Changing these defaults can result in
+ unexpected performance outcomes.
+
+The following recovery and backfill related Ceph options are overridden to
+mClock defaults:
+
+- :confval:`osd_max_backfills`
+- :confval:`osd_recovery_max_active`
+- :confval:`osd_recovery_max_active_hdd`
+- :confval:`osd_recovery_max_active_ssd`
+
+The following table shows the mClock defaults, which are the same as the
+current defaults. This is done to maximize the performance of the foreground
+(client) operations:
+
++----------------------------------------+------------------+----------------+
+| Config Option | Original Default | mClock Default |
++========================================+==================+================+
+| :confval:`osd_max_backfills` | 1 | 1 |
++----------------------------------------+------------------+----------------+
+| :confval:`osd_recovery_max_active` | 0 | 0 |
++----------------------------------------+------------------+----------------+
+| :confval:`osd_recovery_max_active_hdd` | 3 | 3 |
++----------------------------------------+------------------+----------------+
+| :confval:`osd_recovery_max_active_ssd` | 10 | 10 |
++----------------------------------------+------------------+----------------+
+
+The above mClock defaults can be modified, if necessary, by enabling
+:confval:`osd_mclock_override_recovery_settings` (default: false). The
+steps for this are discussed in the
+`Steps to Modify mClock Max Backfills/Recovery Limits`_ section.
+
+Sleep Options
+-------------
+If any mClock profile (including "custom") is active, the following Ceph config
+sleep options are disabled (set to 0):
+
+- :confval:`osd_recovery_sleep`
+- :confval:`osd_recovery_sleep_hdd`
+- :confval:`osd_recovery_sleep_ssd`
+- :confval:`osd_recovery_sleep_hybrid`
+- :confval:`osd_scrub_sleep`
+- :confval:`osd_delete_sleep`
+- :confval:`osd_delete_sleep_hdd`
+- :confval:`osd_delete_sleep_ssd`
+- :confval:`osd_delete_sleep_hybrid`
+- :confval:`osd_snap_trim_sleep`
+- :confval:`osd_snap_trim_sleep_hdd`
+- :confval:`osd_snap_trim_sleep_ssd`
+- :confval:`osd_snap_trim_sleep_hybrid`
+
+The above sleep options are disabled to ensure that the mclock scheduler is
+able to determine when to pick the next op from its operation queue and
+transfer it to the operation sequencer. This results in the desired QoS being
+provided across all its clients.
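+
+As a quick sanity check (osd.0 below is an assumed example id), the effective
+value of any of these sleep options can be verified on a running OSD:
+
+.. prompt:: bash #
+
+   ceph config show osd.0 osd_recovery_sleep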
+
+
+.. index:: mclock; enable built-in profile
+
+Steps to Enable mClock Profile
+==============================
+
+As already mentioned, the default mclock profile is set to *balanced*.
+The other values for the built-in profiles include *high_client_ops* and
+*high_recovery_ops*.
+
+If there is a requirement to change the default profile, then the option
+:confval:`osd_mclock_profile` may be set during runtime by using the following
+command:
+
+ .. prompt:: bash #
+
+ ceph config set osd.N osd_mclock_profile <value>
+
+For example, to change the profile to allow faster recoveries on "osd.0", the
+following command can be used to switch to the *high_recovery_ops* profile:
+
+ .. prompt:: bash #
+
+ ceph config set osd.0 osd_mclock_profile high_recovery_ops
+
+.. note:: The *custom* profile is not recommended unless you are an advanced
+ user.
+
+And that's it! You are ready to run workloads on the cluster and check if the
+QoS requirements are being met.
+
+
+Switching Between Built-in and Custom Profiles
+==============================================
+
+There may be situations requiring switching from a built-in profile to the
+*custom* profile and vice-versa. The following sections outline the steps to
+accomplish this.
+
+Steps to Switch From a Built-in to the Custom Profile
+-----------------------------------------------------
+
+The following command can be used to switch a specific OSD to the *custom*
+profile:
+
+ .. prompt:: bash #
+
+ ceph config set osd.N osd_mclock_profile custom
+
+For example, to change the profile to *custom* on all OSDs, the following
+command can be used:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_mclock_profile custom
+
+After switching to the *custom* profile, the desired mClock configuration
+option may be modified. For example, to change the client reservation IOPS
+ratio for a specific OSD (say osd.0) to 0.5 (or 50%), the following command
+can be used:
+
+ .. prompt:: bash #
+
+ ceph config set osd.0 osd_mclock_scheduler_client_res 0.5
+
+.. important:: Care must be taken to change the reservations of other services
+ like recovery and background best effort accordingly to ensure that the sum
+ of the reservations do not exceed the maximum proportion (1.0) of the IOPS
+ capacity of the OSD.
+
+.. tip:: The reservation and limit parameter allocations are per-shard based on
+ the type of backing device (HDD/SSD) under the OSD. See
+ :confval:`osd_op_num_shards_hdd` and :confval:`osd_op_num_shards_ssd` for
+ more details.
+
+Steps to Switch From the Custom Profile to a Built-in Profile
+-------------------------------------------------------------
+
+Switching from the *custom* profile to a built-in profile requires an
+intermediate step of removing the custom settings from the central config
+database for the changes to take effect.
+
+The following sequence of commands can be used to switch to a built-in profile:
+
+#. Set the desired built-in profile using:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_mclock_profile <built-in profile>
+
+ For example, to set the built-in profile to ``high_client_ops`` on all
+ OSDs, run the following command:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_mclock_profile high_client_ops
+#. Determine the existing custom mClock configuration settings in the central
+ config database using the following command:
+
+ .. prompt:: bash #
+
+ ceph config dump
+#. Remove the custom mClock configuration settings determined in the previous
+ step from the central config database:
+
+ .. prompt:: bash #
+
+ ceph config rm osd <mClock Configuration Option>
+
+ For example, to remove the configuration option
+ :confval:`osd_mclock_scheduler_client_res` that was set on all OSDs, run the
+ following command:
+
+ .. prompt:: bash #
+
+ ceph config rm osd osd_mclock_scheduler_client_res
+#. After all existing custom mClock configuration settings have been removed
+ from the central config database, the configuration settings pertaining to
+ ``high_client_ops`` will come into effect. For example, to verify the
+ settings on osd.0 use:
+
+ .. prompt:: bash #
+
+ ceph config show osd.0
+
+Switch Temporarily Between mClock Profiles
+------------------------------------------
+
+To switch between mClock profiles on a temporary basis, the following commands
+may be used to override the settings:
+
+.. warning:: This section is for advanced users or for experimental testing. The
+ recommendation is to not use the below commands on a running cluster as it
+ could have unexpected outcomes.
+
+.. note:: The configuration changes on an OSD using the below commands are
+ ephemeral and are lost when it restarts. It is also important to note that
+ the config options overridden using the below commands cannot be modified
+ further using the *ceph config set osd.N ...* command. The changes will not
+ take effect until a given OSD is restarted. This is intentional, as per the
+ config subsystem design. However, any further modification can still be made
+ ephemerally using the commands mentioned below.
+
+#. Run the *injectargs* command as shown to override the mclock settings:
+
+ .. prompt:: bash #
+
+ ceph tell osd.N injectargs '--<mClock Configuration Option>=<value>'
+
+ For example, the following command overrides the
+ :confval:`osd_mclock_profile` option on osd.0:
+
+ .. prompt:: bash #
+
+ ceph tell osd.0 injectargs '--osd_mclock_profile=high_recovery_ops'
+
+
+#. An alternate command that can be used is:
+
+ .. prompt:: bash #
+
+ ceph daemon osd.N config set <mClock Configuration Option> <value>
+
+ For example, the following command overrides the
+ :confval:`osd_mclock_profile` option on osd.0:
+
+ .. prompt:: bash #
+
+ ceph daemon osd.0 config set osd_mclock_profile high_recovery_ops
+
+The individual QoS-related config options for the *custom* profile can also be
+modified ephemerally using the above commands.
+
+
+Steps to Modify mClock Max Backfills/Recovery Limits
+====================================================
+
+This section describes the steps to modify the default max backfills or recovery
+limits if the need arises.
+
+.. warning:: This section is for advanced users or for experimental testing. The
+ recommendation is to retain the defaults as is on a running cluster as
+ modifying them could have unexpected performance outcomes. The values may
+ be modified only if the cluster is unable to cope/showing poor performance
+ with the default settings or for performing experiments on a test cluster.
+
+.. important:: The max backfill/recovery options that can be modified are listed
+ in section `Recovery/Backfill Options`_. The modification of the mClock
+ default backfills/recovery limit is gated by the
+ :confval:`osd_mclock_override_recovery_settings` option, which is set to
+ *false* by default. Attempting to modify any default recovery/backfill
+ limits without setting the gating option will reset that option back to the
+ mClock defaults along with a warning message logged in the cluster log. Note
+ that it may take a few seconds for the default value to come back into
+ effect. Verify the limit using the *config show* command as shown below.
+
+#. Set the :confval:`osd_mclock_override_recovery_settings` config option on all
+ OSDs to *true* using:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_mclock_override_recovery_settings true
+
+#. Set the desired max backfill/recovery option using:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_max_backfills <value>
+
+ For example, the following command modifies the :confval:`osd_max_backfills`
+ option on all OSDs to 5:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_max_backfills 5
+
+#. Wait for a few seconds and verify the running configuration for a specific
+ OSD using:
+
+ .. prompt:: bash #
+
+ ceph config show osd.N | grep osd_max_backfills
+
+ For example, the following command shows the running configuration of
+ :confval:`osd_max_backfills` on osd.0:
+
+ .. prompt:: bash #
+
+ ceph config show osd.0 | grep osd_max_backfills
+
+#. Reset the :confval:`osd_mclock_override_recovery_settings` config option on
+ all OSDs to *false* using:
+
+ .. prompt:: bash #
+
+ ceph config set osd osd_mclock_override_recovery_settings false
+
+
+OSD Capacity Determination (Automated)
+======================================
+
+The OSD capacity in terms of total IOPS is determined automatically during OSD
+initialization. This is achieved by running the OSD bench tool and overriding
+the default value of the ``osd_mclock_max_capacity_iops_[hdd, ssd]`` option
+depending on the device type. No other action or input is expected from the user
+to set the OSD capacity.
+
+.. note:: If you wish to manually benchmark OSD(s) or manually tune the
+ Bluestore throttle parameters, see section
+ `Steps to Manually Benchmark an OSD (Optional)`_.
+
+You may verify the capacity of an OSD after the cluster is brought up by using
+the following command:
+
+ .. prompt:: bash #
+
+ ceph config show osd.N osd_mclock_max_capacity_iops_[hdd, ssd]
+
+For example, the following command shows the max capacity for "osd.0" on a Ceph
+node whose underlying device type is SSD:
+
+ .. prompt:: bash #
+
+ ceph config show osd.0 osd_mclock_max_capacity_iops_ssd
+
+Mitigation of Unrealistic OSD Capacity From Automated Test
+----------------------------------------------------------
+In certain conditions, the OSD bench tool may show unrealistic or inflated
+results depending on the drive configuration and other environment related
+conditions. To mitigate the performance impact due to this unrealistic
+capacity, two threshold config options, depending on the OSD's device type,
+are defined and used:
+
+- :confval:`osd_mclock_iops_capacity_threshold_hdd` = 500
+- :confval:`osd_mclock_iops_capacity_threshold_ssd` = 80000
+
+The following automated step is performed:
+
+Fallback to using default OSD capacity (automated)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If OSD bench reports a measurement that exceeds the above threshold values
+depending on the underlying device type, the fallback mechanism reverts to the
+default value of :confval:`osd_mclock_max_capacity_iops_hdd` or
+:confval:`osd_mclock_max_capacity_iops_ssd`. The threshold config options
+can be reconfigured based on the type of drive used. Additionally, a cluster
+warning is logged in case the measurement exceeds the threshold. For example, ::
+
+ 2022-10-27T15:30:23.270+0000 7f9b5dbe95c0 0 log_channel(cluster) log [WRN]
+ : OSD bench result of 39546.479392 IOPS exceeded the threshold limit of
+ 25000.000000 IOPS for osd.1. IOPS capacity is unchanged at 21500.000000
+ IOPS. The recommendation is to establish the osd's IOPS capacity using other
+ benchmark tools (e.g. Fio) and then override
+ osd_mclock_max_capacity_iops_[hdd|ssd].
+
+If the default capacity doesn't accurately represent the OSD's capacity, the
+following additional step is recommended to address this:
+
+Run custom drive benchmark if defaults are not accurate (manual)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If the default OSD capacity is not accurate, the recommendation is to run a
+custom benchmark using your preferred tool (e.g. Fio) on the drive and then
+override the ``osd_mclock_max_capacity_iops_[hdd, ssd]`` option as described
+in the `Specifying Max OSD Capacity`_ section.
+
+This step is highly recommended until an alternate mechanism is available.
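+
+As a hedged sketch of such a custom benchmark (the device path, run time, and
+queue depth below are assumptions to adapt to your environment), a 4 KiB
+random write test with Fio might look like the following. Note that this
+writes directly to the device and destroys any data on it, so run it only
+against an unused drive:
+
+.. prompt:: bash #
+
+   fio --name=iops-test --filename=/dev/sdX --direct=1 --ioengine=libaio \
+       --rw=randwrite --bs=4k --iodepth=16 --numjobs=1 --runtime=60 \
+       --time_based --group_reporting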
+
+Steps to Manually Benchmark an OSD (Optional)
+=============================================
+
+.. note:: These steps are only necessary if you want to override the OSD
+ capacity already determined automatically during OSD initialization.
+ Otherwise, you may skip this section entirely.
+
+.. tip:: If you have already determined the benchmark data and wish to manually
+ override the max osd capacity for an OSD, you may skip to section
+ `Specifying Max OSD Capacity`_.
+
+
+Any existing benchmarking tool (e.g. Fio) can be used for this purpose. The
+steps below use the *Ceph OSD Bench* command described in the next section.
+Regardless of the tool/command used, the steps outlined further below remain
+the same.
+
+As already described in the :ref:`dmclock-qos` section, the number of
+shards and the bluestore's throttle parameters have an impact on the mclock op
+queues. Therefore, it is critical to set these values carefully in order to
+maximize the impact of the mclock scheduler.
+
+:Number of Operational Shards:
+ We recommend using the default number of shards as defined by the
+ configuration options ``osd_op_num_shards``, ``osd_op_num_shards_hdd``, and
+ ``osd_op_num_shards_ssd``. In general, a lower number of shards will increase
+ the impact of the mclock queues.
+
+:Bluestore Throttle Parameters:
+ We recommend using the default values as defined by
+ :confval:`bluestore_throttle_bytes` and
+ :confval:`bluestore_throttle_deferred_bytes`. But these parameters may also be
+ determined during the benchmarking phase as described below.
+
+OSD Bench Command Syntax
+------------------------
+
+The :ref:`osd-subsystem` section describes the OSD bench command. The syntax
+used for benchmarking is shown below:
+
+.. prompt:: bash #
+
+ ceph tell osd.N bench [TOTAL_BYTES] [BYTES_PER_WRITE] [OBJ_SIZE] [NUM_OBJS]
+
+where:
+
+* ``TOTAL_BYTES``: Total number of bytes to write
+* ``BYTES_PER_WRITE``: Block size per write
+* ``OBJ_SIZE``: Bytes per object
+* ``NUM_OBJS``: Number of objects to write
+
+Benchmarking Test Steps Using OSD Bench
+---------------------------------------
+
+The steps below use the default shards and detail the steps used to determine
+the correct bluestore throttle values (optional).
+
+#. Bring up your Ceph cluster and log in to the Ceph node hosting the OSDs that
+ you wish to benchmark.
+#. Run a simple 4KiB random write workload on an OSD using the following
+ commands:
+
+ .. note:: Before running the test, caches must be cleared to get an
+ accurate measurement.
+
+ For example, if you are running the benchmark test on osd.0, run the following
+ commands:
+
+ .. prompt:: bash #
+
+ ceph tell osd.0 cache drop
+
+ .. prompt:: bash #
+
+ ceph tell osd.0 bench 12288000 4096 4194304 100
+
+#. Note the overall throughput (IOPS) obtained from the output of the osd bench
+ command. This value is the baseline throughput (IOPS) when the default
+ bluestore throttle options are in effect.
+#. If the intent is to determine the bluestore throttle values for your
+ environment, then set the two options, :confval:`bluestore_throttle_bytes`
+ and :confval:`bluestore_throttle_deferred_bytes` to 32 KiB (32768 bytes) each
+ to begin with. Otherwise, you may skip to the next section.
+#. Run the 4KiB random write test as before using OSD bench.
+#. Note the overall throughput from the output and compare the value
+ against the baseline throughput recorded in step 3.
+#. If the throughput doesn't match the baseline, double the bluestore
+ throttle options and repeat steps 5 through 7 until the obtained
+ throughput is very close to the baseline value.
+
+For example, during benchmarking on a machine with NVMe SSDs, a value of 256 KiB
+for both bluestore throttle and deferred bytes was determined to maximize the
+impact of mclock. For HDDs, the corresponding value was 40 MiB, where the
+overall throughput was roughly equal to the baseline throughput. Note that in
+general for HDDs, the bluestore throttle values are expected to be higher when
+compared to SSDs.
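+
+As an illustrative sketch (256 KiB, i.e. 262144 bytes, is the NVMe example
+value from above, not a universal recommendation), the determined throttle
+values could then be applied to all OSDs as follows:
+
+.. prompt:: bash #
+
+   ceph config set osd bluestore_throttle_bytes 262144
+
+.. prompt:: bash #
+
+   ceph config set osd bluestore_throttle_deferred_bytes 262144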
+
+
+Specifying Max OSD Capacity
+----------------------------
+
+The steps in this section may be performed only if you want to override the
+max OSD capacity automatically set during OSD initialization. The option
+``osd_mclock_max_capacity_iops_[hdd, ssd]`` for an OSD can be set by running the
+following command:
+
+ .. prompt:: bash #
+
+ ceph config set osd.N osd_mclock_max_capacity_iops_[hdd,ssd] <value>
+
+For example, the following command sets the max capacity for a specific OSD
+(say "osd.0") whose underlying device type is HDD to 350 IOPS:
+
+ .. prompt:: bash #
+
+ ceph config set osd.0 osd_mclock_max_capacity_iops_hdd 350
+
+Alternatively, you may specify the max capacity for OSDs within the Ceph
+configuration file under the respective [osd.N] section. See
+:ref:`ceph-conf-settings` for more details.
+
+
+.. index:: mclock; config settings
+
+mClock Config Options
+=====================
+
+.. confval:: osd_mclock_profile
+.. confval:: osd_mclock_max_capacity_iops_hdd
+.. confval:: osd_mclock_max_capacity_iops_ssd
+.. confval:: osd_mclock_max_sequential_bandwidth_hdd
+.. confval:: osd_mclock_max_sequential_bandwidth_ssd
+.. confval:: osd_mclock_force_run_benchmark_on_init
+.. confval:: osd_mclock_skip_benchmark
+.. confval:: osd_mclock_override_recovery_settings
+.. confval:: osd_mclock_iops_capacity_threshold_hdd
+.. confval:: osd_mclock_iops_capacity_threshold_ssd
+
+.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf
diff --git a/doc/rados/configuration/mon-config-ref.rst b/doc/rados/configuration/mon-config-ref.rst
new file mode 100644
index 000000000..e0a12d093
--- /dev/null
+++ b/doc/rados/configuration/mon-config-ref.rst
@@ -0,0 +1,642 @@
+.. _monitor-config-reference:
+
+==========================
+ Monitor Config Reference
+==========================
+
+Understanding how to configure a :term:`Ceph Monitor` is an important part of
+building a reliable :term:`Ceph Storage Cluster`. **All Ceph Storage Clusters
+have at least one monitor**. The monitor complement usually remains fairly
+consistent, but you can add, remove or replace a monitor in a cluster. See
+`Adding/Removing a Monitor`_ for details.
+
+
+.. index:: Ceph Monitor; Paxos
+
+Background
+==========
+
+Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`.
+
+The :term:`Cluster Map` makes it possible for :term:`Ceph client`\s to
+determine the location of all Ceph Monitors, Ceph OSD Daemons, and Ceph
+Metadata Servers. Clients do this by connecting to one Ceph Monitor and
+retrieving a current cluster map. Ceph clients must connect to a Ceph Monitor
+before they can read from or write to Ceph OSD Daemons or Ceph Metadata
+Servers. A Ceph client that has a current copy of the cluster map and the CRUSH
+algorithm can compute the location of any RADOS object within the cluster. This
+makes it possible for Ceph clients to talk directly to Ceph OSD Daemons. Direct
+communication between clients and Ceph OSD Daemons improves upon traditional
+storage architectures that required clients to communicate with a central
+component. See `Scalability and High Availability`_ for more on this subject.
+
+The Ceph Monitor's primary function is to maintain a master copy of the cluster
+map. Monitors also provide authentication and logging services. All changes in
+the monitor services are written by the Ceph Monitor to a single Paxos
+instance, and Paxos writes the changes to a key/value store. This provides
+strong consistency. Ceph Monitors are able to query the most recent version of
+the cluster map during sync operations, and they use the key/value store's
+snapshots and iterators (using RocksDB) to perform store-wide synchronization.
+
+.. ditaa::
+ /-------------\ /-------------\
+ | Monitor | Write Changes | Paxos |
+ | cCCC +-------------->+ cCCC |
+ | | | |
+ +-------------+ \------+------/
+ | Auth | |
+ +-------------+ | Write Changes
+ | Log | |
+ +-------------+ v
+ | Monitor Map | /------+------\
+ +-------------+ | Key / Value |
+ | OSD Map | | Store |
+ +-------------+ | cCCC |
+ | PG Map | \------+------/
+ +-------------+ ^
+ | MDS Map | | Read Changes
+ +-------------+ |
+ | cCCC |*---------------------+
+ \-------------/
+
+.. index:: Ceph Monitor; cluster map
+
+Cluster Maps
+------------
+
+The cluster map is a composite of maps, including the monitor map, the OSD map,
+the placement group map and the metadata server map. The cluster map tracks a
+number of important things: which processes are ``in`` the Ceph Storage
+Cluster; which of those processes are ``up`` and running or ``down``; whether
+the placement groups are ``active`` or ``inactive``, and ``clean`` or in some
+other state; and other details that reflect the current state of the cluster,
+such as the total amount of storage space and the amount of storage used.
+
+When there is a significant change in the state of the cluster--e.g., a Ceph OSD
+Daemon goes down, a placement group falls into a degraded state, etc.--the
+cluster map gets updated to reflect the current state of the cluster.
+Additionally, the Ceph Monitor also maintains a history of the prior states of
+the cluster. The monitor map, OSD map, placement group map and metadata server
+map each maintain a history of their map versions. We call each version an
+"epoch."
+
+When operating your Ceph Storage Cluster, keeping track of these states is an
+important part of your system administration duties. See `Monitoring a Cluster`_
+and `Monitoring OSDs and PGs`_ for additional details.
+
+.. index:: high availability; quorum
+
+Monitor Quorum
+--------------
+
+Our Configuring Ceph section provides a trivial `Ceph configuration file`_ that
+provides for one monitor in the test cluster. A cluster will run fine with a
+single monitor; however, **a single monitor is a single-point-of-failure**. To
+ensure high availability in a production Ceph Storage Cluster, you should run
+Ceph with multiple monitors so that the failure of a single monitor **WILL NOT**
+bring down your entire cluster.
+
+When a Ceph Storage Cluster runs multiple Ceph Monitors for high availability,
+Ceph Monitors use `Paxos`_ to establish consensus about the master cluster map.
+A consensus requires a majority of monitors running to establish a quorum for
+consensus about the cluster map (e.g., 1 out of 1; 2 out of 3; 3 out of 5;
+4 out of 6; etc.).
+
+.. confval:: mon_force_quorum_join
+
+.. index:: Ceph Monitor; consistency
+
+Consistency
+-----------
+
+When you add monitor settings to your Ceph configuration file, you need to be
+aware of some of the architectural aspects of Ceph Monitors. **Ceph imposes
+strict consistency requirements** for a Ceph monitor when discovering another
+Ceph Monitor within the cluster. Whereas Ceph Clients and other Ceph daemons
+use the Ceph configuration file to discover monitors, monitors discover each
+other using the monitor map (monmap), not the Ceph configuration file.
+
+A Ceph Monitor always refers to the local copy of the monmap when discovering
+other Ceph Monitors in the Ceph Storage Cluster. Using the monmap instead of the
+Ceph configuration file avoids errors that could break the cluster (e.g., typos
+in ``ceph.conf`` when specifying a monitor address or port). Since monitors use
+monmaps for discovery and they share monmaps with clients and other Ceph
+daemons, **the monmap provides monitors with a strict guarantee that their
+consensus is valid.**
+
+Strict consistency also applies to updates to the monmap. As with any other
+updates on the Ceph Monitor, changes to the monmap always run through a
+distributed consensus algorithm called `Paxos`_. The Ceph Monitors must agree on
+each update to the monmap, such as adding or removing a Ceph Monitor, to ensure
+that each monitor in the quorum has the same version of the monmap. Updates to
+the monmap are incremental so that Ceph Monitors have the latest agreed upon
+version, and a set of previous versions. Maintaining a history enables a Ceph
+Monitor that has an older version of the monmap to catch up with the current
+state of the Ceph Storage Cluster.
+
+If Ceph Monitors were to discover each other through the Ceph configuration file
+instead of through the monmap, additional risks would be introduced because
+Ceph configuration files are not updated and distributed automatically. Ceph
+Monitors might inadvertently use an older Ceph configuration file, fail to
+recognize a Ceph Monitor, fall out of a quorum, or develop a situation where
+`Paxos`_ is not able to determine the current state of the system accurately.
+
+
+.. index:: Ceph Monitor; bootstrapping monitors
+
+Bootstrapping Monitors
+----------------------
+
+In most configuration and deployment cases, tools that deploy Ceph help
+bootstrap the Ceph Monitors by generating a monitor map for you (e.g.,
+``cephadm``). A Ceph Monitor requires a few explicit
+settings:
+
+- **Filesystem ID**: The ``fsid`` is the unique identifier for your
+ object store. Since you can run multiple clusters on the same
+ hardware, you must specify the unique ID of the object store when
+ bootstrapping a monitor. Deployment tools usually do this for you
+ (e.g., ``cephadm`` can call a tool like ``uuidgen``), but you
+ may specify the ``fsid`` manually too.
+
+- **Monitor ID**: A monitor ID is a unique ID assigned to each monitor within
+ the cluster. It is an alphanumeric value, and by convention the identifier
+ usually follows an alphabetical increment (e.g., ``a``, ``b``, etc.). This
+ can be set in a Ceph configuration file (e.g., ``[mon.a]``, ``[mon.b]``, etc.),
+ by a deployment tool, or using the ``ceph`` commandline.
+
+- **Keys**: The monitor must have secret keys. A deployment tool such as
+ ``cephadm`` usually does this for you, but you may
+ perform this step manually too. See `Monitor Keyrings`_ for details.
+
+For additional details on bootstrapping, see `Bootstrapping a Monitor`_.
+
+.. index:: Ceph Monitor; configuring monitors
+
+Configuring Monitors
+====================
+
+To apply configuration settings to the entire cluster, enter the configuration
+settings under ``[global]``. To apply configuration settings to all monitors in
+your cluster, enter the configuration settings under ``[mon]``. To apply
+configuration settings to specific monitors, specify the monitor instance
+(e.g., ``[mon.a]``). By convention, monitor instance names use alpha notation.
+
+.. code-block:: ini
+
+ [global]
+
+ [mon]
+
+ [mon.a]
+
+ [mon.b]
+
+ [mon.c]
+
+
+Minimum Configuration
+---------------------
+
+The bare minimum monitor settings in the Ceph configuration file include a
+hostname and a network address for each monitor. You can configure these under
+``[mon]`` or under the entry for a specific monitor.
+
+.. code-block:: ini
+
+ [global]
+ mon_host = 10.0.0.2,10.0.0.3,10.0.0.4
+
+.. code-block:: ini
+
+ [mon.a]
+ host = hostname1
+ mon_addr = 10.0.0.10:6789
+
+See the `Network Configuration Reference`_ for details.
+
+.. note:: This minimum configuration for monitors assumes that a deployment
+ tool generates the ``fsid`` and the ``mon.`` key for you.
+
+Once you deploy a Ceph cluster, you **SHOULD NOT** change the IP addresses of
+monitors. However, if you decide to change the monitor's IP address, you
+must follow a specific procedure. See :ref:`Changing a Monitor's IP address` for
+details.
+
+Monitors can also be found by clients by using DNS SRV records. See `Monitor lookup through DNS`_ for details.
+
+Cluster ID
+----------
+
+Each Ceph Storage Cluster has a unique identifier (``fsid``). If specified, it
+usually appears under the ``[global]`` section of the configuration file.
+Deployment tools usually generate the ``fsid`` and store it in the monitor map,
+so the value may not appear in a configuration file. The ``fsid`` makes it
+possible to run daemons for multiple clusters on the same hardware.
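+
+On a running cluster, the ``fsid`` can be displayed with:
+
+.. prompt:: bash #
+
+   ceph fsid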
+
+.. confval:: fsid
+
+.. index:: Ceph Monitor; initial members
+
+Initial Members
+---------------
+
+We recommend running a production Ceph Storage Cluster with at least three Ceph
+Monitors to ensure high availability. When you run multiple monitors, you may
+specify the initial monitors that must be members of the cluster in order to
+establish a quorum. This may reduce the time it takes for your cluster to come
+online.
+
+.. code-block:: ini
+
+ [mon]
+ mon_initial_members = a,b,c
+
+
+.. confval:: mon_initial_members
+
+.. index:: Ceph Monitor; data path
+
+Data
+----
+
+Ceph provides a default path where Ceph Monitors store data. For optimal
+performance in a production Ceph Storage Cluster, we recommend running Ceph
+Monitors on separate hosts and drives from Ceph OSD Daemons. As leveldb uses
+``mmap()`` for writing the data, Ceph Monitors flush their data from memory to disk
+very often, which can interfere with Ceph OSD Daemon workloads if the data
+store is co-located with the OSD Daemons.
+
+In Ceph versions 0.58 and earlier, Ceph Monitors stored their data in plain files. This
+approach allowed users to inspect monitor data with common tools like ``ls``
+and ``cat``. However, this approach did not provide strong consistency.
+
+In Ceph versions 0.59 and later, Ceph Monitors store their data as key/value
+pairs. Ceph Monitors require `ACID`_ transactions. Using a data store prevents
+recovering Ceph Monitors from running corrupted versions through Paxos, and it
+enables multiple modification operations in one single atomic batch, among other
+advantages.
+
+Generally, we do not recommend changing the default data location. If you modify
+the default location, we recommend that you make it uniform across Ceph Monitors
+by setting it in the ``[mon]`` section of the configuration file.
+
+
+.. confval:: mon_data
+.. confval:: mon_data_size_warn
+.. confval:: mon_data_avail_warn
+.. confval:: mon_data_avail_crit
+.. confval:: mon_warn_on_crush_straw_calc_version_zero
+.. confval:: mon_warn_on_legacy_crush_tunables
+.. confval:: mon_crush_min_required_version
+.. confval:: mon_warn_on_osd_down_out_interval_zero
+.. confval:: mon_warn_on_slow_ping_ratio
+.. confval:: mon_warn_on_slow_ping_time
+.. confval:: mon_warn_on_pool_no_redundancy
+.. confval:: mon_cache_target_full_warn_ratio
+.. confval:: mon_health_to_clog
+.. confval:: mon_health_to_clog_tick_interval
+.. confval:: mon_health_to_clog_interval
+
+.. index:: Ceph Storage Cluster; capacity planning, Ceph Monitor; capacity planning
+
+.. _storage-capacity:
+
+Storage Capacity
+----------------
+
+When a Ceph Storage Cluster gets close to its maximum capacity
+(see ``mon_osd_full_ratio``), Ceph prevents you from writing to or reading from
+OSDs as a safety measure to prevent data loss. Therefore, letting a
+production Ceph Storage Cluster approach its full ratio is not a good practice,
+because it sacrifices high availability. The default full ratio is ``.95``, or
+95% of capacity. This is a very aggressive setting for a test cluster with a
+small number of OSDs.
+
+.. tip:: When monitoring your cluster, be alert to warnings related to the
+ ``nearfull`` ratio. Such warnings mean that a failure of one or more OSDs
+ could result in a temporary service disruption. Consider adding
+ more OSDs to increase storage capacity.
+
+A common scenario for test clusters involves a system administrator removing an
+OSD from the Ceph Storage Cluster, watching the cluster rebalance, then removing
+another OSD, and another, until at least one OSD eventually reaches the full
+ratio and the cluster locks up. We recommend a bit of capacity
+planning even with a test cluster. Planning enables you to gauge how much spare
+capacity you will need in order to maintain high availability. Ideally, you want
+to plan for a series of Ceph OSD Daemon failures where the cluster can recover
+to an ``active+clean`` state without replacing those OSDs
+immediately. Cluster operation continues in the ``active+degraded`` state, but this
+is not ideal for normal operation and should be addressed promptly.
+
+The following diagram depicts a simplistic Ceph Storage Cluster containing 33
+Ceph Nodes with one OSD per host, each OSD reading from
+and writing to a 3TB drive. So this exemplary Ceph Storage Cluster has a maximum
+actual capacity of 99TB. With a ``mon osd full ratio`` of ``0.95``, if the Ceph
+Storage Cluster falls to 5TB of remaining capacity, the cluster will not allow
+Ceph Clients to read and write data. So the Ceph Storage Cluster's operating
+capacity is 95TB, not 99TB.
+
+.. ditaa::
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | Rack 1 | | Rack 2 | | Rack 3 | | Rack 4 | | Rack 5 | | Rack 6 |
+ | cCCC | | cF00 | | cCCC | | cCCC | | cCCC | | cCCC |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | OSD 1 | | OSD 7 | | OSD 13 | | OSD 19 | | OSD 25 | | OSD 31 |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | OSD 2 | | OSD 8 | | OSD 14 | | OSD 20 | | OSD 26 | | OSD 32 |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | OSD 3 | | OSD 9 | | OSD 15 | | OSD 21 | | OSD 27 | | OSD 33 |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | OSD 4 | | OSD 10 | | OSD 16 | | OSD 22 | | OSD 28 | | Spare |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | OSD 5 | | OSD 11 | | OSD 17 | | OSD 23 | | OSD 29 | | Spare |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+ | OSD 6 | | OSD 12 | | OSD 18 | | OSD 24 | | OSD 30 | | Spare |
+ +--------+ +--------+ +--------+ +--------+ +--------+ +--------+
+
+It is normal in such a cluster for one or two OSDs to fail. A less frequent but
+reasonable scenario involves a rack's router or power supply failing, which
+brings down multiple OSDs simultaneously (e.g., OSDs 7-12). In such a scenario,
+you should still strive for a cluster that can remain operational and achieve an
+``active + clean`` state--even if that means adding a few hosts with additional
+OSDs in short order. If your capacity utilization is too high, you may not lose
+data, but you could still sacrifice data availability while resolving an outage
+within a failure domain if capacity utilization of the cluster exceeds the full
+ratio. For this reason, we recommend at least some rough capacity planning.
+
+Identify two numbers for your cluster:
+
+#. The number of OSDs.
+#. The total capacity of the cluster.
+
+If you divide the total capacity of your cluster by the number of OSDs in your
+cluster, you will find the mean average capacity of an OSD within your cluster.
+Consider multiplying that number by the number of OSDs you expect will fail
+simultaneously during normal operations (a relatively small number). Finally,
+multiply the capacity of the cluster by the full ratio to arrive at a maximum
+operating capacity; then, subtract the amount of data on the OSDs you expect
+to fail to arrive at a reasonable full ratio. Repeat the foregoing
+process with a higher number of OSD failures (e.g., a rack of OSDs) to arrive at
+a reasonable number for a near full ratio.
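+
+As an illustrative worked example (using the 33-OSD, 99TB cluster sketched
+above and assuming two simultaneous OSD failures)::
+
+    mean OSD capacity       = 99TB / 33 OSDs         = 3TB
+    expected failure loss   = 2 OSDs x 3TB           = 6TB
+    maximum operating level = 99TB x 0.95            = 94.05TB
+    reasonable full ratio   = (94.05TB - 6TB) / 99TB = ~0.89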
+
+The following settings only apply on cluster creation and are then stored in
+the OSDMap. To clarify, in normal operation the values that are used by OSDs
+are those found in the OSDMap, not those in the configuration file or central
+config store.
+
+.. code-block:: ini
+
+ [global]
+ mon_osd_full_ratio = .80
+ mon_osd_backfillfull_ratio = .75
+ mon_osd_nearfull_ratio = .70
+
+
+``mon_osd_full_ratio``
+
+:Description: The threshold percentage of device space utilized before an OSD is
+ considered ``full``.
+
+:Type: Float
+:Default: ``0.95``
+
+
+``mon_osd_backfillfull_ratio``
+
+:Description: The threshold percentage of device space utilized before an OSD is
+ considered too ``full`` to backfill.
+
+:Type: Float
+:Default: ``0.90``
+
+
+``mon_osd_nearfull_ratio``
+
+:Description: The threshold percentage of device space used before an OSD is
+ considered ``nearfull``.
+
+:Type: Float
+:Default: ``0.85``
+
+
+.. tip:: If some OSDs are nearfull, but others have plenty of capacity, you
+ may have an inaccurate CRUSH weight set for the nearfull OSDs.
+
+.. tip:: These settings only apply during cluster creation. Afterwards they
+ need to be changed in the OSDMap using ``ceph osd set-nearfull-ratio`` and
+ ``ceph osd set-full-ratio``.
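+
+For example, the ratios can be adjusted on a running cluster as follows (the
+values shown are illustrative, not recommendations):
+
+.. prompt:: bash #
+
+   ceph osd set-nearfull-ratio 0.85
+
+.. prompt:: bash #
+
+   ceph osd set-full-ratio 0.95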
+
+.. index:: heartbeat
+
+Heartbeat
+---------
+
+Ceph monitors know about the cluster by requiring reports from each OSD, and by
+receiving reports from OSDs about the status of their neighboring OSDs. Ceph
+provides reasonable default settings for monitor/OSD interaction; however, you
+may modify them as needed. See `Monitor/OSD Interaction`_ for details.
+
+
+.. index:: Ceph Monitor; leader, Ceph Monitor; provider, Ceph Monitor; requester, Ceph Monitor; synchronization
+
+Monitor Store Synchronization
+-----------------------------
+
+When you run a production cluster with multiple monitors (recommended), each
+monitor checks to see if a neighboring monitor has a more recent version of the
+cluster map (e.g., a map in a neighboring monitor with one or more epoch numbers
+higher than the most current epoch in the map of the instant monitor).
+Periodically, one monitor in the cluster may fall behind the other monitors to
+the point where it must leave the quorum, synchronize to retrieve the most
+current information about the cluster, and then rejoin the quorum. For the
+purposes of synchronization, monitors may assume one of three roles:
+
+#. **Leader**: The `Leader` is the first monitor to achieve the most recent
+ Paxos version of the cluster map.
+
+#. **Provider**: The `Provider` is a monitor that has the most recent version
+ of the cluster map, but wasn't the first to achieve the most recent version.
+
+#. **Requester:** A `Requester` is a monitor that has fallen behind the leader
+ and must synchronize in order to retrieve the most recent information about
+ the cluster before it can rejoin the quorum.
+
+These roles enable a leader to delegate synchronization duties to a provider,
+which prevents synchronization requests from overloading the leader--improving
+performance. In the following diagram, the requester has learned that it has
+fallen behind the other monitors. The requester asks the leader to synchronize,
+and the leader tells the requester to synchronize with a provider.
+
+
+.. ditaa::
+ +-----------+ +---------+ +----------+
+ | Requester | | Leader | | Provider |
+ +-----------+ +---------+ +----------+
+ | | |
+ | | |
+ | Ask to Synchronize | |
+ |------------------->| |
+ | | |
+ |<-------------------| |
+ | Tell Requester to | |
+ | Sync with Provider | |
+ | | |
+ | Synchronize |
+ |--------------------+-------------------->|
+ | | |
+ |<-------------------+---------------------|
+ | Send Chunk to Requester |
+ | (repeat as necessary) |
+ | Requester Acks Chunk to Provider |
+ |--------------------+-------------------->|
+ | |
+ | Sync Complete |
+ | Notification |
+ |------------------->|
+ | |
+ |<-------------------|
+ | Ack |
+ | |
+
+
+Synchronization always occurs when a new monitor joins the cluster. During
+runtime operations, monitors may receive updates to the cluster map at different
+times. This means the leader and provider roles may migrate from one monitor to
+another. If this happens while synchronizing (e.g., a provider falls behind the
+leader), the provider can terminate synchronization with a requester.
+
+Once synchronization is complete, Ceph performs trimming across the cluster.
+Trimming requires that the placement groups are ``active+clean``.
+
+
+.. confval:: mon_sync_timeout
+.. confval:: mon_sync_max_payload_size
+.. confval:: paxos_max_join_drift
+.. confval:: paxos_stash_full_interval
+.. confval:: paxos_propose_interval
+.. confval:: paxos_min
+.. confval:: paxos_min_wait
+.. confval:: paxos_trim_min
+.. confval:: paxos_trim_max
+.. confval:: paxos_service_trim_min
+.. confval:: paxos_service_trim_max
+.. confval:: paxos_service_trim_max_multiplier
+.. confval:: mon_mds_force_trim_to
+.. confval:: mon_osd_force_trim_to
+.. confval:: mon_osd_cache_size
+.. confval:: mon_election_timeout
+.. confval:: mon_lease
+.. confval:: mon_lease_renew_interval_factor
+.. confval:: mon_lease_ack_timeout_factor
+.. confval:: mon_accept_timeout_factor
+.. confval:: mon_min_osdmap_epochs
+.. confval:: mon_max_log_epochs
+
+
+.. index:: Ceph Monitor; clock
+
+.. _mon-config-ref-clock:
+
+Clock
+-----
+
+Ceph daemons pass critical messages to each other, which must be processed
+before daemons reach a timeout threshold. If the clocks in Ceph monitors
+are not synchronized, it can lead to a number of anomalies. For example:
+
+- Daemons ignoring received messages (e.g., timestamps outdated)
+- Timeouts triggered too soon or too late when a message wasn't received in time.
+
+See `Monitor Store Synchronization`_ for details.
+
+
+.. tip:: You must configure NTP or PTP daemons on your Ceph monitor hosts to
+ ensure that the monitor cluster operates with synchronized clocks.
+ It can be advantageous to have monitor hosts sync with each other
+ as well as with multiple quality upstream time sources.
+
+Clock drift may still be noticeable with NTP even though the discrepancy is not
+yet harmful. Ceph's clock drift / clock skew warnings may get triggered even
+though NTP maintains a reasonable level of synchronization. Increasing your
+clock drift may be tolerable under such circumstances; however, a number of
+factors such as workload, network latency, configuring overrides to default
+timeouts and the `Monitor Store Synchronization`_ settings may influence
+the level of acceptable clock drift without compromising Paxos guarantees.
+
+Ceph provides the following tunable options to allow you to find
+acceptable values.
+
+.. confval:: mon_tick_interval
+.. confval:: mon_clock_drift_allowed
+.. confval:: mon_clock_drift_warn_backoff
+.. confval:: mon_timecheck_interval
+.. confval:: mon_timecheck_skew_interval
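+
+For example (an illustrative sketch, not a recommendation), the tolerated
+clock drift could be raised slightly on the monitors:
+
+.. prompt:: bash #
+
+   ceph config set mon mon_clock_drift_allowed 0.1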
+
+Client
+------
+
+.. confval:: mon_client_hunt_interval
+.. confval:: mon_client_ping_interval
+.. confval:: mon_client_max_log_entries_per_message
+.. confval:: mon_client_bytes
+
+.. _pool-settings:
+
+Pool settings
+=============
+
+Since version v0.94 there has been support for pool flags, which allow or disallow changes to be made to pools.
+Monitors can also disallow removal of pools if appropriately configured. The inconvenience of this guardrail
+is far outweighed by the number of accidental pool (and thus data) deletions it prevents.
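+
+As an illustrative sketch (``mypool`` is an assumed pool name), a pool can be
+protected against accidental deletion by setting its ``nodelete`` flag, and
+pool deletion can be disallowed cluster-wide:
+
+.. prompt:: bash #
+
+   ceph osd pool set mypool nodelete true
+
+.. prompt:: bash #
+
+   ceph config set mon mon_allow_pool_delete false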
+
+.. confval:: mon_allow_pool_delete
+.. confval:: osd_pool_default_ec_fast_read
+.. confval:: osd_pool_default_flag_hashpspool
+.. confval:: osd_pool_default_flag_nodelete
+.. confval:: osd_pool_default_flag_nopgchange
+.. confval:: osd_pool_default_flag_nosizechange
+
+For more information about the pool flags see :ref:`Pool values <setpoolvalues>`.
+
+Miscellaneous
+=============
+
+.. confval:: mon_max_osd
+.. confval:: mon_globalid_prealloc
+.. confval:: mon_subscribe_interval
+.. confval:: mon_stat_smooth_intervals
+.. confval:: mon_probe_timeout
+.. confval:: mon_daemon_bytes
+.. confval:: mon_max_log_entries_per_event
+.. confval:: mon_osd_prime_pg_temp
+.. confval:: mon_osd_prime_pg_temp_max_time
+.. confval:: mon_osd_prime_pg_temp_max_estimate
+.. confval:: mon_mds_skip_sanity
+.. confval:: mon_max_mdsmap_epochs
+.. confval:: mon_config_key_max_entry_size
+.. confval:: mon_scrub_interval
+.. confval:: mon_scrub_max_keys
+.. confval:: mon_compact_on_start
+.. confval:: mon_compact_on_bootstrap
+.. confval:: mon_compact_on_trim
+.. confval:: mon_cpu_threads
+.. confval:: mon_osd_mapping_pgs_per_chunk
+.. confval:: mon_session_timeout
+.. confval:: mon_osd_cache_size_min
+.. confval:: mon_memory_target
+.. confval:: mon_memory_autotune
+
+.. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science)
+.. _Monitor Keyrings: ../../../dev/mon-bootstrap#secret-keys
+.. _Ceph configuration file: ../ceph-conf/#monitors
+.. _Network Configuration Reference: ../network-config-ref
+.. _Monitor lookup through DNS: ../mon-lookup-dns
+.. _ACID: https://en.wikipedia.org/wiki/ACID
+.. _Adding/Removing a Monitor: ../../operations/add-or-rm-mons
+.. _Monitoring a Cluster: ../../operations/monitoring
+.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg
+.. _Bootstrapping a Monitor: ../../../dev/mon-bootstrap
+.. _Monitor/OSD Interaction: ../mon-osd-interaction
+.. _Scalability and High Availability: ../../../architecture#scalability-and-high-availability
diff --git a/doc/rados/configuration/mon-lookup-dns.rst b/doc/rados/configuration/mon-lookup-dns.rst
new file mode 100644
index 000000000..129a083c4
--- /dev/null
+++ b/doc/rados/configuration/mon-lookup-dns.rst
@@ -0,0 +1,58 @@
+.. _mon-dns-lookup:
+
+===============================
+Looking up Monitors through DNS
+===============================
+
+Since Ceph version 11.0.0 (Kraken), RADOS has supported looking up monitors
+through DNS.
+
+The addition of the ability to look up monitors through DNS means that daemons
+and clients do not require a *mon host* configuration directive in their
+``ceph.conf`` configuration file.
+
+With a DNS update, clients and daemons can be made aware of changes
+in the monitor topology. To be more precise and technical, clients look up the
+monitors by using ``DNS SRV TCP`` records.
+
+By default, clients and daemons look for the TCP service called *ceph-mon*,
+which is configured by the *mon_dns_srv_name* configuration directive.
+
+
+.. confval:: mon_dns_srv_name
+
+Example
+-------
+When the DNS search domain is set to *example.com*, a DNS zone file might contain the following elements.
+
+First, create records for the Monitors, either IPv4 (A) or IPv6 (AAAA).
+
+::
+
+ mon1.example.com. AAAA 2001:db8::100
+ mon2.example.com. AAAA 2001:db8::200
+ mon3.example.com. AAAA 2001:db8::300
+
+::
+
+ mon1.example.com. A 192.168.0.1
+ mon2.example.com. A 192.168.0.2
+ mon3.example.com. A 192.168.0.3
+
+
+With those records in place, we can create the SRV TCP records with the name *ceph-mon* pointing to the three Monitors.
+
+::
+
+ _ceph-mon._tcp.example.com. 60 IN SRV 10 20 6789 mon1.example.com.
+ _ceph-mon._tcp.example.com. 60 IN SRV 10 30 6789 mon2.example.com.
+ _ceph-mon._tcp.example.com. 60 IN SRV 20 50 6789 mon3.example.com.
+
+In this example, all Monitors run on port *6789*, with priorities 10, 10, 20 and weights 20, 30, 50 respectively.
+
+Monitor clients choose a Monitor by referencing the SRV records. If a cluster has multiple Monitor SRV records
+with the same priority value, clients and daemons will load balance the connections to Monitors in proportion
+to the values of the SRV weight fields.
+
+For the above example, this will result in approximately 40% of the clients and daemons connecting to mon1
+and 60% of them connecting to mon2. However, if neither of them is reachable, mon3 will be considered as a fallback.
diff --git a/doc/rados/configuration/mon-osd-interaction.rst b/doc/rados/configuration/mon-osd-interaction.rst
new file mode 100644
index 000000000..8cf09707d
--- /dev/null
+++ b/doc/rados/configuration/mon-osd-interaction.rst
@@ -0,0 +1,245 @@
+=====================================
+ Configuring Monitor/OSD Interaction
+=====================================
+
+.. index:: heartbeat
+
+After you have completed your initial Ceph configuration, you may deploy and run
+Ceph. When you execute a command such as ``ceph health`` or ``ceph -s``, the
+:term:`Ceph Monitor` reports on the current state of the :term:`Ceph Storage
+Cluster`. The Ceph Monitor knows about the Ceph Storage Cluster by requiring
+reports from each :term:`Ceph OSD Daemon`, and by receiving reports from Ceph
+OSD Daemons about the status of their neighboring Ceph OSD Daemons. If the Ceph
+Monitor doesn't receive reports, or if it receives reports of changes in the
+Ceph Storage Cluster, the Ceph Monitor updates the status of the :term:`Ceph
+Cluster Map`.
+
+Ceph provides reasonable default settings for Ceph Monitor/Ceph OSD Daemon
+interaction. However, you may override the defaults. The following sections
+describe how Ceph Monitors and Ceph OSD Daemons interact for the purposes of
+monitoring the Ceph Storage Cluster.
+
+.. index:: heartbeat interval
+
+OSDs Check Heartbeats
+=====================
+
+Each Ceph OSD Daemon checks the heartbeat of other Ceph OSD Daemons at random
+intervals of less than 6 seconds. If a neighboring Ceph OSD Daemon doesn't
+show a heartbeat within a 20-second grace period, the Ceph OSD Daemon may
+consider the neighboring Ceph OSD Daemon ``down`` and report it to a Ceph
+Monitor, which will update the Ceph Cluster Map. You may change this grace
+period by adding an ``osd heartbeat grace`` setting under the ``[mon]`` and
+``[osd]`` sections (or the ``[global]`` section) of your Ceph configuration
+file, or by setting the value at runtime.
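+
+For example, to lengthen the grace period at runtime (the value of 30 seconds
+is only illustrative):
+
+.. prompt:: bash $
+
+   ceph config set global osd_heartbeat_grace 30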
+
+
+.. ditaa::
+ +---------+ +---------+
+ | OSD 1 | | OSD 2 |
+ +---------+ +---------+
+ | |
+ |----+ Heartbeat |
+ | | Interval |
+ |<---+ Exceeded |
+ | |
+ | Check |
+ | Heartbeat |
+ |------------------->|
+ | |
+ |<-------------------|
+ | Heart Beating |
+ | |
+ |----+ Heartbeat |
+ | | Interval |
+ |<---+ Exceeded |
+ | |
+ | Check |
+ | Heartbeat |
+ |------------------->|
+ | |
+ |----+ Grace |
+ | | Period |
+ |<---+ Exceeded |
+ | |
+ |----+ Mark |
+ | | OSD 2 |
+ |<---+ Down |
+
+
+.. index:: OSD down report
+
+OSDs Report Down OSDs
+=====================
+
+By default, two Ceph OSD Daemons from different hosts must report to the Ceph
+Monitors that another Ceph OSD Daemon is ``down`` before the Ceph Monitors
+acknowledge that the reported Ceph OSD Daemon is ``down``. There is, however,
+a chance that all the OSDs reporting the failure are hosted in a rack with a
+bad switch that has trouble connecting to another OSD. To avoid this sort of
+false alarm, we consider the peers reporting a failure a proxy for a potential
+"subcluster" of the overall cluster that is similarly laggy. This is clearly
+not true in all cases, but it will sometimes help us localize the grace
+correction to a subset of the system that is unhappy. ``mon osd reporter
+subtree level`` is used to group the peers into the "subcluster" by their
+common ancestor type in the CRUSH map. By default, two reports from different
+subtrees are required to report another Ceph OSD Daemon ``down``. You can
+change the number of reporters from unique subtrees and the common ancestor
+type required to report a Ceph OSD Daemon ``down`` to a Ceph Monitor by adding
+``mon osd min down reporters`` and ``mon osd reporter subtree level`` settings
+under the ``[mon]`` section of your Ceph configuration file, or by setting the
+value at runtime.
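+
+For example, to require three reporters spread across different racks (these
+values are illustrative, not recommendations):
+
+.. code-block:: ini
+
+    [mon]
+    mon_osd_min_down_reporters = 3
+    mon_osd_reporter_subtree_level = rack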
+
+
+.. ditaa::
+
+ +---------+ +---------+ +---------+
+ | OSD 1 | | OSD 2 | | Monitor |
+ +---------+ +---------+ +---------+
+ | | |
+ | OSD 3 Is Down | |
+ |---------------+--------------->|
+ | | |
+ | | |
+ | | OSD 3 Is Down |
+ | |--------------->|
+ | | |
+ | | |
+ | | |---------+ Mark
+ | | | | OSD 3
+ | | |<--------+ Down
+
+
+.. index:: peering failure
+
+OSDs Report Peering Failure
+===========================
+
+If a Ceph OSD Daemon cannot peer with any of the Ceph OSD Daemons defined in its
+Ceph configuration file (or the cluster map), it will ping a Ceph Monitor for
+the most recent copy of the cluster map every 30 seconds. You can change the
+Ceph Monitor heartbeat interval by adding an ``osd mon heartbeat interval``
+setting under the ``[osd]`` section of your Ceph configuration file, or by
+setting the value at runtime.
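+
+For example, to poll the monitors less frequently at runtime (60 seconds is an
+illustrative value):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_mon_heartbeat_interval 60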
+
+.. ditaa::
+
+ +---------+ +---------+ +-------+ +---------+
+ | OSD 1 | | OSD 2 | | OSD 3 | | Monitor |
+ +---------+ +---------+ +-------+ +---------+
+ | | | |
+ | Request To | | |
+ | Peer | | |
+ |-------------->| | |
+ |<--------------| | |
+ | Peering | |
+ | | |
+ | Request To | |
+ | Peer | |
+ |----------------------------->| |
+ | |
+ |----+ OSD Monitor |
+ | | Heartbeat |
+ |<---+ Interval Exceeded |
+ | |
+ | Failed to Peer with OSD 3 |
+ |-------------------------------------------->|
+ |<--------------------------------------------|
+ | Receive New Cluster Map |
+
+
+.. index:: OSD status
+
+OSDs Report Their Status
+========================
+
+If a Ceph OSD Daemon doesn't report to a Ceph Monitor, the Ceph Monitor will
+consider the Ceph OSD Daemon ``down`` after the ``mon osd report timeout``
+elapses. A Ceph OSD Daemon sends a report to a Ceph Monitor within 5 seconds
+of a reportable event such as a failure, a change in placement group stats, a
+change in ``up_thru``, or a boot. You can change the Ceph OSD Daemon minimum
+report interval by adding an ``osd mon report interval`` setting under the
+``[osd]`` section of your Ceph configuration file, or by setting the value at
+runtime. A Ceph OSD Daemon also sends a report to a Ceph Monitor every 120
+seconds irrespective of whether any notable changes occur. You can change this
+maximum report interval by adding an ``osd mon report interval max`` setting
+under the ``[osd]`` section of your Ceph configuration file, or by setting the
+value at runtime.
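+
+For example, to shorten the minimum report interval at runtime (the value
+shown is illustrative):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_mon_report_interval 3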
+
+
+.. ditaa::
+
+ +---------+ +---------+
+ | OSD 1 | | Monitor |
+ +---------+ +---------+
+ | |
+ |----+ Report Min |
+ | | Interval |
+ |<---+ Exceeded |
+ | |
+ |----+ Reportable |
+ | | Event |
+ |<---+ Occurs |
+ | |
+ | Report To |
+ | Monitor |
+ |------------------->|
+ | |
+ |----+ Report Max |
+ | | Interval |
+ |<---+ Exceeded |
+ | |
+ | Report To |
+ | Monitor |
+ |------------------->|
+ | |
+ |----+ Monitor |
+ | | Fails |
+ |<---+ |
+ +----+ Monitor OSD
+ | | Report Timeout
+ |<---+ Exceeded
+ |
+ +----+ Mark
+ | | OSD 1
+ |<---+ Down
+
+
+
+
+Configuration Settings
+======================
+
+When modifying heartbeat settings, you should include them in the ``[global]``
+section of your configuration file.
+
+.. index:: monitor heartbeat
+
+Monitor Settings
+----------------
+
+.. confval:: mon_osd_min_up_ratio
+.. confval:: mon_osd_min_in_ratio
+.. confval:: mon_osd_laggy_halflife
+.. confval:: mon_osd_laggy_weight
+.. confval:: mon_osd_laggy_max_interval
+.. confval:: mon_osd_adjust_heartbeat_grace
+.. confval:: mon_osd_adjust_down_out_interval
+.. confval:: mon_osd_auto_mark_in
+.. confval:: mon_osd_auto_mark_auto_out_in
+.. confval:: mon_osd_auto_mark_new_in
+.. confval:: mon_osd_down_out_interval
+.. confval:: mon_osd_down_out_subtree_limit
+.. confval:: mon_osd_report_timeout
+.. confval:: mon_osd_min_down_reporters
+.. confval:: mon_osd_reporter_subtree_level
+
+.. index:: OSD heartbeat
+
+OSD Settings
+------------
+
+.. confval:: osd_heartbeat_interval
+.. confval:: osd_heartbeat_grace
+.. confval:: osd_mon_heartbeat_interval
+.. confval:: osd_mon_heartbeat_stat_stale
+.. confval:: osd_mon_report_interval
diff --git a/doc/rados/configuration/msgr2.rst b/doc/rados/configuration/msgr2.rst
new file mode 100644
index 000000000..33fe4e022
--- /dev/null
+++ b/doc/rados/configuration/msgr2.rst
@@ -0,0 +1,257 @@
+.. _msgr2:
+
+Messenger v2
+============
+
+What is it
+----------
+
+The messenger v2 protocol, or msgr2, is the second major revision of
+Ceph's on-wire protocol. It brings with it several key features:
+
+* A *secure* mode that encrypts all data passing over the network
+* Improved encapsulation of authentication payloads, enabling future
+ integration of new authentication modes like Kerberos
+* Improved earlier feature advertisement and negotiation, enabling
+ future protocol revisions
+
+Ceph daemons can now bind to multiple ports, allowing both legacy Ceph
+clients and new v2-capable clients to connect to the same cluster.
+
+By default, monitors now bind to the new IANA-assigned port ``3300``
+(ce4h or 0xce4) for the new v2 protocol, while also binding to the
+old default port ``6789`` for the legacy v1 protocol.
+
+.. _address_formats:
+
+Address formats
+---------------
+
+Prior to Nautilus, all network addresses were rendered like
+``1.2.3.4:567/89012`` where there was an IP address, a port, and a
+nonce to uniquely identify a client or daemon on the network.
+Starting with Nautilus, we now have three different address types:
+
+* **v2**: ``v2:1.2.3.4:578/89012`` identifies a daemon binding to a
+ port speaking the new v2 protocol
+* **v1**: ``v1:1.2.3.4:578/89012`` identifies a daemon binding to a
+ port speaking the legacy v1 protocol. Any address that was
+ previously shown with any prefix is now shown as a ``v1:`` address.
+* **TYPE_ANY**: ``any:1.2.3.4:578/89012`` identifies a client that can
+  speak either version of the protocol. Prior to Nautilus, clients would appear as
+ ``1.2.3.4:0/123456``, where the port of 0 indicates they are clients
+ and do not accept incoming connections. Starting with Nautilus,
+ these clients are now internally represented by a **TYPE_ANY**
+ address, and still shown with no prefix, because they may
+ connect to daemons using the v2 or v1 protocol, depending on what
+ protocol(s) the daemons are using.
+
+Because daemons now bind to multiple ports, they are now described by
+a vector of addresses instead of a single address. For example,
+dumping the monitor map on a Nautilus cluster now includes lines
+like::
+
+ epoch 1
+ fsid 50fcf227-be32-4bcb-8b41-34ca8370bd16
+ last_changed 2019-02-25 11:10:46.700821
+ created 2019-02-25 11:10:46.700821
+ min_mon_release 14 (nautilus)
+ 0: [v2:10.0.0.10:3300/0,v1:10.0.0.10:6789/0] mon.foo
+ 1: [v2:10.0.0.11:3300/0,v1:10.0.0.11:6789/0] mon.bar
+ 2: [v2:10.0.0.12:3300/0,v1:10.0.0.12:6789/0] mon.baz
+
+The bracketed list or vector of addresses means that the same daemon can be
+reached on multiple ports (and protocols). Any client or other daemon
+connecting to that daemon will use the v2 protocol (listed first) if
+possible; otherwise it will fall back to the legacy v1 protocol. Legacy
+clients will only see the v1 addresses and will continue to connect as
+they did before, with the v1 protocol.
+
+Starting in Nautilus, the ``mon_host`` configuration option and ``-m
+<mon-host>`` command line options support the same bracketed address
+vector syntax.
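+
+For example, a client can be pointed directly at a specific monitor's v2 and
+v1 addresses (the addresses are illustrative; the quotes keep the shell from
+interpreting the brackets):
+
+.. prompt:: bash $
+
+   ceph -m "[v2:10.0.0.10:3300,v1:10.0.0.10:6789]" -s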
+
+
+Bind configuration options
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Two new configuration options control whether the v1 and/or v2
+protocol is used:
+
+ * :confval:`ms_bind_msgr1` [default: true] controls whether a daemon binds
+ to a port speaking the v1 protocol
+ * :confval:`ms_bind_msgr2` [default: true] controls whether a daemon binds
+ to a port speaking the v2 protocol
+
+Similarly, two options control whether IPv4 and IPv6 addresses are used:
+
+ * :confval:`ms_bind_ipv4` [default: true] controls whether a daemon binds
+ to an IPv4 address
+ * :confval:`ms_bind_ipv6` [default: false] controls whether a daemon binds
+ to an IPv6 address
+
+.. note:: The ability to bind to multiple ports has paved the way for
+ dual-stack IPv4 and IPv6 support. That said, dual-stack operation is
+ not yet supported as of Quincy v17.2.0.
+
+Connection modes
+----------------
+
+The v2 protocol supports two connection modes:
+
+* *crc* mode provides:
+
+ - a strong initial authentication when the connection is established
+ (with cephx, mutual authentication of both parties with protection
+ from a man-in-the-middle or eavesdropper), and
+ - a crc32c integrity check to protect against bit flips due to flaky
+ hardware or cosmic rays
+
+ *crc* mode does *not* provide:
+
+ - secrecy (an eavesdropper on the network can see all
+ post-authentication traffic as it goes by) or
+  - protection from a malicious man-in-the-middle (who can deliberately
+ modify traffic as it goes by, as long as they are careful to
+ adjust the crc32c values to match)
+
+* *secure* mode provides:
+
+ - a strong initial authentication when the connection is established
+ (with cephx, mutual authentication of both parties with protection
+ from a man-in-the-middle or eavesdropper), and
+ - full encryption of all post-authentication traffic, including a
+ cryptographic integrity check.
+
+ In Nautilus, secure mode uses the `AES-GCM
+ <https://en.wikipedia.org/wiki/Galois/Counter_Mode>`_ stream cipher,
+ which is generally very fast on modern processors (e.g., faster than
+ a SHA-256 cryptographic hash).
+
+Connection mode configuration options
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For most connections, there are options that control which modes are used:
+
+.. confval:: ms_cluster_mode
+.. confval:: ms_service_mode
+.. confval:: ms_client_mode
+
+There are a parallel set of options that apply specifically to
+monitors, allowing administrators to set different (usually more
+secure) requirements on communication with the monitors.
+
+.. confval:: ms_mon_cluster_mode
+.. confval:: ms_mon_service_mode
+.. confval:: ms_mon_client_mode
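+
+For example, the following sketch requires *secure* mode for all
+communication with the monitors. Verify that all daemons and clients are
+msgr2-capable before making a change like this:
+
+.. prompt:: bash $
+
+   ceph config set global ms_mon_cluster_mode secure
+   ceph config set global ms_mon_service_mode secure
+   ceph config set global ms_mon_client_mode secure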
+
+
+Compression modes
+-----------------
+
+The v2 protocol supports two compression modes:
+
+* *force* mode compresses all messages. This is useful in several situations:
+
+  - In a multi-availability-zone deployment, compressing replication messages
+    between OSDs reduces latency.
+  - In the public cloud, inter-AZ communication is expensive, so minimizing
+    message size reduces network costs to the cloud provider.
+  - When using instance storage on AWS (and probably other public clouds as
+    well), instances with NVMe devices provide low network bandwidth relative
+    to the device bandwidth. In this case, network compression can improve
+    overall performance, since the network is clearly the bottleneck.
+
+* *none* mode transmits messages without compression.
+
+
+Compression mode configuration options
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For all connections, there is an option that controls compression usage in
+secure mode:
+
+.. confval:: ms_compress_secure
+
+There is a parallel set of options that apply specifically to OSDs,
+allowing administrators to set different requirements on communication between OSDs.
+
+.. confval:: ms_osd_compress_mode
+.. confval:: ms_osd_compress_min_size
+.. confval:: ms_osd_compression_algorithm
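+
+For example, to compress inter-OSD traffic (the minimum message size shown is
+illustrative):
+
+.. prompt:: bash $
+
+   ceph config set osd ms_osd_compress_mode force
+   ceph config set osd ms_osd_compress_min_size 1024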
+
+Transitioning from v1-only to v2-plus-v1
+----------------------------------------
+
+By default, ``ms_bind_msgr2`` is true starting with Nautilus 14.2.z.
+However, until the monitors start using v2, only limited services will
+start advertising v2 addresses.
+
+For most users, the monitors are binding to the default legacy port ``6789``
+for the v1 protocol. When this is the case, enabling v2 is as simple as:
+
+.. prompt:: bash $
+
+ ceph mon enable-msgr2
+
+If the monitors are bound to non-standard ports, you will need to
+specify an additional port for v2 explicitly. For example, if your
+monitor ``mon.a`` binds to ``1.2.3.4:1111``, and you want to add v2 on
+port ``1112``:
+
+.. prompt:: bash $
+
+ ceph mon set-addrs a [v2:1.2.3.4:1112,v1:1.2.3.4:1111]
+
+Once the monitors bind to v2, each daemon will start advertising a v2
+address when it is next restarted.
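+
+You can confirm which addresses each monitor is advertising by dumping the
+monitor map:
+
+.. prompt:: bash $
+
+   ceph mon dump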
+
+
+.. _msgr2_ceph_conf:
+
+Updating ceph.conf and mon_host
+-------------------------------
+
+Prior to Nautilus, a CLI user or daemon would normally discover the
+monitors via the ``mon_host`` option in ``/etc/ceph/ceph.conf``. Starting
+with Nautilus, the syntax for this option has expanded to support the new
+bracketed list format. For example, an old line like::
+
+ mon_host = 10.0.0.1:6789,10.0.0.2:6789,10.0.0.3:6789
+
+Can be changed to::
+
+ mon_host = [v2:10.0.0.1:3300/0,v1:10.0.0.1:6789/0],[v2:10.0.0.2:3300/0,v1:10.0.0.2:6789/0],[v2:10.0.0.3:3300/0,v1:10.0.0.3:6789/0]
+
+However, when default ports are used (``3300`` and ``6789``), they can
+be omitted::
+
+ mon_host = 10.0.0.1,10.0.0.2,10.0.0.3
+
+Once v2 has been enabled on the monitors, ``ceph.conf`` may need to be
+updated to either specify no ports (this is usually simplest), or
+explicitly specify both the v2 and v1 addresses. Note, however, that
+the new bracketed syntax is only understood by Nautilus and later, so
+do not make that change on hosts that have not yet had their ceph
+packages upgraded.
+
+When you are updating ``ceph.conf``, note that the new ``ceph config
+generate-minimal-conf`` command (which generates a barebones config
+file with just enough information to reach the monitors) and the
+``ceph config assimilate-conf`` command (which moves config file options
+into the monitors' configuration database) may be helpful. For example::
+
+ # ceph config assimilate-conf < /etc/ceph/ceph.conf
+  # ceph config generate-minimal-conf > /etc/ceph/ceph.conf.new
+ # cat /etc/ceph/ceph.conf.new
+ # minimal ceph.conf for 0e5a806b-0ce5-4bc6-b949-aa6f68f5c2a3
+ [global]
+ fsid = 0e5a806b-0ce5-4bc6-b949-aa6f68f5c2a3
+ mon_host = [v2:10.0.0.1:3300/0,v1:10.0.0.1:6789/0]
+ # mv /etc/ceph/ceph.conf.new /etc/ceph/ceph.conf
+
+Protocol
+--------
+
+For a detailed description of the v2 wire protocol, see :ref:`msgr2-protocol`.
diff --git a/doc/rados/configuration/network-config-ref.rst b/doc/rados/configuration/network-config-ref.rst
new file mode 100644
index 000000000..81e85c5d1
--- /dev/null
+++ b/doc/rados/configuration/network-config-ref.rst
@@ -0,0 +1,355 @@
+=================================
+ Network Configuration Reference
+=================================
+
+Network configuration is critical for building a high performance :term:`Ceph
+Storage Cluster`. The Ceph Storage Cluster does not perform request routing or
+dispatching on behalf of the :term:`Ceph Client`. Instead, Ceph Clients make
+requests directly to Ceph OSD Daemons. Ceph OSD Daemons perform data replication
+on behalf of Ceph Clients, which means replication and other factors impose
+additional loads on Ceph Storage Cluster networks.
+
+Our Quick Start configurations provide a trivial Ceph configuration file that
+sets monitor IP addresses and daemon host names only. Unless you specify a
+cluster network, Ceph assumes a single "public" network. Ceph functions just
+fine with a public network only, but you may see significant performance
+improvement with a second "cluster" network in a large cluster.
+
+It is possible to run a Ceph Storage Cluster with two networks: a public
+(client, front-side) network and a cluster (private, replication, back-side)
+network. However, this approach complicates network configuration (both
+hardware and software) and does not usually have a significant impact on
+overall performance. For this reason, we recommend that dual-NIC systems
+instead either bond the two interfaces active/active or implement a layer 3
+multipath strategy (with, e.g., FRR) for resilience and capacity.
+
+If, despite the complexity, one still wishes to use two networks, each
+:term:`Ceph Node` will need to have more than one network interface or VLAN. See `Hardware
+Recommendations - Networks`_ for additional details.
+
+.. ditaa::
+ +-------------+
+ | Ceph Client |
+ +----*--*-----+
+ | ^
+ Request | : Response
+ v |
+ /----------------------------------*--*-------------------------------------\
+ | Public Network |
+ \---*--*------------*--*-------------*--*------------*--*------------*--*---/
+ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ | | | | | | | | | |
+ | : | : | : | : | :
+ v v v v v v v v v v
+ +---*--*---+ +---*--*---+ +---*--*---+ +---*--*---+ +---*--*---+
+ | Ceph MON | | Ceph MDS | | Ceph OSD | | Ceph OSD | | Ceph OSD |
+ +----------+ +----------+ +---*--*---+ +---*--*---+ +---*--*---+
+ ^ ^ ^ ^ ^ ^
+ The cluster network relieves | | | | | |
+ OSD replication and heartbeat | : | : | :
+ traffic from the public network. v v v v v v
+ /------------------------------------*--*------------*--*------------*--*---\
+ | cCCC Cluster Network |
+ \---------------------------------------------------------------------------/
+
+
+IP Tables
+=========
+
+By default, daemons `bind`_ to ports within the ``6800:7300`` range. You may
+configure this range at your discretion. Before configuring your IP tables,
+check the default ``iptables`` configuration.
+
+.. prompt:: bash $
+
+ sudo iptables -L
+
+Some Linux distributions include rules that reject all inbound requests
+except SSH from all network interfaces. For example::
+
+ REJECT all -- anywhere anywhere reject-with icmp-host-prohibited
+
+You will need to delete these rules on both your public and cluster networks
+initially, and replace them with appropriate rules when you are ready to
+harden the ports on your Ceph Nodes.
+
+
+Monitor IP Tables
+-----------------
+
+Ceph Monitors listen on ports ``3300`` and ``6789`` by
+default. Additionally, Ceph Monitors always operate on the public
+network. When you add the rule using the example below, make sure you
+replace ``{iface}`` with the public network interface (e.g., ``eth0``,
+``eth1``, etc.), ``{ip-address}`` with the IP address of the public
+network and ``{netmask}`` with the netmask for the public network:
+
+.. prompt:: bash $
+
+ sudo iptables -A INPUT -i {iface} -p tcp -s {ip-address}/{netmask} --dport 6789 -j ACCEPT
+
+
+MDS and Manager IP Tables
+-------------------------
+
+A :term:`Ceph Metadata Server` or :term:`Ceph Manager` listens on the first
+available port on the public network beginning at port 6800. Note that this
+behavior is not deterministic, so if you are running more than one OSD or MDS
+on the same host, or if you restart the daemons within a short window of time,
+the daemons will bind to higher ports. You should open the entire 6800-7300
+range by default. When you add the rule using the example below, make sure
+you replace ``{iface}`` with the public network interface (e.g., ``eth0``,
+``eth1``, etc.), ``{ip-address}`` with the IP address of the public network
+and ``{netmask}`` with the netmask of the public network.
+
+For example:
+
+.. prompt:: bash $
+
+ sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
+
+
+OSD IP Tables
+-------------
+
+By default, Ceph OSD Daemons `bind`_ to the first available ports on a Ceph Node
+beginning at port 6800. Note that this behavior is not deterministic, so if you
+are running more than one OSD or MDS on the same host, or if you restart the
+daemons within a short window of time, the daemons will bind to higher ports.
+Each Ceph OSD Daemon on a Ceph Node may use up to four ports:
+
+#. One for talking to clients and monitors.
+#. One for sending data to other OSDs.
+#. Two for heartbeating on each interface.
+
+.. ditaa::
+ /---------------\
+ | OSD |
+ | +---+----------------+-----------+
+ | | Clients & Monitors | Heartbeat |
+ | +---+----------------+-----------+
+ | |
+ | +---+----------------+-----------+
+ | | Data Replication | Heartbeat |
+ | +---+----------------+-----------+
+ | cCCC |
+ \---------------/
+
+When a daemon fails and restarts without letting go of the port, the restarted
+daemon will bind to a new port. You should open the entire 6800-7300 port range
+to handle this possibility.
+
+If you set up separate public and cluster networks, you must add rules for both
+the public network and the cluster network, because clients will connect using
+the public network and other Ceph OSD Daemons will connect using the cluster
+network. When you add the rule using the example below, make sure you replace
+``{iface}`` with the network interface (e.g., ``eth0``, ``eth1``, etc.),
+``{ip-address}`` with the IP address and ``{netmask}`` with the netmask of the
+public or cluster network. For example:
+
+.. prompt:: bash $
+
+ sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
+
+.. tip:: If you run Ceph Metadata Servers on the same Ceph Node as the
+ Ceph OSD Daemons, you can consolidate the public network configuration step.
+
+
+Ceph Networks
+=============
+
+To configure Ceph networks, you must add a network configuration to the
+``[global]`` section of the configuration file. Our 5-minute Quick Start
+provides a trivial Ceph configuration file that assumes one public network
+with client and server on the same network and subnet. Ceph functions just fine
+with a public network only. However, Ceph allows you to establish much more
+specific criteria, including multiple IP network and subnet masks for your
+public network. You can also establish a separate cluster network to handle OSD
+heartbeat, object replication and recovery traffic. Don't confuse the IP
+addresses you set in your configuration with the public-facing IP addresses
+network clients may use to access your service. Typical internal IP networks are
+often ``192.168.0.0`` or ``10.0.0.0``.
+
+.. tip:: If you specify more than one IP address and subnet mask for
+ either the public or the cluster network, the subnets within the network
+ must be capable of routing to each other. Additionally, make sure you
+ include each IP address/subnet in your IP tables and open ports for them
+ as necessary.
+
+.. note:: Ceph uses `CIDR`_ notation for subnets (e.g., ``10.0.0.0/24``).
+
+When you have configured your networks, you may restart your cluster or restart
+each daemon. Ceph daemons bind dynamically, so you do not have to restart the
+entire cluster at once if you change your network configuration.
+
+
+Public Network
+--------------
+
+To configure a public network, add the following option to the ``[global]``
+section of your Ceph configuration file.
+
+.. code-block:: ini
+
+ [global]
+ # ... elided configuration
+ public_network = {public-network/netmask}
+
+.. _cluster-network:
+
+Cluster Network
+---------------
+
+If you declare a cluster network, OSDs will route heartbeat, object replication
+and recovery traffic over the cluster network. This may improve performance
+compared to using a single network. To configure a cluster network, add the
+following option to the ``[global]`` section of your Ceph configuration file.
+
+.. code-block:: ini
+
+ [global]
+ # ... elided configuration
+ cluster_network = {cluster-network/netmask}
+
+We prefer that the cluster network is **NOT** reachable from the public network
+or the Internet for added security.
+
+IPv4/IPv6 Dual Stack Mode
+-------------------------
+
+If you want to run in an IPv4/IPv6 dual stack mode and want to define your public and/or
+cluster networks, then you need to specify both your IPv4 and IPv6 networks for each:
+
+.. code-block:: ini
+
+ [global]
+ # ... elided configuration
+ public_network = {IPv4 public-network/netmask}, {IPv6 public-network/netmask}
+
+This is so that Ceph can find a valid IP address for both address families.
+
+If you want just an IPv4 or an IPv6 stack environment, then make sure you set
+the ``ms_bind_ipv4`` and ``ms_bind_ipv6`` options correctly.
+
+.. note::
+ Binding to IPv4 is enabled by default, so if you just add the option to bind to IPv6
+ you'll actually put yourself into dual stack mode. If you want just IPv6, then disable IPv4 and
+ enable IPv6. See `Bind`_ below.
+
+Ceph Daemons
+============
+
+Monitor daemons are each configured to bind to a specific IP address. These
+addresses are normally configured by your deployment tool. Other components
+in the Ceph cluster discover the monitors via the ``mon host`` configuration
+option, normally specified in the ``[global]`` section of the ``ceph.conf`` file.
+
+.. code-block:: ini
+
+ [global]
+ mon_host = 10.0.0.2, 10.0.0.3, 10.0.0.4
+
+The ``mon_host`` value can be a list of IP addresses or a name that is
+looked up via DNS. In the case of a DNS name with multiple A or AAAA
+records, all records are probed in order to discover a monitor. Once
+one monitor is reached, all other current monitors are discovered, so
+the ``mon host`` configuration option only needs to be sufficiently up
+to date such that a client can reach one monitor that is currently online.
+
+The MGR, OSD, and MDS daemons will bind to any available address and
+do not require any special configuration. However, it is possible to
+specify a specific IP address for them to bind to with the ``public
+addr`` (and/or, in the case of OSD daemons, the ``cluster addr``)
+configuration option. For example,
+
+.. code-block:: ini
+
+ [osd.0]
+ public_addr = {host-public-ip-address}
+ cluster_addr = {host-cluster-ip-address}
+
+.. topic:: One NIC OSD in a Two Network Cluster
+
+ Generally, we do not recommend deploying an OSD host with a single network interface in a
+ cluster with two networks. However, you may accomplish this by forcing the
+ OSD host to operate on the public network by adding a ``public_addr`` entry
+ to the ``[osd.n]`` section of the Ceph configuration file, where ``n``
+ refers to the ID of the OSD with one network interface. Additionally, the public
+ network and cluster network must be able to route traffic to each other,
+ which we don't recommend for security reasons.
+
+
+Network Config Settings
+=======================
+
+Network configuration settings are not required. Ceph assumes a public network
+with all hosts operating on it unless you specifically configure a cluster
+network.
+
+
+Public Network
+--------------
+
+The public network configuration allows you to specifically define IP addresses
+and subnets for the public network. You may specifically assign static IP
+addresses or override ``public_network`` settings using the ``public_addr``
+setting for a specific daemon.
+
+.. confval:: public_network
+.. confval:: public_addr
+
+Cluster Network
+---------------
+
+The cluster network configuration allows you to declare a cluster network, and
+specifically define IP addresses and subnets for the cluster network. You may
+specifically assign static IP addresses or override ``cluster_network``
+settings using the ``cluster_addr`` setting for specific OSD daemons.
+
+
+.. confval:: cluster_network
+.. confval:: cluster_addr
+
+Bind
+----
+
+Bind settings set the default port ranges Ceph OSD and MDS daemons use. The
+default range is ``6800:7300``. Ensure that your `IP Tables`_ configuration
+allows you to use the configured port range.
+
+You may also enable Ceph daemons to bind to IPv6 addresses instead of IPv4
+addresses.
+
+.. confval:: ms_bind_port_min
+.. confval:: ms_bind_port_max
+.. confval:: ms_bind_ipv4
+.. confval:: ms_bind_ipv6
+.. confval:: public_bind_addr
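+
+For example, to widen the port range that daemons may bind to (the values are
+illustrative):
+
+.. code-block:: ini
+
+    [global]
+    ms_bind_port_min = 6800
+    ms_bind_port_max = 7568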
+
+TCP
+---
+
+Ceph disables TCP buffering by default.
+
+.. confval:: ms_tcp_nodelay
+.. confval:: ms_tcp_rcvbuf
+
+General Settings
+----------------
+
+.. confval:: ms_type
+.. confval:: ms_async_op_threads
+.. confval:: ms_initial_backoff
+.. confval:: ms_max_backoff
+.. confval:: ms_die_on_bad_msg
+.. confval:: ms_dispatch_throttle_bytes
+.. confval:: ms_inject_socket_failures
+
+
+.. _Scalability and High Availability: ../../../architecture#scalability-and-high-availability
+.. _Hardware Recommendations - Networks: ../../../start/hardware-recommendations#networks
+.. _hardware recommendations: ../../../start/hardware-recommendations
+.. _Monitor / OSD Interaction: ../mon-osd-interaction
+.. _Message Signatures: ../auth-config-ref#signatures
+.. _CIDR: https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing
+.. _Nagle's Algorithm: https://en.wikipedia.org/wiki/Nagle's_algorithm
diff --git a/doc/rados/configuration/osd-config-ref.rst b/doc/rados/configuration/osd-config-ref.rst
new file mode 100644
index 000000000..060121200
--- /dev/null
+++ b/doc/rados/configuration/osd-config-ref.rst
@@ -0,0 +1,445 @@
+======================
+ OSD Config Reference
+======================
+
+.. index:: OSD; configuration
+
+You can configure Ceph OSD Daemons in the Ceph configuration file (or in recent
+releases, the central config store), but Ceph OSD
+Daemons can use the default values and a very minimal configuration. A minimal
+Ceph OSD Daemon configuration sets ``host`` and
+uses default values for nearly everything else.
+
+Ceph OSD Daemons are numerically identified in incremental fashion, beginning
+with ``0`` using the following convention. ::
+
+ osd.0
+ osd.1
+ osd.2
+
+In a configuration file, you may specify settings for all Ceph OSD Daemons in
+the cluster by adding configuration settings to the ``[osd]`` section of your
+configuration file. To add settings directly to a specific Ceph OSD Daemon
+(e.g., ``host``), enter it in an OSD-specific section of your configuration
+file. For example:
+
+.. code-block:: ini
+
+ [osd]
+ osd_journal_size = 5120
+
+ [osd.0]
+ host = osd-host-a
+
+ [osd.1]
+ host = osd-host-b
+
+
+.. index:: OSD; config settings
+
+General Settings
+================
+
+The following settings provide a Ceph OSD Daemon's ID, and determine paths to
+data and journals. Ceph deployment scripts typically generate the UUID
+automatically.
+
+.. warning:: **DO NOT** change the default paths for data or journals, as it
+ makes it more problematic to troubleshoot Ceph later.
+
+When using Filestore, the journal size should be at least twice the product of the expected drive
+speed multiplied by ``filestore_max_sync_interval``. However, the most common
+practice is to partition the journal drive (often an SSD), and mount it such
+that Ceph uses the entire partition for the journal.
+
+.. confval:: osd_uuid
+.. confval:: osd_data
+.. confval:: osd_max_write_size
+.. confval:: osd_max_object_size
+.. confval:: osd_client_message_size_cap
+.. confval:: osd_class_dir
+ :default: $libdir/rados-classes
+
+.. index:: OSD; file system
+
+File System Settings
+====================
+Ceph builds and mounts file systems which are used for Ceph OSDs.
+
+``osd_mkfs_options {fs-type}``
+
+:Description: Options used when creating a new Ceph Filestore OSD of type {fs-type}.
+
+:Type: String
+:Default for xfs: ``-f -i 2048``
+:Default for other file systems: {empty string}
+
+For example::
+
+  osd_mkfs_options_xfs = -f -d agcount=24
+
+``osd_mount_options {fs-type}``
+
+:Description: Options used when mounting a Ceph Filestore OSD of type {fs-type}.
+
+:Type: String
+:Default for xfs: ``rw,noatime,inode64``
+:Default for other file systems: ``rw, noatime``
+
+For example::
+
+  osd_mount_options_xfs = rw, noatime, inode64, logbufs=8
+
+
+.. index:: OSD; journal settings
+
+Journal Settings
+================
+
+This section applies only to the older Filestore OSD back end. Since Luminous,
+BlueStore has been the default and preferred back end.
+
+By default, Ceph expects that you will provision a Ceph OSD Daemon's journal at
+the following path, which is usually a symlink to a device or partition::
+
+ /var/lib/ceph/osd/$cluster-$id/journal
+
+When using a single device type (for example, spinning drives), the journals
+should be *colocated*: the logical volume (or partition) should be in the same
+device as the ``data`` logical volume.
+
+When using a mix of fast (SSDs, NVMe) devices with slower ones (like spinning
+drives) it makes sense to place the journal on the faster device, while
+``data`` occupies the slower device fully.
+
+The default ``osd_journal_size`` value is 5120 (5 gigabytes), but it can be
+larger, in which case it will need to be set in the ``ceph.conf`` file.
+A value of 10 gigabytes is common in practice::
+
+ osd_journal_size = 10240
+
+
+.. confval:: osd_journal
+.. confval:: osd_journal_size
+
+See `Journal Config Reference`_ for additional details.
+
+
+Monitor OSD Interaction
+=======================
+
+Ceph OSD Daemons check each other's heartbeats and report to monitors
+periodically. Ceph can use default values in many cases. However, if your
+network has latency issues, you may need to adopt longer intervals. See
+`Configuring Monitor/OSD Interaction`_ for a detailed discussion of heartbeats.
+
+
+Data Placement
+==============
+
+See `Pool & PG Config Reference`_ for details.
+
+
+.. index:: OSD; scrubbing
+
+.. _rados_config_scrubbing:
+
+Scrubbing
+=========
+
+One way that Ceph ensures data integrity is by "scrubbing" placement groups.
+Ceph scrubbing is analogous to ``fsck`` on the object storage layer. Ceph
+generates a catalog of all objects in each placement group and compares each
+primary object to its replicas, ensuring that no objects are missing or
+mismatched. Light scrubbing checks the object size and attributes, and is
+usually done daily. Deep scrubbing reads the data and uses checksums to ensure
+data integrity, and is usually done weekly. The frequencies of both light
+scrubbing and deep scrubbing are determined by the cluster's configuration,
+which is fully under your control and subject to the settings explained below
+in this section.
+
+Although scrubbing is important for maintaining data integrity, it can reduce
+the performance of the Ceph cluster. You can adjust the following settings to
+increase or decrease the frequency and depth of scrubbing operations.
+
+
+.. confval:: osd_max_scrubs
+.. confval:: osd_scrub_begin_hour
+.. confval:: osd_scrub_end_hour
+.. confval:: osd_scrub_begin_week_day
+.. confval:: osd_scrub_end_week_day
+.. confval:: osd_scrub_during_recovery
+.. confval:: osd_scrub_load_threshold
+.. confval:: osd_scrub_min_interval
+.. confval:: osd_scrub_max_interval
+.. confval:: osd_scrub_chunk_min
+.. confval:: osd_scrub_chunk_max
+.. confval:: osd_scrub_sleep
+.. confval:: osd_deep_scrub_interval
+.. confval:: osd_scrub_interval_randomize_ratio
+.. confval:: osd_deep_scrub_stride
+.. confval:: osd_scrub_auto_repair
+.. confval:: osd_scrub_auto_repair_num_errors
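+
+For example, the following sketch restricts scrubbing to overnight hours
+(illustrative values; the hours are interpreted on each OSD host):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_scrub_begin_hour 23
+   ceph config set osd osd_scrub_end_hour 6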
+
+.. index:: OSD; operations settings
+
+Operations
+==========
+
+.. confval:: osd_op_num_shards
+.. confval:: osd_op_num_shards_hdd
+.. confval:: osd_op_num_shards_ssd
+.. confval:: osd_op_queue
+.. confval:: osd_op_queue_cut_off
+.. confval:: osd_client_op_priority
+.. confval:: osd_recovery_op_priority
+.. confval:: osd_scrub_priority
+.. confval:: osd_requested_scrub_priority
+.. confval:: osd_snap_trim_priority
+.. confval:: osd_snap_trim_sleep
+.. confval:: osd_snap_trim_sleep_hdd
+.. confval:: osd_snap_trim_sleep_ssd
+.. confval:: osd_snap_trim_sleep_hybrid
+.. confval:: osd_op_thread_timeout
+.. confval:: osd_op_complaint_time
+.. confval:: osd_op_history_size
+.. confval:: osd_op_history_duration
+.. confval:: osd_op_log_threshold
+.. confval:: osd_op_thread_suicide_timeout
+.. note:: See https://old.ceph.com/planet/dealing-with-some-osd-timeouts/ for
+ more on ``osd_op_thread_suicide_timeout``. Be aware that this is a link to a
+ reworking of a blog post from 2017, and that its conclusion will direct you
+ back to this page "for more information".
+
+.. _dmclock-qos:
+
+QoS Based on mClock
+-------------------
+
+Ceph's use of mClock is now more refined and can be used by following the
+steps as described in `mClock Config Reference`_.
+
+Core Concepts
+`````````````
+
+Ceph's QoS support is implemented using a queueing scheduler
+based on `the dmClock algorithm`_. This algorithm allocates the I/O
+resources of the Ceph cluster in proportion to weights, and enforces
+the constraints of minimum reservation and maximum limitation, so that
+the services can compete for the resources fairly. Currently the
+*mclock_scheduler* operation queue divides Ceph services involving I/O
+resources into the following buckets:
+
+- client op: the IOPS issued by clients
+- osd subop: the IOPS issued by the primary OSD
+- snap trim: snap-trimming-related requests
+- pg recovery: recovery-related requests
+- pg scrub: scrub-related requests
+
+The resources are partitioned using the following three sets of tags. In other
+words, the share of each type of service is controlled by three tags:
+
+#. reservation: the minimum IOPS allocated for the service.
+#. limitation: the maximum IOPS allocated for the service.
+#. weight: the proportional share of capacity when extra capacity is available
+   or the system is oversubscribed.
+
+In Ceph, operations are graded with a "cost", and the resources allocated
+for serving the various services are consumed by these costs. So, for
+example, the larger a service's reservation, the more resource it is
+guaranteed to possess, as long as it requires it. Assume there are two
+services, recovery and client ops:
+
+- recovery: (r:1, l:5, w:1)
+- client ops: (r:2, l:0, w:9)
+
+The settings above ensure that recovery won't get more than 5
+requests per second serviced, even if it asks for more (see CURRENT
+IMPLEMENTATION NOTE below), and that no other service competes with
+it. But if the clients start to issue a large number of I/O requests,
+they will not exhaust all the I/O resources either: 1 request per second
+is always allocated for recovery jobs as long as there are any such
+requests, so recovery jobs won't be starved even in a cluster under
+high load. In the meantime, client ops enjoy a larger
+portion of the I/O resource because their weight is "9" while their
+competitor's is "1". Because client ops are not clamped by a
+limit setting (l:0), they can make use of all the resources when no
+recovery is ongoing.
+
+CURRENT IMPLEMENTATION NOTE: the current implementation enforces the limit
+values. Therefore, if a service crosses the enforced limit, the op remains
+in the operation queue until the limit is restored.
+
+Subtleties of mClock
+````````````````````
+
+The reservation and limit values have a unit of requests per
+second. The weight, however, does not technically have a unit; the
+weights are relative to one another. So if one class of requests has a
+weight of 1 and another a weight of 9, then requests of the latter class
+should be executed at a 9 to 1 ratio relative to the first class.
+However, that happens only once the reservations are met, and those
+values include the operations executed under the reservation phase.
+
+Even though the weights do not have units, one must be careful in
+choosing their values due to how the algorithm assigns weight tags to
+requests. If the weight is *W*, then for a given class of requests,
+the next one that comes in will have a weight tag of *1/W* plus the
+previous weight tag or the current time, whichever is larger. That
+means that if *W* is sufficiently large, and therefore *1/W* sufficiently
+small, the calculated tag may never be assigned because it will take the
+value of the current time. The ultimate lesson is that values for weight
+should not be too large: they should stay under the number of requests
+one expects to be serviced each second.
+
+Caveats
+```````
+
+There are some factors that can reduce the impact of the mClock op
+queues within Ceph. First, requests to an OSD are sharded by their
+placement group identifier. Each shard has its own mClock queue and
+these queues neither interact nor share information among them. The
+number of shards can be controlled with the configuration options
+:confval:`osd_op_num_shards`, :confval:`osd_op_num_shards_hdd`, and
+:confval:`osd_op_num_shards_ssd`. A lower number of shards will increase the
+impact of the mClock queues, but may have other deleterious effects.
+
+Second, requests are transferred from the operation queue to the
+operation sequencer, in which they go through the phases of
+execution. The operation queue is where mClock resides and mClock
+determines the next op to transfer to the operation sequencer. The
+number of operations allowed in the operation sequencer is a complex
+issue. In general we want to keep enough operations in the sequencer
+so it's always getting work done on some operations while it's waiting
+for disk and network access to complete on other operations. On the
+other hand, once an operation is transferred to the operation
+sequencer, mClock no longer has control over it. Therefore to maximize
+the impact of mClock, we want to keep as few operations in the
+operation sequencer as possible. So we have an inherent tension.
+
+The configuration options that influence the number of operations in
+the operation sequencer are :confval:`bluestore_throttle_bytes`,
+:confval:`bluestore_throttle_deferred_bytes`,
+:confval:`bluestore_throttle_cost_per_io`,
+:confval:`bluestore_throttle_cost_per_io_hdd`, and
+:confval:`bluestore_throttle_cost_per_io_ssd`.
+
+A third factor that affects the impact of the mClock algorithm is that
+we're using a distributed system, where requests are made to multiple
+OSDs and each OSD has (can have) multiple shards. Yet we're currently
+using the mClock algorithm, which is not distributed (note: dmClock is
+the distributed version of mClock).
+
+Various organizations and individuals are currently experimenting with
+mClock as it exists in this code base, along with their own modifications
+to the code base. We hope you'll share your experiences with your
+mClock and dmClock experiments on the ``ceph-devel`` mailing list.
+
+.. confval:: osd_async_recovery_min_cost
+.. confval:: osd_push_per_object_cost
+.. confval:: osd_mclock_scheduler_client_res
+.. confval:: osd_mclock_scheduler_client_wgt
+.. confval:: osd_mclock_scheduler_client_lim
+.. confval:: osd_mclock_scheduler_background_recovery_res
+.. confval:: osd_mclock_scheduler_background_recovery_wgt
+.. confval:: osd_mclock_scheduler_background_recovery_lim
+.. confval:: osd_mclock_scheduler_background_best_effort_res
+.. confval:: osd_mclock_scheduler_background_best_effort_wgt
+.. confval:: osd_mclock_scheduler_background_best_effort_lim
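+
+As a sketch, the recovery and client ops example above could be expressed at
+runtime as follows. Note that, depending on the release, a custom mClock
+profile may need to be active for these values to take effect; see `mClock
+Config Reference`_:
+
+.. prompt:: bash $
+
+   ceph config set osd osd_mclock_scheduler_client_res 2
+   ceph config set osd osd_mclock_scheduler_client_wgt 9
+   ceph config set osd osd_mclock_scheduler_background_recovery_res 1
+   ceph config set osd osd_mclock_scheduler_background_recovery_wgt 1
+   ceph config set osd osd_mclock_scheduler_background_recovery_lim 5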
+
+.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf
+
+.. index:: OSD; backfilling
+
+Backfilling
+===========
+
+When you add Ceph OSD Daemons to a cluster or remove them, CRUSH will
+rebalance the cluster by moving placement groups to or from Ceph OSDs
+to restore balanced utilization. The process of migrating placement groups and
+the objects they contain can reduce the cluster's operational performance
+considerably. To maintain operational performance, Ceph performs this migration
+with 'backfilling', which allows Ceph to set backfill operations to a lower
+priority than requests to read or write data.
+
+
+.. confval:: osd_max_backfills
+.. confval:: osd_backfill_scan_min
+.. confval:: osd_backfill_scan_max
+.. confval:: osd_backfill_retry_interval
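+
+For example, to reduce the impact of backfill on client I/O (a common but
+illustrative tuning):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_max_backfills 1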
+
+.. index:: OSD; osdmap
+
+OSD Map
+=======
+
+OSD maps reflect the OSD daemons operating in the cluster. Over time, the
+number of map epochs increases. Ceph provides some settings to ensure that
+Ceph performs well as the OSD map grows larger.
+
+.. confval:: osd_map_dedup
+.. confval:: osd_map_cache_size
+.. confval:: osd_map_message_max
+
+.. index:: OSD; recovery
+
+Recovery
+========
+
+When the cluster starts or when a Ceph OSD Daemon crashes and restarts, the OSD
+begins peering with other Ceph OSD Daemons before writes can occur. See
+`Monitoring OSDs and PGs`_ for details.
+
+If a Ceph OSD Daemon crashes and comes back online, usually it will be out of
+sync with other Ceph OSD Daemons containing more recent versions of objects in
+the placement groups. When this happens, the Ceph OSD Daemon goes into recovery
+mode and seeks to get the latest copy of the data and bring its map back up to
+date. Depending upon how long the Ceph OSD Daemon was down, the OSD's objects
+and placement groups may be significantly out of date. Also, if a failure domain
+went down (e.g., a rack), more than one Ceph OSD Daemon may come back online at
+the same time. This can make the recovery process time consuming and resource
+intensive.
+
+To maintain operational performance, Ceph performs recovery with limitations on
+the number of recovery requests, threads, and object chunk sizes, which allows
+Ceph to perform well in a degraded state.
+
+.. confval:: osd_recovery_delay_start
+.. confval:: osd_recovery_max_active
+.. confval:: osd_recovery_max_active_hdd
+.. confval:: osd_recovery_max_active_ssd
+.. confval:: osd_recovery_max_chunk
+.. confval:: osd_recovery_max_single_start
+.. confval:: osd_recover_clone_overlap
+.. confval:: osd_recovery_sleep
+.. confval:: osd_recovery_sleep_hdd
+.. confval:: osd_recovery_sleep_ssd
+.. confval:: osd_recovery_sleep_hybrid
+.. confval:: osd_recovery_priority
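+
+For example, to throttle recovery traffic at runtime (illustrative values):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_recovery_max_active 1
+   ceph config set osd osd_recovery_sleep 0.1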
+
+Tiering
+=======
+
+.. confval:: osd_agent_max_ops
+.. confval:: osd_agent_max_low_ops
+
+See `cache target dirty high ratio`_ for details on when the tiering agent
+flushes dirty objects in high-speed mode.
+
+Miscellaneous
+=============
+
+.. confval:: osd_default_notify_timeout
+.. confval:: osd_check_for_log_corruption
+.. confval:: osd_delete_sleep
+.. confval:: osd_delete_sleep_hdd
+.. confval:: osd_delete_sleep_ssd
+.. confval:: osd_delete_sleep_hybrid
+.. confval:: osd_command_max_records
+.. confval:: osd_fast_fail_on_connection_refused
+
+.. _pool: ../../operations/pools
+.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction
+.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
+.. _Pool & PG Config Reference: ../pool-pg-config-ref
+.. _Journal Config Reference: ../journal-ref
+.. _cache target dirty high ratio: ../../operations/pools#cache-target-dirty-high-ratio
+.. _mClock Config Reference: ../mclock-config-ref
diff --git a/doc/rados/configuration/pool-pg-config-ref.rst b/doc/rados/configuration/pool-pg-config-ref.rst
new file mode 100644
index 000000000..902c80346
--- /dev/null
+++ b/doc/rados/configuration/pool-pg-config-ref.rst
@@ -0,0 +1,46 @@
+.. _rados_config_pool_pg_crush_ref:
+
+======================================
+ Pool, PG and CRUSH Config Reference
+======================================
+
+.. index:: pools; configuration
+
+Ceph uses default values to determine how many placement groups (PGs) will be
+assigned to each pool. We recommend overriding some of the defaults.
+Specifically, we recommend setting a pool's replica size and overriding the
+default number of placement groups. You can set these values when running
+`pool`_ commands. You can also override the defaults by adding new ones in the
+``[global]`` section of your Ceph configuration file.
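+
+For example, a pool's replica count and PG count can be overridden at runtime
+(``mypool`` is a hypothetical pool name):
+
+.. prompt:: bash $
+
+   ceph osd pool set mypool size 3
+   ceph osd pool set mypool pg_num 256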
+
+
+.. literalinclude:: pool-pg.conf
+ :language: ini
+
+.. confval:: mon_max_pool_pg_num
+.. confval:: mon_pg_stuck_threshold
+.. confval:: mon_pg_warn_min_per_osd
+.. confval:: mon_pg_warn_min_objects
+.. confval:: mon_pg_warn_min_pool_objects
+.. confval:: mon_pg_check_down_all_threshold
+.. confval:: mon_pg_warn_max_object_skew
+.. confval:: mon_delta_reset_interval
+.. confval:: osd_crush_chooseleaf_type
+.. confval:: osd_crush_initial_weight
+.. confval:: osd_pool_default_crush_rule
+.. confval:: osd_pool_erasure_code_stripe_unit
+.. confval:: osd_pool_default_size
+.. confval:: osd_pool_default_min_size
+.. confval:: osd_pool_default_pg_num
+.. confval:: osd_pool_default_pgp_num
+.. confval:: osd_pool_default_pg_autoscale_mode
+.. confval:: osd_pool_default_flags
+.. confval:: osd_max_pgls
+.. confval:: osd_min_pg_log_entries
+.. confval:: osd_max_pg_log_entries
+.. confval:: osd_default_data_pool_replay_window
+.. confval:: osd_max_pg_per_osd_hard_ratio
+
+.. _pool: ../../operations/pools
+.. _Monitoring OSDs and PGs: ../../operations/monitoring-osd-pg#peering
+.. _Weighting Bucket Items: ../../operations/crush-map#weightingbucketitems
diff --git a/doc/rados/configuration/pool-pg.conf b/doc/rados/configuration/pool-pg.conf
new file mode 100644
index 000000000..6765d37df
--- /dev/null
+++ b/doc/rados/configuration/pool-pg.conf
@@ -0,0 +1,21 @@
+[global]
+
+ # By default, Ceph makes three replicas of RADOS objects. If you want
+ # to maintain four copies of an object--a primary copy and three
+ # replica copies--reset the default value as shown in
+ # 'osd_pool_default_size'. If you want to allow Ceph to accept an I/O
+ # operation to a degraded PG, set 'osd_pool_default_min_size' to a
+ # number less than the 'osd_pool_default_size' value.
+
+ osd_pool_default_size = 3 # Write an object three times.
+ osd_pool_default_min_size = 2 # Accept an I/O operation to a PG that has two copies of an object.
+
+ # Note: by default, PG autoscaling is enabled and this value is used only
+ # in specific circumstances. It is, however, still recommended to set it.
+ # Ensure you have a realistic number of placement groups. We recommend
+ # approximately 100 per OSD. E.g., total number of OSDs multiplied by 100
+ # divided by the number of replicas (i.e., 'osd_pool_default_size'). So for
+ # 10 OSDs and 'osd_pool_default_size' = 4, we'd recommend approximately
+ # (100 * 10) / 4 = 250.
+ # Always use the nearest power of two.
+ osd_pool_default_pg_num = 256
diff --git a/doc/rados/configuration/storage-devices.rst b/doc/rados/configuration/storage-devices.rst
new file mode 100644
index 000000000..c83e87da7
--- /dev/null
+++ b/doc/rados/configuration/storage-devices.rst
@@ -0,0 +1,93 @@
+=================
+ Storage Devices
+=================
+
+There are several Ceph daemons in a storage cluster:
+
+.. _rados_configuration_storage-devices_ceph_osd:
+
+* **Ceph OSDs** (Object Storage Daemons) store most of the data
+ in Ceph. Usually each OSD is backed by a single storage device.
+ This can be a traditional hard disk (HDD) or a solid state disk
+ (SSD). OSDs can also be backed by a combination of devices: for
+ example, a HDD for most data and an SSD (or partition of an
+ SSD) for some metadata. The number of OSDs in a cluster is
+ usually a function of the amount of data to be stored, the size
+ of each storage device, and the level and type of redundancy
+ specified (replication or erasure coding).
+* **Ceph Monitor** daemons manage critical cluster state. This
+ includes cluster membership and authentication information.
+ Small clusters require only a few gigabytes of storage to hold
+ the monitor database. In large clusters, however, the monitor
+ database can reach sizes of tens of gigabytes to hundreds of
+ gigabytes.
+* **Ceph Manager** daemons run alongside monitor daemons, providing
+ additional monitoring and providing interfaces to external
+ monitoring and management systems.
+
+.. _rados_config_storage_devices_osd_backends:
+
+OSD Back Ends
+=============
+
+There are two ways that OSDs manage the data they store. As of the Luminous
+12.2.z release, the default (and recommended) back end is *BlueStore*. Prior
+to the Luminous release, the default (and only) back end was *Filestore*.
+
+.. _rados_config_storage_devices_bluestore:
+
+BlueStore
+---------
+
+BlueStore is a special-purpose storage back end designed specifically for
+managing data on disk for Ceph OSD workloads. BlueStore's design is based on
+a decade of experience supporting and managing Filestore OSDs.
+
+Key BlueStore features include:
+
+* Direct management of storage devices. BlueStore consumes raw block devices or
+ partitions. This avoids intervening layers of abstraction (such as local file
+ systems like XFS) that can limit performance or add complexity.
+* Metadata management with RocksDB. RocksDB's key/value database is embedded
+ in order to manage internal metadata, including the mapping of object
+ names to block locations on disk.
+* Full data and metadata checksumming. By default, all data and
+ metadata written to BlueStore is protected by one or more
+ checksums. No data or metadata is read from disk or returned
+ to the user without being verified.
+* Inline compression. Data can be optionally compressed before being written
+ to disk.
+* Multi-device metadata tiering. BlueStore allows its internal
+ journal (write-ahead log) to be written to a separate, high-speed
+ device (like an SSD, NVMe, or NVDIMM) for increased performance. If
+ a significant amount of faster storage is available, internal
+ metadata can be stored on the faster device.
+* Efficient copy-on-write. RBD and CephFS snapshots rely on a
+ copy-on-write *clone* mechanism that is implemented efficiently in
+ BlueStore. This results in efficient I/O both for regular snapshots
+ and for erasure-coded pools (which rely on cloning to implement
+ efficient two-phase commits).
+
+For more information, see :doc:`bluestore-config-ref` and :doc:`/rados/operations/bluestore-migration`.
+
+FileStore
+---------
+.. warning:: Filestore has been deprecated in the Reef release and is no longer supported.
+
+
+FileStore is the legacy approach to storing objects in Ceph. It
+relies on a standard file system (normally XFS) in combination with a
+key/value database (traditionally LevelDB, now RocksDB) for some
+metadata.
+
+FileStore is well-tested and widely used in production. However, it
+suffers from many performance deficiencies due to its overall design
+and its reliance on a traditional file system for object data storage.
+
+Although FileStore is capable of functioning on most POSIX-compatible
+file systems (including btrfs and ext4), we recommend that only the
+XFS file system be used with Ceph. Both btrfs and ext4 have known bugs and
+deficiencies and their use may lead to data loss. By default, all Ceph
+provisioning tools use XFS.
+
+For more information, see :doc:`filestore-config-ref`.
diff --git a/doc/rados/index.rst b/doc/rados/index.rst
new file mode 100644
index 000000000..b506b7a7e
--- /dev/null
+++ b/doc/rados/index.rst
@@ -0,0 +1,81 @@
+.. _rados-index:
+
+======================
+ Ceph Storage Cluster
+======================
+
+The :term:`Ceph Storage Cluster` is the foundation for all Ceph deployments.
+Based upon :abbr:`RADOS (Reliable Autonomic Distributed Object Store)`, Ceph
+Storage Clusters consist of several types of daemons:
+
+ 1. a :term:`Ceph OSD Daemon` (OSD) stores data as objects on a storage node.
+ 2. a :term:`Ceph Monitor` (MON) maintains a master copy of the cluster map.
+ 3. a :term:`Ceph Manager` (MGR) daemon provides additional monitoring and
+    interfaces to external monitoring and management systems.
+
+A Ceph Storage Cluster might contain thousands of storage nodes. A
+minimal system has at least one Ceph Monitor and two Ceph OSD
+Daemons for data replication.
+
+The Ceph File System, Ceph Object Storage and Ceph Block Devices read data from
+and write data to the Ceph Storage Cluster.
+
+.. container:: columns-3
+
+ .. container:: column
+
+ .. raw:: html
+
+ <h3>Config and Deploy</h3>
+
+ Ceph Storage Clusters have a few required settings, but most configuration
+ settings have default values. A typical deployment uses a deployment tool
+ to define a cluster and bootstrap a monitor. See :ref:`cephadm` for details.
+
+ .. toctree::
+ :maxdepth: 2
+
+ Configuration <configuration/index>
+
+ .. container:: column
+
+ .. raw:: html
+
+ <h3>Operations</h3>
+
+ Once you have deployed a Ceph Storage Cluster, you may begin operating
+ your cluster.
+
+ .. toctree::
+ :maxdepth: 2
+
+ Operations <operations/index>
+
+ .. toctree::
+ :maxdepth: 1
+
+ Man Pages <man/index>
+
+ .. toctree::
+ :hidden:
+
+ troubleshooting/index
+
+ .. container:: column
+
+ .. raw:: html
+
+ <h3>APIs</h3>
+
+ Most Ceph deployments use `Ceph Block Devices`_, `Ceph Object Storage`_ and/or the
+ `Ceph File System`_. You may also develop applications that talk directly to
+ the Ceph Storage Cluster.
+
+ .. toctree::
+ :maxdepth: 2
+
+ APIs <api/index>
+
+.. _Ceph Block Devices: ../rbd/
+.. _Ceph File System: ../cephfs/
+.. _Ceph Object Storage: ../radosgw/
+.. _Deployment: ../cephadm/
diff --git a/doc/rados/man/index.rst b/doc/rados/man/index.rst
new file mode 100644
index 000000000..bac56aa46
--- /dev/null
+++ b/doc/rados/man/index.rst
@@ -0,0 +1,32 @@
+=======================
+ Object Store Manpages
+=======================
+
+.. toctree::
+ :maxdepth: 1
+
+ ../../man/8/ceph-volume.rst
+ ../../man/8/ceph-volume-systemd.rst
+ ../../man/8/ceph.rst
+ ../../man/8/ceph-authtool.rst
+ ../../man/8/ceph-clsinfo.rst
+ ../../man/8/ceph-conf.rst
+ ../../man/8/ceph-debugpack.rst
+ ../../man/8/ceph-dencoder.rst
+ ../../man/8/ceph-mon.rst
+ ../../man/8/ceph-osd.rst
+ ../../man/8/ceph-kvstore-tool.rst
+ ../../man/8/ceph-run.rst
+ ../../man/8/ceph-syn.rst
+ ../../man/8/crushdiff.rst
+ ../../man/8/crushtool.rst
+ ../../man/8/librados-config.rst
+ ../../man/8/monmaptool.rst
+ ../../man/8/osdmaptool.rst
+ ../../man/8/rados.rst
+
+
+.. toctree::
+ :hidden:
+
+ ../../man/8/ceph-post-file.rst
diff --git a/doc/rados/operations/add-or-rm-mons.rst b/doc/rados/operations/add-or-rm-mons.rst
new file mode 100644
index 000000000..3688bb798
--- /dev/null
+++ b/doc/rados/operations/add-or-rm-mons.rst
@@ -0,0 +1,458 @@
+.. _adding-and-removing-monitors:
+
+==========================
+ Adding/Removing Monitors
+==========================
+
+It is possible to add monitors to a running cluster as long as redundancy is
+maintained. To bootstrap a monitor, see `Manual Deployment`_ or `Monitor
+Bootstrap`_.
+
+.. _adding-monitors:
+
+Adding Monitors
+===============
+
+Ceph monitors serve as the single source of truth for the cluster map. It is
+possible to run a cluster with only one monitor, but for a production cluster
+it is recommended to have at least three monitors provisioned and in quorum.
+Ceph monitors use a variation of the `Paxos`_ algorithm to maintain consensus
+about maps and about other critical information across the cluster. Due to the
+nature of Paxos, Ceph is able to maintain quorum (and thus establish
+consensus) only if a majority of the monitors are ``active``.
+
+It is best to run an odd number of monitors. This is because a cluster that is
+running an odd number of monitors is more resilient than a cluster running an
+even number. For example, in a two-monitor deployment, no failures can be
+tolerated if quorum is to be maintained; in a three-monitor deployment, one
+failure can be tolerated; in a four-monitor deployment, one failure can be
+tolerated; and in a five-monitor deployment, two failures can be tolerated. In
+general, a cluster running an odd number of monitors is best because it avoids
+what is called the *split brain* phenomenon. In short, Ceph is able to operate
+only if a majority of monitors are ``active`` and able to communicate with each
+other (for example, there must be a single monitor, two out of two monitors,
+two out of three monitors, three out of five monitors, or the like).
+
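+To check how many monitors are currently in the quorum, either of the
+following standard commands can be used:
+
+.. prompt:: bash $
+
+   ceph mon stat
+   ceph quorum_status --format json-pretty
+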
+For small or non-critical deployments of multi-node Ceph clusters, it is
+recommended to deploy three monitors. For larger clusters or for clusters that
+are intended to survive a double failure, it is recommended to deploy five
+monitors. Only in rare circumstances is there any justification for deploying
+seven or more monitors.
+
+It is possible to run a monitor on the same host that is running an OSD.
+However, this approach has disadvantages. For example, ``fsync`` issues with
+the kernel might weaken performance, and monitor and OSD daemons might become
+inactive at the same time and cause disruption if the node crashes, is
+rebooted, or is taken down for maintenance. Because of these risks, it is
+instead recommended to run monitors and managers on dedicated hosts.
+
+.. note:: A *majority* of monitors in your cluster must be able to
+ reach each other in order for quorum to be established.
+
+Deploying your Hardware
+-----------------------
+
+Some operators choose to add a new monitor host at the same time that they add
+a new monitor. For details on the minimum recommendations for monitor hardware,
+see `Hardware Recommendations`_. Before adding a monitor host to the cluster,
+make sure that there is an up-to-date version of Linux installed.
+
+Add the newly installed monitor host to a rack in your cluster, connect the
+host to the network, and make sure that the host has network connectivity.
+
+.. _Hardware Recommendations: ../../../start/hardware-recommendations
+
+Installing the Required Software
+--------------------------------
+
+In manually deployed clusters, it is necessary to install Ceph packages
+manually. For details, see `Installing Packages`_. Configure SSH so that it can
+be used by a user that has passwordless authentication and root permissions.
+
+.. _Installing Packages: ../../../install/install-storage-cluster
+
+
+.. _Adding a Monitor (Manual):
+
+Adding a Monitor (Manual)
+-------------------------
+
+The procedure in this section creates a ``ceph-mon`` data directory, retrieves
+both the monitor map and the monitor keyring, and adds a ``ceph-mon`` daemon to
+the cluster. The procedure might result in a Ceph cluster that contains only
+two monitor daemons. To add more monitors until there are enough ``ceph-mon``
+daemons to establish quorum, repeat the procedure.
+
+This is a good point at which to define the new monitor's ``id``. Monitors have
+often been named with single letters (``a``, ``b``, ``c``, etc.), but you are
+free to define the ``id`` however you see fit. In this document, ``{mon-id}``
+refers to the ``id`` exclusive of the ``mon.`` prefix: for example, if
+``mon.a`` has been chosen as the ``id`` of a monitor, then ``{mon-id}`` is
+``a``.
+
+#. Create a data directory on the machine that will host the new monitor:
+
+ .. prompt:: bash $
+
+ ssh {new-mon-host}
+ sudo mkdir /var/lib/ceph/mon/ceph-{mon-id}
+
+#. Create a temporary directory ``{tmp}`` that will contain the files needed
+ during this procedure. This directory should be different from the data
+ directory created in the previous step. Because this is a temporary
+ directory, it can be removed after the procedure is complete:
+
+ .. prompt:: bash $
+
+ mkdir {tmp}
+
+#. Retrieve the keyring for your monitors (``{tmp}`` is the path to the
+ retrieved keyring and ``{key-filename}`` is the name of the file that
+ contains the retrieved monitor key):
+
+ .. prompt:: bash $
+
+ ceph auth get mon. -o {tmp}/{key-filename}
+
+#. Retrieve the monitor map (``{tmp}`` is the path to the retrieved monitor map
+ and ``{map-filename}`` is the name of the file that contains the retrieved
+ monitor map):
+
+ .. prompt:: bash $
+
+ ceph mon getmap -o {tmp}/{map-filename}
+
+#. Prepare the monitor's data directory, which was created in the first step.
+ The following command must specify the path to the monitor map (so that
+ information about a quorum of monitors and their ``fsid``\s can be
+ retrieved) and specify the path to the monitor keyring:
+
+ .. prompt:: bash $
+
+ sudo ceph-mon -i {mon-id} --mkfs --monmap {tmp}/{map-filename} --keyring {tmp}/{key-filename}
+
+#. Start the new monitor. It will automatically join the cluster. To provide
+ information to the daemon about which address to bind to, use either the
+ ``--public-addr {ip}`` option or the ``--public-network {network}`` option.
+ For example:
+
+ .. prompt:: bash $
+
+ ceph-mon -i {mon-id} --public-addr {ip:port}
+
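+Here is a hypothetical end-to-end illustration of the procedure above, adding
+a monitor with a ``{mon-id}`` of ``d`` (the hostname, temporary paths, and IP
+address below are example values only):
+
+.. prompt:: bash $
+
+   ssh mon-host04
+   sudo mkdir /var/lib/ceph/mon/ceph-d
+   mkdir /tmp/mon-add
+   ceph auth get mon. -o /tmp/mon-add/keyring
+   ceph mon getmap -o /tmp/mon-add/monmap
+   sudo ceph-mon -i d --mkfs --monmap /tmp/mon-add/monmap --keyring /tmp/mon-add/keyring
+   ceph-mon -i d --public-addr 10.0.0.4:6789
+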
+.. _removing-monitors:
+
+Removing Monitors
+=================
+
+When monitors are removed from a cluster, it is important to remember
+that Ceph monitors use Paxos to maintain consensus about the cluster
+map. Such consensus is possible only if the number of monitors is sufficient
+to establish quorum.
+
+
+.. _Removing a Monitor (Manual):
+
+Removing a Monitor (Manual)
+---------------------------
+
+The procedure in this section removes a ``ceph-mon`` daemon from the cluster.
+The procedure might result in a Ceph cluster that contains a number of monitors
+insufficient to maintain quorum, so plan carefully. When replacing an old
+monitor with a new monitor, add the new monitor first, wait for quorum to be
+established, and then remove the old monitor. This ensures that quorum is not
+lost.
+
+
+#. Stop the monitor:
+
+ .. prompt:: bash $
+
+ service ceph -a stop mon.{mon-id}
+
+#. Remove the monitor from the cluster:
+
+ .. prompt:: bash $
+
+ ceph mon remove {mon-id}
+
+#. Remove the monitor entry from the ``ceph.conf`` file, if such an entry
+   exists.
+
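+#. Optionally, confirm that the monitor has been removed from the monitor map:
+
+   .. prompt:: bash $
+
+      ceph mon dump
+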
+.. _rados-mon-remove-from-unhealthy:
+
+
+Removing Monitors from an Unhealthy Cluster
+-------------------------------------------
+
+The procedure in this section removes a ``ceph-mon`` daemon from an unhealthy
+cluster (for example, a cluster whose monitors are unable to form a quorum).
+
+#. Stop all ``ceph-mon`` daemons on all monitor hosts:
+
+ .. prompt:: bash $
+
+ ssh {mon-host}
+ systemctl stop ceph-mon.target
+
+ Repeat this step on every monitor host.
+
+#. Identify a surviving monitor and log in to the monitor's host:
+
+ .. prompt:: bash $
+
+ ssh {mon-host}
+
+#. Extract a copy of the ``monmap`` file by running a command of the following
+ form:
+
+ .. prompt:: bash $
+
+ ceph-mon -i {mon-id} --extract-monmap {map-path}
+
+ Here is a more concrete example. In this example, ``hostname`` is the
+ ``{mon-id}`` and ``/tmp/monmap`` is the ``{map-path}``:
+
+ .. prompt:: bash $
+
+ ceph-mon -i `hostname` --extract-monmap /tmp/monmap
+
+#. Remove the non-surviving or otherwise problematic monitors:
+
+ .. prompt:: bash $
+
+ monmaptool {map-path} --rm {mon-id}
+
+ For example, suppose that there are three monitors |---| ``mon.a``, ``mon.b``,
+ and ``mon.c`` |---| and that only ``mon.a`` will survive:
+
+ .. prompt:: bash $
+
+ monmaptool /tmp/monmap --rm b
+ monmaptool /tmp/monmap --rm c
+
+#. Inject the surviving map (from which the problematic monitors have now
+ been removed) into each surviving monitor:
+
+ .. prompt:: bash $
+
+ ceph-mon -i {mon-id} --inject-monmap {map-path}
+
+ Continuing with the above example, inject a map into monitor ``mon.a`` by
+ running the following command:
+
+ .. prompt:: bash $
+
+ ceph-mon -i a --inject-monmap /tmp/monmap
+
+
+#. Start only the surviving monitors.
+
+#. Verify that the monitors form a quorum by running the command ``ceph -s``.
+
+#. The data directory of the removed monitors is in ``/var/lib/ceph/mon``:
+ either archive this data directory in a safe location or delete this data
+ directory. However, do not delete it unless you are confident that the
+ remaining monitors are healthy and sufficiently redundant. Make sure that
+ there is enough room for the live DB to expand and compact, and make sure
+ that there is also room for an archived copy of the DB. The archived copy
+ can be compressed.
+
+.. _Changing a Monitor's IP address:
+
+Changing a Monitor's IP Address
+===============================
+
+.. important:: Existing monitors are not supposed to change their IP addresses.
+
+Monitors are critical components of a Ceph cluster. The entire system can work
+properly only if the monitors maintain quorum, and quorum can be established
+only if the monitors have discovered each other by means of their IP addresses.
+Ceph has strict requirements on the discovery of monitors.
+
+Although the ``ceph.conf`` file is used by Ceph clients and other Ceph daemons
+to discover monitors, the monitor map is used by monitors to discover each
+other. This is why it is necessary to obtain the current ``monmap`` at the time
+a new monitor is created: as can be seen above in `Adding a Monitor (Manual)`_,
+the ``monmap`` is one of the arguments required by the ``ceph-mon -i {mon-id}
+--mkfs`` command. The following sections explain the consistency requirements
+for Ceph monitors, and also explain a number of safe ways to change a monitor's
+IP address.
+
+
+Consistency Requirements
+------------------------
+
+When a monitor discovers other monitors in the cluster, it always refers to the
+local copy of the monitor map. Using the monitor map instead of using the
+``ceph.conf`` file avoids errors that could break the cluster (for example,
+typos or other slight errors in ``ceph.conf`` when a monitor address or port is
+specified). Because monitors use monitor maps for discovery and because they
+share monitor maps with Ceph clients and other Ceph daemons, the monitor map
+provides monitors with a strict guarantee that their consensus is valid.
+
+Strict consistency also applies to updates to the monmap. As with any other
+updates on the monitor, changes to the monmap always run through a distributed
+consensus algorithm called `Paxos`_. The monitors must agree on each update to
+the monmap, such as adding or removing a monitor, to ensure that each monitor
+in the quorum has the same version of the monmap. Updates to the monmap are
+incremental so that monitors have the latest agreed upon version, and a set of
+previous versions, allowing a monitor that has an older version of the monmap
+to catch up with the current state of the cluster.
+
+There are additional advantages to using the monitor map rather than
+``ceph.conf`` when monitors discover each other. Because ``ceph.conf`` is not
+automatically updated and distributed, its use would bring certain risks:
+monitors might use an outdated ``ceph.conf`` file, might fail to recognize a
+specific monitor, might fall out of quorum, and might develop a situation in
+which `Paxos`_ is unable to accurately ascertain the current state of the
+system. Because of these risks, any changes to an existing monitor's IP address
+must be made with great care.
+
+.. _operations_add_or_rm_mons_changing_mon_ip:
+
+Changing a Monitor's IP address (Preferred Method)
+--------------------------------------------------
+
+If a monitor's IP address is changed only in the ``ceph.conf`` file, there is
+no guarantee that the other monitors in the cluster will receive the update.
+For this reason, the preferred method to change a monitor's IP address is as
+follows: add a new monitor with the desired IP address (as described in `Adding
+a Monitor (Manual)`_), make sure that the new monitor successfully joins the
+quorum, remove the monitor that is using the old IP address, and update the
+``ceph.conf`` file to ensure that clients and other daemons are made aware of
+the new monitor's IP address.
+
+For example, suppose that there are three monitors in place::
+
+ [mon.a]
+ host = host01
+ addr = 10.0.0.1:6789
+ [mon.b]
+ host = host02
+ addr = 10.0.0.2:6789
+ [mon.c]
+ host = host03
+ addr = 10.0.0.3:6789
+
+To change ``mon.c`` so that its name is ``host04`` and its IP address is
+``10.0.0.4``: (1) follow the steps in `Adding a Monitor (Manual)`_ to add a new
+monitor ``mon.d``, (2) make sure that ``mon.d`` is running before removing
+``mon.c`` or else quorum will be broken, and (3) follow the steps in `Removing
+a Monitor (Manual)`_ to remove ``mon.c``. To move all three monitors to new IP
+addresses, repeat this process.
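+
+After these steps, the monitor section of ``ceph.conf`` might read as follows
+(``mon.d``, ``host04``, and ``10.0.0.4`` being the hypothetical replacement
+values from the example above)::
+
+    [mon.a]
+    host = host01
+    addr = 10.0.0.1:6789
+    [mon.b]
+    host = host02
+    addr = 10.0.0.2:6789
+    [mon.d]
+    host = host04
+    addr = 10.0.0.4:6789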
+
+Changing a Monitor's IP address (Advanced Method)
+-------------------------------------------------
+
+There are cases in which the method outlined in :ref:`Changing a Monitor's IP
+address (Preferred Method) <operations_add_or_rm_mons_changing_mon_ip>` cannot
+be used. For example, it might be necessary to move the cluster's monitors to a
+different network, to a different part of the datacenter, or to a different
+datacenter altogether. It is still possible to change the monitors' IP
+addresses, but a different method must be used.
+
+For such cases, a new monitor map with updated IP addresses for every monitor
+in the cluster must be generated and injected on each monitor. Although this
+method is not particularly easy, such a major migration is unlikely to be a
+routine task. As stated at the beginning of this section, existing monitors are
+not supposed to change their IP addresses.
+
+Continue with the monitor configuration in the example from :ref:`Changing a
+Monitor's IP address (Preferred Method)
+<operations_add_or_rm_mons_changing_mon_ip>`. Suppose that all of the monitors
+are to be moved from the ``10.0.0.x`` range to the ``10.1.0.x`` range, and that
+these networks are unable to communicate. Carry out the following procedure:
+
+#. Retrieve the monitor map (``{tmp}`` is the path to the retrieved monitor
+ map, and ``{filename}`` is the name of the file that contains the retrieved
+ monitor map):
+
+ .. prompt:: bash $
+
+ ceph mon getmap -o {tmp}/{filename}
+
+#. Check the contents of the monitor map:
+
+ .. prompt:: bash $
+
+ monmaptool --print {tmp}/{filename}
+
+ ::
+
+ monmaptool: monmap file {tmp}/{filename}
+ epoch 1
+ fsid 224e376d-c5fe-4504-96bb-ea6332a19e61
+ last_changed 2012-12-17 02:46:41.591248
+ created 2012-12-17 02:46:41.591248
+ 0: 10.0.0.1:6789/0 mon.a
+ 1: 10.0.0.2:6789/0 mon.b
+ 2: 10.0.0.3:6789/0 mon.c
+
+#. Remove the existing monitors from the monitor map:
+
+ .. prompt:: bash $
+
+ monmaptool --rm a --rm b --rm c {tmp}/{filename}
+
+ ::
+
+ monmaptool: monmap file {tmp}/{filename}
+ monmaptool: removing a
+ monmaptool: removing b
+ monmaptool: removing c
+ monmaptool: writing epoch 1 to {tmp}/{filename} (0 monitors)
+
+#. Add the new monitor locations to the monitor map:
+
+ .. prompt:: bash $
+
+ monmaptool --add a 10.1.0.1:6789 --add b 10.1.0.2:6789 --add c 10.1.0.3:6789 {tmp}/{filename}
+
+ ::
+
+ monmaptool: monmap file {tmp}/{filename}
+ monmaptool: writing epoch 1 to {tmp}/{filename} (3 monitors)
+
+#. Check the new contents of the monitor map:
+
+ .. prompt:: bash $
+
+ monmaptool --print {tmp}/{filename}
+
+ ::
+
+ monmaptool: monmap file {tmp}/{filename}
+ epoch 1
+ fsid 224e376d-c5fe-4504-96bb-ea6332a19e61
+ last_changed 2012-12-17 02:46:41.591248
+ created 2012-12-17 02:46:41.591248
+ 0: 10.1.0.1:6789/0 mon.a
+ 1: 10.1.0.2:6789/0 mon.b
+ 2: 10.1.0.3:6789/0 mon.c
+
+At this point, we assume that the monitors (and stores) have been installed at
+the new location. Next, propagate the modified monitor map to the new monitors,
+and inject the modified monitor map into each new monitor.
+
+#. Make sure all of your monitors have been stopped. Never inject into a
+ monitor while the monitor daemon is running.
+
+#. Inject the monitor map:
+
+ .. prompt:: bash $
+
+ ceph-mon -i {mon-id} --inject-monmap {tmp}/{filename}
+
+#. Restart all of the monitors.
+
+Migration to the new location is now complete. The monitors should operate
+successfully.
+
+
+
+.. _Manual Deployment: ../../../install/manual-deployment
+.. _Monitor Bootstrap: ../../../dev/mon-bootstrap
+.. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science)
+
+.. |---| unicode:: U+2014 .. EM DASH
+ :trim:
diff --git a/doc/rados/operations/add-or-rm-osds.rst b/doc/rados/operations/add-or-rm-osds.rst
new file mode 100644
index 000000000..1a6621148
--- /dev/null
+++ b/doc/rados/operations/add-or-rm-osds.rst
@@ -0,0 +1,419 @@
+======================
+ Adding/Removing OSDs
+======================
+
+When a cluster is up and running, it is possible to add or remove OSDs.
+
+Adding OSDs
+===========
+
+OSDs can be added to a cluster in order to expand the cluster's capacity and
+resilience. Typically, an OSD is a Ceph ``ceph-osd`` daemon running on one
+storage drive within a host machine. But if your host machine has multiple
+storage drives, you may map one ``ceph-osd`` daemon for each drive on the
+machine.
+
+It's a good idea to check the capacity of your cluster so that you know when it
+approaches its capacity limits. If your cluster has reached its ``near full``
+ratio, then you should add OSDs to expand your cluster's capacity.
+
+.. warning:: Do not add an OSD after your cluster has reached its ``full
+ ratio``. OSD failures that occur after the cluster reaches its ``near full
+ ratio`` might cause the cluster to exceed its ``full ratio``.
+
+
+Deploying your Hardware
+-----------------------
+
+If you are also adding a new host when adding a new OSD, see `Hardware
+Recommendations`_ for details on minimum recommendations for OSD hardware. To
+add an OSD host to your cluster, begin by making sure that an appropriate
+version of Linux has been installed on the host machine and that all initial
+preparations for your storage drives have been carried out. For details, see
+`Filesystem Recommendations`_.
+
+Next, add your OSD host to a rack in your cluster, connect the host to the
+network, and ensure that the host has network connectivity. For details, see
+`Network Configuration Reference`_.
+
+
+.. _Hardware Recommendations: ../../../start/hardware-recommendations
+.. _Filesystem Recommendations: ../../configuration/filesystem-recommendations
+.. _Network Configuration Reference: ../../configuration/network-config-ref
+
+Installing the Required Software
+--------------------------------
+
+If your cluster has been manually deployed, you will need to install Ceph
+software packages manually. For details, see `Installing Ceph (Manual)`_.
+Configure SSH for the appropriate user to have both passwordless authentication
+and root permissions.
+
+.. _Installing Ceph (Manual): ../../../install
+
+
+Adding an OSD (Manual)
+----------------------
+
+The following procedure sets up a ``ceph-osd`` daemon, configures this OSD to
+use one drive, and configures the cluster to distribute data to the OSD. If
+your host machine has multiple drives, you may add an OSD for each drive on the
+host by repeating this procedure.
+
+As the following procedure will demonstrate, adding an OSD involves creating a
+metadata directory for it, configuring a data storage drive, adding the OSD to
+the cluster, and then adding it to the CRUSH map.
+
+When you add the OSD to the CRUSH map, you will need to consider the weight you
+assign to the new OSD. Since storage drive capacities increase over time, newer
+OSD hosts are likely to have larger hard drives than the older hosts in the
+cluster have and therefore might have greater weight as well.
+
+.. tip:: Ceph works best with uniform hardware across pools. It is possible to
+ add drives of dissimilar size and then adjust their weights accordingly.
+ However, for best performance, consider a CRUSH hierarchy that has drives of
+ the same type and size. It is better to add larger drives uniformly to
+ existing hosts. This can be done incrementally, replacing smaller drives
+ each time the new drives are added.
+
+#. Create the new OSD by running a command of the following form. If you opt
+ not to specify a UUID in this command, the UUID will be set automatically
+ when the OSD starts up. The OSD number, which is needed for subsequent
+ steps, is found in the command's output:
+
+ .. prompt:: bash $
+
+ ceph osd create [{uuid} [{id}]]
+
+ If the optional parameter ``{id}`` is specified, it will be used as the OSD
+ ID. However, if the ID number is already in use, the command will fail.
+
+ .. warning:: Explicitly specifying the ``{id}`` parameter is not
+ recommended. IDs are allocated as an array, and any skipping of entries
+ consumes extra memory. This memory consumption can become significant if
+ there are large gaps or if clusters are large. By leaving the ``{id}``
+ parameter unspecified, we ensure that Ceph uses the smallest ID number
+ available and that these problems are avoided.
+
+#. Create the default directory for your new OSD by running commands of the
+ following form:
+
+ .. prompt:: bash $
+
+ ssh {new-osd-host}
+ sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
+
+#. If the OSD will be created on a drive other than the OS drive, prepare it
+ for use with Ceph. Run commands of the following form:
+
+ .. prompt:: bash $
+
+ ssh {new-osd-host}
+ sudo mkfs -t {fstype} /dev/{drive}
+ sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
+
+#. Initialize the OSD data directory by running commands of the following form:
+
+ .. prompt:: bash $
+
+ ssh {new-osd-host}
+ ceph-osd -i {osd-num} --mkfs --mkkey
+
+ Make sure that the directory is empty before running ``ceph-osd``.
+
+#. Register the OSD authentication key by running a command of the following
+ form:
+
+ .. prompt:: bash $
+
+ ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring
+
+ This presentation of the command has ``ceph-{osd-num}`` in the listed path
+ because many clusters have the name ``ceph``. However, if your cluster name
+ is not ``ceph``, then the string ``ceph`` in ``ceph-{osd-num}`` needs to be
+ replaced with your cluster name. For example, if your cluster name is
+ ``cluster1``, then the path in the command should be
+ ``/var/lib/ceph/osd/cluster1-{osd-num}/keyring``.
+
+#. Add the OSD to the CRUSH map by running the following command. This allows
+ the OSD to begin receiving data. The ``ceph osd crush add`` command can add
+ OSDs to the CRUSH hierarchy wherever you want. If you specify one or more
+ buckets, the command places the OSD in the most specific of those buckets,
+ and it moves that bucket underneath any other buckets that you have
+ specified. **Important:** If you specify only the root bucket, the command
+ will attach the OSD directly to the root, but CRUSH rules expect OSDs to be
+ inside of hosts. If the OSDs are not inside hosts, the OSDs will likely not
+ receive any data.
+
+ .. prompt:: bash $
+
+ ceph osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...]
+
+ Note that there is another way to add a new OSD to the CRUSH map: decompile
+ the CRUSH map, add the OSD to the device list, add the host as a bucket (if
+ it is not already in the CRUSH map), add the device as an item in the host,
+ assign the device a weight, recompile the CRUSH map, and set the CRUSH map.
+ For details, see `Add/Move an OSD`_. This alternative is rarely necessary
+ with recent releases (as of the Reef release).
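+
+ As a concrete, hypothetical example of the ``ceph osd crush add`` command
+ above, a new 1 TiB OSD with ID ``123`` on host ``node1`` might be added as
+ follows (by convention, the CRUSH weight commonly reflects the device's
+ capacity in TiB):
+
+ .. prompt:: bash $
+
+    ceph osd crush add osd.123 1.0 host=node1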
+
+
+.. _rados-replacing-an-osd:
+
+Replacing an OSD
+----------------
+
+.. note:: If the procedure in this section does not work for you, try the
+ instructions in the ``cephadm`` documentation:
+ :ref:`cephadm-replacing-an-osd`.
+
+Sometimes OSDs need to be replaced: for example, when a disk fails, or when an
+administrator wants to reprovision OSDs with a new back end (perhaps when
+switching from Filestore to BlueStore). Replacing an OSD differs from `Removing
+the OSD`_ in that the replaced OSD's ID and CRUSH map entry must be kept intact
+after the OSD is destroyed for replacement.
+
+
+#. Make sure that it is safe to destroy the OSD:
+
+ .. prompt:: bash $
+
+ while ! ceph osd safe-to-destroy osd.{id} ; do sleep 10 ; done
+
+#. Destroy the OSD:
+
+ .. prompt:: bash $
+
+ ceph osd destroy {id} --yes-i-really-mean-it
+
+#. *Optional*: If the disk that you plan to use is not a new disk and has been
+ used before for other purposes, zap the disk:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm zap /dev/sdX
+
+#. Prepare the disk for replacement by using the ID of the OSD that was
+ destroyed in previous steps:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm prepare --osd-id {id} --data /dev/sdX
+
+#. Finally, activate the OSD:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm activate {id} {fsid}
+
+Alternatively, instead of carrying out the final two steps (preparing the disk
+and activating the OSD), you can re-create the OSD by running a single command
+of the following form:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm create --osd-id {id} --data /dev/sdX
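+
+In the activation step above, ``{fsid}`` is the OSD's own fsid rather than the
+cluster fsid. If it is not at hand, it appears in the output of the following
+command:
+
+.. prompt:: bash $
+
+   ceph-volume lvm list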
+
+Starting the OSD
+----------------
+
+After an OSD is added to Ceph, the OSD is in the cluster. However, until it is
+started, the OSD is considered ``down`` and ``in``. The OSD is not running and
+will be unable to receive data. To start an OSD, either run ``service ceph``
+from your admin host or run a command of the following form to start the OSD
+from its host machine:
+
+ .. prompt:: bash $
+
+ sudo systemctl start ceph-osd@{osd-num}
+
+After the OSD is started, it is considered ``up`` and ``in``.
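+
+To confirm that the new OSD is reported as ``up``, run the following command:
+
+.. prompt:: bash $
+
+   ceph osd tree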
+
+Observing the Data Migration
+----------------------------
+
+After the new OSD has been added to the CRUSH map, Ceph begins rebalancing the
+cluster by migrating placement groups (PGs) to the new OSD. To observe this
+process by using the `ceph`_ tool, run the following command:
+
+ .. prompt:: bash $
+
+ ceph -w
+
+Or:
+
+ .. prompt:: bash $
+
+ watch ceph status
+
+The PG states will first change from ``active+clean`` to ``active, some
+degraded objects`` and then return to ``active+clean`` when migration
+completes. When you are finished observing, press Ctrl-C to exit.
+
+.. _Add/Move an OSD: ../crush-map#addosd
+.. _ceph: ../monitoring
+
+
+Removing OSDs (Manual)
+======================
+
+It is possible to remove an OSD manually while the cluster is running: you
+might want to do this in order to reduce the size of the cluster or when
+replacing hardware. Typically, an OSD is a Ceph ``ceph-osd`` daemon running on
+one storage drive within a host machine. Alternatively, if your host machine
+has multiple storage drives, you might need to remove multiple ``ceph-osd``
+daemons: one daemon for each drive on the machine.
+
+.. warning:: Before you begin the process of removing an OSD, make sure that
+ your cluster is not near its ``full ratio``. Otherwise the act of removing
+ OSDs might cause the cluster to reach or exceed its ``full ratio``.
+
+
+Taking the OSD ``out`` of the Cluster
+-------------------------------------
+
+OSDs are typically ``up`` and ``in`` before they are removed from the cluster.
+Before the OSD can be removed from the cluster, the OSD must be taken ``out``
+of the cluster so that Ceph can begin rebalancing and copying its data to other
+OSDs. To take an OSD ``out`` of the cluster, run a command of the following
+form:
+
+ .. prompt:: bash $
+
+ ceph osd out {osd-num}
+
+
+Observing the Data Migration
+----------------------------
+
+After the OSD has been taken ``out`` of the cluster, Ceph begins rebalancing
+the cluster by migrating placement groups out of the OSD that was removed. To
+observe this process by using the `ceph`_ tool, run the following command:
+
+ .. prompt:: bash $
+
+ ceph -w
+
+The PG states will change from ``active+clean`` to ``active, some degraded
+objects`` and will then return to ``active+clean`` when migration completes.
+When you are finished observing, press Ctrl-C to exit.
+
+.. note:: Under certain conditions, the action of taking ``out`` an OSD
+ might lead CRUSH to encounter a corner case in which some PGs remain stuck
+ in the ``active+remapped`` state. This problem sometimes occurs in small
+ clusters with few hosts (for example, in a small testing cluster). To
+ address this problem, mark the OSD ``in`` by running a command of the
+ following form:
+
+ .. prompt:: bash $
+
+ ceph osd in {osd-num}
+
+ After the OSD has come back to its initial state, do not mark the OSD
+ ``out`` again. Instead, set the OSD's weight to ``0`` by running a command
+ of the following form:
+
+ .. prompt:: bash $
+
+ ceph osd crush reweight osd.{osd-num} 0
+
+ After the OSD has been reweighted, observe the data migration and confirm
+ that it has completed successfully. The difference between marking an OSD
+ ``out`` and reweighting the OSD to ``0`` has to do with the bucket that
+ contains the OSD. When an OSD is marked ``out``, the weight of the bucket is
+ not changed. But when an OSD is reweighted to ``0``, the weight of the
+ bucket is updated (namely, the weight of the OSD is subtracted from the
+ overall weight of the bucket). When operating small clusters, it can
+ sometimes be preferable to use the above reweight command.
+
+
+Stopping the OSD
+----------------
+
+After you take an OSD ``out`` of the cluster, the OSD might still be running.
+In such a case, the OSD is ``up`` and ``out``. Before it is removed from the
+cluster, the OSD must be stopped by running commands of the following form:
+
+ .. prompt:: bash $
+
+ ssh {osd-host}
+ sudo systemctl stop ceph-osd@{osd-num}
+
+After the OSD has been stopped, it is ``down``.
+
+
+Removing the OSD
+----------------
+
+The following procedure removes an OSD from the cluster map, removes the OSD's
+authentication key, removes the OSD from the OSD map, and removes the OSD from
+the ``ceph.conf`` file. If your host has multiple drives, it might be necessary
+to remove an OSD from each drive by repeating this procedure.
+
+#. Begin by having the cluster forget the OSD. This step removes the OSD from
+ the CRUSH map, removes the OSD's authentication key, and removes the OSD
+ from the OSD map. (The :ref:`purge subcommand <ceph-admin-osd>` was
+ introduced in Luminous. For older releases, see :ref:`the procedure linked
+ here <ceph_osd_purge_procedure_pre_luminous>`.):
+
+ .. prompt:: bash $
+
+ ceph osd purge {id} --yes-i-really-mean-it
+
+
+#. Navigate to the host where the master copy of the cluster's
+ ``ceph.conf`` file is kept:
+
+ .. prompt:: bash $
+
+ ssh {admin-host}
+ cd /etc/ceph
+ vim ceph.conf
+
+#. Remove the OSD entry from your ``ceph.conf`` file (if such an entry
+ exists)::
+
+ [osd.1]
+ host = {hostname}
+
+#. Copy the updated ``ceph.conf`` file from the location on the host where the
+ master copy of the cluster's ``ceph.conf`` is kept to the ``/etc/ceph``
+ directory of the other hosts in your cluster.
+
+.. _ceph_osd_purge_procedure_pre_luminous:
+
+If your Ceph cluster is older than Luminous, you will be unable to use the
+``ceph osd purge`` command. Instead, carry out the following procedure:
+
+#. Remove the OSD from the CRUSH map so that it no longer receives data (for
+ more details, see `Remove an OSD`_):
+
+ .. prompt:: bash $
+
+ ceph osd crush remove {name}
+
+ Instead of removing the OSD from the CRUSH map, you might opt for one of two
+ alternatives: (1) decompile the CRUSH map, remove the OSD from the device
+ list, and remove the device from the host bucket; (2) remove the host bucket
+ from the CRUSH map (provided that it is in the CRUSH map and that you intend
+ to remove the host), recompile the map, and set it.
+
+
+#. Remove the OSD authentication key:
+
+ .. prompt:: bash $
+
+ ceph auth del osd.{osd-num}
+
+#. Remove the OSD:
+
+ .. prompt:: bash $
+
+ ceph osd rm {osd-num}
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph osd rm 1
+
+.. _Remove an OSD: ../crush-map#removeosd
diff --git a/doc/rados/operations/balancer.rst b/doc/rados/operations/balancer.rst
new file mode 100644
index 000000000..aa4eab93c
--- /dev/null
+++ b/doc/rados/operations/balancer.rst
@@ -0,0 +1,221 @@
+.. _balancer:
+
+Balancer Module
+=======================
+
+The *balancer* can optimize the allocation of placement groups (PGs) across
+OSDs in order to achieve a balanced distribution. The balancer can operate
+either automatically or in a supervised fashion.
+
+
+Status
+------
+
+To check the current status of the balancer, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer status
+
+
+Automatic balancing
+-------------------
+
+When the balancer is in ``upmap`` mode, the automatic balancing feature is
+enabled by default. For more details, see :ref:`upmap`. To disable the
+balancer, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer off
+
+The balancer mode can be changed from ``upmap`` mode to ``crush-compat`` mode.
+``crush-compat`` mode is backward compatible with older clients. In
+``crush-compat`` mode, the balancer automatically makes small changes to the
+data distribution in order to ensure that OSDs are utilized equally.
+
+
+Throttling
+----------
+
+If the cluster is degraded (that is, if an OSD has failed and the system hasn't
+healed itself yet), then the balancer will not make any adjustments to the PG
+distribution.
+
+When the cluster is healthy, the balancer will incrementally move a small
+fraction of unbalanced PGs in order to improve distribution. This fraction
+will not exceed a certain threshold that defaults to 5%. To adjust this
+``target_max_misplaced_ratio`` threshold setting, run the following command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr target_max_misplaced_ratio .07 # 7%
+
+The balancer sleeps between runs. To set the number of seconds for this
+interval of sleep, run the following command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr mgr/balancer/sleep_interval 60
+
+To set the time of day (in HHMM format) at which automatic balancing begins,
+run the following command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr mgr/balancer/begin_time 0000
+
+To set the time of day (in HHMM format) at which automatic balancing ends, run
+the following command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr mgr/balancer/end_time 2359
+
+Automatic balancing can be restricted to certain days of the week. To restrict
+it to a specific day of the week or later (as with crontab, ``0`` is Sunday,
+``1`` is Monday, and so on), run the following command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr mgr/balancer/begin_weekday 0
+
+To restrict automatic balancing to a specific day of the week or earlier
+(again, ``0`` is Sunday, ``1`` is Monday, and so on), run the following
+command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr mgr/balancer/end_weekday 6
+
+Automatic balancing can be restricted to certain pools. By default, the value
+of this setting is an empty string, so that all pools are automatically
+balanced. To restrict automatic balancing to specific pools, retrieve their
+numeric pool IDs (by running the :command:`ceph osd pool ls detail` command),
+and then run the following command:
+
+ .. prompt:: bash $
+
+ ceph config set mgr mgr/balancer/pool_ids 1,2,3
+
+
+Modes
+-----
+
+There are two supported balancer modes:
+
+#. **crush-compat**. This mode uses the compat weight-set feature (introduced
+ in Luminous) to manage an alternative set of weights for devices in the
+ CRUSH hierarchy. When the balancer is operating in this mode, the normal
+ weights should remain set to the size of the device in order to reflect the
+ target amount of data intended to be stored on the device. The balancer will
+ then optimize the weight-set values, adjusting them up or down in small
+ increments, in order to achieve a distribution that matches the target
+ distribution as closely as possible. (Because PG placement is a pseudorandom
+ process, it is subject to a natural amount of variation; optimizing the
+ weights serves to counteract that natural variation.)
+
+ Note that this mode is *fully backward compatible* with older clients: when
+ an OSD Map and CRUSH map are shared with older clients, Ceph presents the
+ optimized weights as the "real" weights.
+
+ The primary limitation of this mode is that the balancer cannot handle
+ multiple CRUSH hierarchies with different placement rules if the subtrees of
+ the hierarchy share any OSDs. (Such sharing of OSDs is not typical and,
+ because of the difficulty of managing the space utilization on the shared
+ OSDs, is generally not recommended.)
+
+#. **upmap**. In Luminous and later releases, the OSDMap can store explicit
+ mappings for individual OSDs as exceptions to the normal CRUSH placement
+ calculation. These ``upmap`` entries provide fine-grained control over the
+ PG mapping. This balancer mode optimizes the placement of individual PGs in
+ order to achieve a balanced distribution. In most cases, the resulting
+ distribution is nearly perfect: that is, there is an equal number of PGs on
+ each OSD (±1 PG, since the total number might not divide evenly).
+
+ To use ``upmap``, all clients must be Luminous or newer.
+
+The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by
+running the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer mode crush-compat
+
+Supervised optimization
+-----------------------
+
+Supervised use of the balancer can be understood in terms of three distinct
+phases:
+
+#. building a plan
+#. evaluating the quality of the data distribution, either for the current PG
+ distribution or for the PG distribution that would result after executing a
+ plan
+#. executing the plan
+
+To evaluate the current distribution, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer eval
+
+To evaluate the distribution for a single pool, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer eval <pool-name>
+
+To see the evaluation in greater detail, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer eval-verbose ...
+
+To instruct the balancer to generate a plan (using the currently configured
+mode), make up a name (any useful identifying string) for the plan, and run the
+following command:
+
+ .. prompt:: bash $
+
+ ceph balancer optimize <plan-name>
+
+To see the contents of a plan, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer show <plan-name>
+
+To display all plans, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer ls
+
+To discard an old plan, run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer rm <plan-name>
+
+To see currently recorded plans, examine the output of the following status
+command:
+
+ .. prompt:: bash $
+
+ ceph balancer status
+
+To evaluate the distribution that would result from executing a specific plan,
+run the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer eval <plan-name>
+
+If a plan is expected to improve the distribution (that is, the plan's score is
+lower than the current cluster state's score), you can execute that plan by
+running the following command:
+
+ .. prompt:: bash $
+
+ ceph balancer execute <plan-name>
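+
+Putting the supervised workflow together, a typical session (``myplan`` is an
+arbitrary plan name) might look like this:
+
+ .. prompt:: bash $
+
+    ceph balancer eval
+    ceph balancer optimize myplan
+    ceph balancer show myplan
+    ceph balancer eval myplan
+    ceph balancer execute myplan
+    ceph balancer rm myplan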
diff --git a/doc/rados/operations/bluestore-migration.rst b/doc/rados/operations/bluestore-migration.rst
new file mode 100644
index 000000000..d24782c46
--- /dev/null
+++ b/doc/rados/operations/bluestore-migration.rst
@@ -0,0 +1,357 @@
+.. _rados_operations_bluestore_migration:
+
+=====================
+ BlueStore Migration
+=====================
+.. warning:: Filestore has been deprecated in the Reef release and is no longer supported.
+ Please migrate to BlueStore.
+
+Each OSD must be formatted as either Filestore or BlueStore. However, a Ceph
+cluster can operate with a mixture of both Filestore OSDs and BlueStore OSDs.
+Because BlueStore is superior to Filestore in performance and robustness, and
+because Filestore is not supported by Ceph releases beginning with Reef, users
+deploying Filestore OSDs should transition to BlueStore. There are several
+strategies for making the transition to BlueStore.
+
+BlueStore is so different from Filestore that an individual OSD cannot be
+converted in place. Instead, the conversion process must use either (1) the
+cluster's normal replication and healing support, or (2) tools and strategies
+that copy OSD content from an old (Filestore) device to a new (BlueStore) one.
+
+Deploying new OSDs with BlueStore
+=================================
+
+Use BlueStore when deploying new OSDs (for example, when the cluster is
+expanded). Because this is the default behavior, no specific change is
+needed.
+
+Similarly, use BlueStore for any OSDs that have been reprovisioned after
+a failed drive was replaced.
+
+Converting existing OSDs
+========================
+
+"Mark-``out``" replacement
+--------------------------
+
+The simplest approach is to verify that the cluster is healthy and
+then follow these steps for each Filestore OSD in succession: mark the OSD
+``out``, wait for the data to replicate across the cluster, reprovision the OSD,
+mark the OSD back ``in``, and wait for recovery to complete before proceeding
+to the next OSD. This approach is easy to automate, but it entails unnecessary
+data migration that carries costs in time and SSD wear.
+
+#. Identify a Filestore OSD to replace::
+
+ ID=<osd-id-number>
+ DEVICE=<disk-device>
+
+ #. Determine whether a given OSD is Filestore or BlueStore:
+
+ .. prompt:: bash $
+
+ ceph osd metadata $ID | grep osd_objectstore
+
+ #. Get a current count of Filestore and BlueStore OSDs:
+
+ .. prompt:: bash $
+
+ ceph osd count-metadata osd_objectstore
+
+#. Mark a Filestore OSD ``out``:
+
+ .. prompt:: bash $
+
+ ceph osd out $ID
+
+#. Wait for the data to migrate off this OSD:
+
+ .. prompt:: bash $
+
+ while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
+
+#. Stop the OSD:
+
+ .. prompt:: bash $
+
+ systemctl kill ceph-osd@$ID
+
+ .. _osd_id_retrieval:
+
+#. Note which device the OSD is using:
+
+ .. prompt:: bash $
+
+ mount | grep /var/lib/ceph/osd/ceph-$ID
+
+#. Unmount the OSD:
+
+ .. prompt:: bash $
+
+ umount /var/lib/ceph/osd/ceph-$ID
+
+#. Destroy the OSD's data. Be *EXTREMELY CAREFUL*! These commands will destroy
+ the contents of the device; you must be certain that the data on the device is
+ not needed (in other words, that the cluster is healthy) before proceeding:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm zap $DEVICE
+
+#. Tell the cluster that the OSD has been destroyed (and that a new OSD can be
+ reprovisioned with the same OSD ID):
+
+ .. prompt:: bash $
+
+ ceph osd destroy $ID --yes-i-really-mean-it
+
+#. Provision a BlueStore OSD in place by using the same OSD ID. This requires
+ you to identify which device to wipe, and to make certain that you target
+ the correct and intended device, using the information that was retrieved in
+ the :ref:`"Note which device the OSD is using" <osd_id_retrieval>` step. BE
+ CAREFUL! Note that you may need to modify these commands when dealing with
+ hybrid OSDs:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm create --bluestore --data $DEVICE --osd-id $ID
+
+#. Repeat.
+
+You may opt to (1) have the balancing of the replacement BlueStore OSD take
+place concurrently with the draining of the next Filestore OSD, or instead
+(2) follow the same procedure for multiple OSDs in parallel. In either case,
+however, you must ensure that the cluster is fully clean (in other words, that
+all data has all replicas) before destroying any OSDs. If you opt to reprovision
+multiple OSDs in parallel, be **very** careful to destroy OSDs only within a
+single CRUSH failure domain (for example, ``host`` or ``rack``). Failure to
+satisfy this requirement will reduce the redundancy and availability of your
+data and increase the risk of data loss (or even guarantee data loss).
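+
+Because this approach is easy to automate, a minimal, hypothetical sketch of
+the per-OSD loop (assuming ``$ID`` and ``$DEVICE`` have been set as in the
+procedure above, and that the cluster is healthy) might look like this:
+
+.. prompt:: bash $
+
+   ceph osd out $ID
+   while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
+   systemctl kill ceph-osd@$ID
+   umount /var/lib/ceph/osd/ceph-$ID
+   ceph-volume lvm zap $DEVICE
+   ceph osd destroy $ID --yes-i-really-mean-it
+   ceph-volume lvm create --bluestore --data $DEVICE --osd-id $ID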
+
+Advantages:
+
+* Simple.
+* Can be done on a device-by-device basis.
+* No spare devices or hosts are required.
+
+Disadvantages:
+
+* Data is copied over the network twice: once to another OSD in the cluster (to
+ maintain the specified number of replicas), and again back to the
+ reprovisioned BlueStore OSD.
+
+"Whole host" replacement
+------------------------
+
+If you have a spare host in the cluster, or sufficient free space to evacuate
+an entire host for use as a spare, then the conversion can be done on a
+host-by-host basis so that each stored copy of the data is migrated only once.
+
+To use this approach, you need an empty host that has no OSDs provisioned.
+There are two ways to do this: either by using a new, empty host that is not
+yet part of the cluster, or by offloading data from an existing host that is
+already part of the cluster.
+
+Using a new, empty host
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Ideally the host will have roughly the same capacity as each of the other hosts
+you will be converting. Add the host to the CRUSH hierarchy, but do not attach
+it to the root:
+
+
+.. prompt:: bash $
+
+ NEWHOST=<empty-host-name>
+ ceph osd crush add-bucket $NEWHOST host
+
+Make sure that Ceph packages are installed on the new host.
+
+Using an existing host
+^^^^^^^^^^^^^^^^^^^^^^
+
+If you would like to use an existing host that is already part of the cluster,
+and if there is sufficient free space on that host so that all of its data can
+be migrated off to other cluster hosts, you can do the following (instead of
+using a new, empty host):
+
+.. prompt:: bash $
+
+ OLDHOST=<existing-cluster-host-to-offload>
+ ceph osd crush unlink $OLDHOST default
+
+where "default" is the immediate ancestor in the CRUSH map. (For
+smaller clusters with unmodified configurations this will normally
+be "default", but it might instead be a rack name.) You should now
+see the host at the top of the OSD tree output with no parent:
+
+.. prompt:: bash $
+
+ ceph osd tree
+
+::
+
+ ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -5 0 host oldhost
+ 10 ssd 1.00000 osd.10 up 1.00000 1.00000
+ 11 ssd 1.00000 osd.11 up 1.00000 1.00000
+ 12 ssd 1.00000 osd.12 up 1.00000 1.00000
+ -1 3.00000 root default
+ -2 3.00000 host foo
+ 0 ssd 1.00000 osd.0 up 1.00000 1.00000
+ 1 ssd 1.00000 osd.1 up 1.00000 1.00000
+ 2 ssd 1.00000 osd.2 up 1.00000 1.00000
+ ...
+
+If everything looks good, jump directly to the :ref:`"Wait for the data
+migration to complete" <bluestore_data_migration_step>` step below and proceed
+from there to clean up the old OSDs.
+
+Migration process
+^^^^^^^^^^^^^^^^^
+
+If you're using a new host, start at :ref:`the first step
+<bluestore_migration_process_first_step>`. If you're using an existing host,
+jump to :ref:`this step <bluestore_data_migration_step>`.
+
+.. _bluestore_migration_process_first_step:
+
+#. Provision new BlueStore OSDs for all devices:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm create --bluestore --data /dev/$DEVICE
+
+#. Verify that the new OSDs have joined the cluster:
+
+ .. prompt:: bash $
+
+ ceph osd tree
+
+ You should see the new host ``$NEWHOST`` with all of the OSDs beneath
+ it, but the host should *not* be nested beneath any other node in the
+ hierarchy (like ``root default``). For example, if ``newhost`` is
+ the empty host, you might see something like::
+
+ $ ceph osd tree
+ ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -5 0 host newhost
+ 10 ssd 1.00000 osd.10 up 1.00000 1.00000
+ 11 ssd 1.00000 osd.11 up 1.00000 1.00000
+ 12 ssd 1.00000 osd.12 up 1.00000 1.00000
+ -1 3.00000 root default
+ -2 3.00000 host oldhost1
+ 0 ssd 1.00000 osd.0 up 1.00000 1.00000
+ 1 ssd 1.00000 osd.1 up 1.00000 1.00000
+ 2 ssd 1.00000 osd.2 up 1.00000 1.00000
+ ...
+
+#. Identify the first target host to convert:
+
+ .. prompt:: bash $
+
+ OLDHOST=<existing-cluster-host-to-convert>
+
+#. Swap the new host into the old host's position in the cluster:
+
+ .. prompt:: bash $
+
+ ceph osd crush swap-bucket $NEWHOST $OLDHOST
+
+ At this point all data on ``$OLDHOST`` will begin migrating to the OSDs on
+ ``$NEWHOST``. If there is a difference between the total capacity of the
+ old hosts and the total capacity of the new hosts, you may also see some
+ data migrate to or from other nodes in the cluster. Provided that the hosts
+ are similarly sized, however, this will be a relatively small amount of
+ data.
+
+ .. _bluestore_data_migration_step:
+
+#. Wait for the data migration to complete:
+
+ .. prompt:: bash $
+
+ while ! ceph osd safe-to-destroy $(ceph osd ls-tree $OLDHOST); do sleep 60 ; done
+
+#. Stop all old OSDs on the now-empty ``$OLDHOST``:
+
+ .. prompt:: bash $
+
+ ssh $OLDHOST
+ systemctl kill ceph-osd.target
+ umount /var/lib/ceph/osd/ceph-*
+
+#. Destroy and purge the old OSDs:
+
+ .. prompt:: bash $
+
+ for osd in `ceph osd ls-tree $OLDHOST`; do
+ ceph osd purge $osd --yes-i-really-mean-it
+ done
+
+#. Wipe the old OSDs. This requires you to identify which devices are to be
+ wiped manually. BE CAREFUL! For each device:
+
+ .. prompt:: bash $
+
+ ceph-volume lvm zap $DEVICE
+
+#. Use the now-empty host as the new host, and repeat:
+
+ .. prompt:: bash $
+
+ NEWHOST=$OLDHOST
+
+Advantages:
+
+* Data is copied over the network only once.
+* An entire host's OSDs are converted at once.
+* Can be parallelized, to make possible the conversion of multiple hosts at the same time.
+* No host involved in this process needs to have a spare device.
+
+Disadvantages:
+
+* A spare host is required.
+* An entire host's worth of OSDs will be migrating data at a time. This
+ is likely to impact overall cluster performance.
+* All migrated data still makes one full hop over the network.
+
+Per-OSD device copy
+-------------------
+A single logical OSD can be converted by using the ``copy`` function
+included in ``ceph-objectstore-tool``. This requires that the host have one or more free
+devices to provision a new, empty BlueStore OSD. For
+example, if each host in your cluster has twelve OSDs, then you need a
+thirteenth unused OSD so that each OSD can be converted before the
+previous OSD is reclaimed to convert the next OSD.
+
+Caveats:
+
+* This approach requires that we prepare an empty BlueStore OSD but that we do not allocate
+ a new OSD ID to it. The ``ceph-volume`` tool does not support such an operation. **IMPORTANT:**
+ because the setup of *dmcrypt* is closely tied to the identity of the OSD, this approach does not
+ work with encrypted OSDs.
+
+* The device must be manually partitioned.
+
+* An unsupported user-contributed script that demonstrates this process may be found here:
+ https://github.com/ceph/ceph/blob/master/src/script/contrib/ceph-migrate-bluestore.bash
+
+Advantages:
+
+* Provided that the ``noout`` or the ``norecover``/``norebalance`` flags are
+ set on the OSD or the cluster while the conversion process is underway,
+ little or no data migrates over the network during the conversion.
+
+Disadvantages:
+
+* Tooling is not fully implemented, supported, or documented.
+
+* Each host must have an appropriate spare or empty device for staging.
+
+* The OSD is offline during the conversion, which means new writes to PGs
+ with the OSD in their acting set may not be ideally redundant until the
+ subject OSD comes up and recovers. This increases the risk of data
+ loss due to an overlapping failure. However, if another OSD fails before
+ conversion and startup have completed, the original Filestore OSD can be
+ started to provide access to its original data.
diff --git a/doc/rados/operations/cache-tiering.rst b/doc/rados/operations/cache-tiering.rst
new file mode 100644
index 000000000..127b0141f
--- /dev/null
+++ b/doc/rados/operations/cache-tiering.rst
@@ -0,0 +1,557 @@
+===============
+ Cache Tiering
+===============
+
+.. warning:: Cache tiering has been deprecated in the Reef release because it
+ has lacked a maintainer for a very long time. This does not mean that it
+ will certainly be removed, but we may choose to remove it without much
+ further notice.
+
+A cache tier provides Ceph Clients with better I/O performance for a subset of
+the data stored in a backing storage tier. Cache tiering involves creating a
+pool of relatively fast/expensive storage devices (e.g., solid state drives)
+configured to act as a cache tier, and a backing pool of either erasure-coded
+or relatively slower/cheaper devices configured to act as an economical storage
+tier. The Ceph objecter handles where to place the objects and the tiering
+agent determines when to flush objects from the cache to the backing storage
+tier. So the cache tier and the backing storage tier are completely transparent
+to Ceph clients.
+
+
+.. ditaa::
+ +-------------+
+ | Ceph Client |
+ +------+------+
+ ^
+ Tiering is |
+ Transparent | Faster I/O
+ to Ceph | +---------------+
+ Client Ops | | |
+ | +----->+ Cache Tier |
+ | | | |
+ | | +-----+---+-----+
+ | | | ^
+ v v | | Active Data in Cache Tier
+ +------+----+--+ | |
+ | Objecter | | |
+ +-----------+--+ | |
+ ^ | | Inactive Data in Storage Tier
+ | v |
+ | +-----+---+-----+
+ | | |
+ +----->| Storage Tier |
+ | |
+ +---------------+
+ Slower I/O
+
+
+The cache tiering agent handles the migration of data between the cache tier
+and the backing storage tier automatically. However, admins have the ability to
+configure how this migration takes place by setting the ``cache-mode``. There are
+two main scenarios:
+
+- **writeback** mode: If the base tier and the cache tier are configured in
+ ``writeback`` mode, Ceph clients receive an ACK from the base tier every time
+ they write data to it. Then the cache tiering agent determines whether
+ ``osd_tier_default_cache_min_write_recency_for_promote`` has been set. If it
+ has been set and the data has been written more than a specified number of
+ times per interval, the data is promoted to the cache tier.
+
+ When Ceph clients need access to data stored in the base tier, the cache
+ tiering agent reads the data from the base tier and returns it to the client.
+ While data is being read from the base tier, the cache tiering agent consults
+ the value of ``osd_tier_default_cache_min_read_recency_for_promote`` and
+ decides whether to promote that data from the base tier to the cache tier.
+ When data has been promoted from the base tier to the cache tier, the Ceph
+ client is able to perform I/O operations on it using the cache tier. This is
+ well-suited for mutable data (for example, photo/video editing, transactional
+ data).
+
+- **readproxy** mode: This mode will use any objects that already
+ exist in the cache tier, but if an object is not present in the
+ cache the request will be proxied to the base tier. This is useful
+ for transitioning from ``writeback`` mode to a disabled cache as it
+ allows the workload to function properly while the cache is drained,
+ without adding any new objects to the cache.
+
+Other cache modes are:
+
+- **readonly** promotes objects to the cache on read operations only; write
+ operations are forwarded to the base tier. This mode is intended for
+ read-only workloads that do not require consistency to be enforced by the
+ storage system. (**Warning**: when objects are updated in the base tier,
+ Ceph makes **no** attempt to sync these updates to the corresponding objects
+ in the cache. Since this mode is considered experimental, a
+ ``--yes-i-really-mean-it`` option must be passed in order to enable it.)
+
+- **none** is used to completely disable caching.
+
+
+A word of caution
+=================
+
+Cache tiering will *degrade* performance for most workloads. Users should use
+extreme caution before using this feature.
+
+* *Workload dependent*: Whether a cache will improve performance is
+ highly dependent on the workload. Because there is a cost
+ associated with moving objects into or out of the cache, it can only
+ be effective when there is a *large skew* in the access pattern in
+ the data set, such that most of the requests touch a small number of
+ objects. The cache pool should be large enough to capture the
+ working set for your workload to avoid thrashing.
+
+* *Difficult to benchmark*: Most benchmarks that users run to measure
+ performance will show terrible performance with cache tiering, in
+  part because very few of them skew requests toward a small set of
+  objects, because it can take a long time for the cache to "warm up,"
+  and because the warm-up cost can be high.
+
+* *Usually slower*: For workloads that are not cache tiering-friendly,
+ performance is often slower than a normal RADOS pool without cache
+ tiering enabled.
+
+* *librados object enumeration*: The librados-level object enumeration
+  API is not meant to be coherent in the presence of a cache tier. If
+ your application is using librados directly and relies on object
+ enumeration, cache tiering will probably not work as expected.
+ (This is not a problem for RGW, RBD, or CephFS.)
+
+* *Complexity*: Enabling cache tiering means that a lot of additional
+ machinery and complexity within the RADOS cluster is being used.
+ This increases the probability that you will encounter a bug in the system
+ that other users have not yet encountered and will put your deployment at a
+ higher level of risk.
+
+Known Good Workloads
+--------------------
+
+* *RGW time-skewed*: If the RGW workload is such that almost all read
+ operations are directed at recently written objects, a simple cache
+ tiering configuration that destages recently written objects from
+ the cache to the base tier after a configurable period can work
+ well.
+
+Known Bad Workloads
+-------------------
+
+The following configurations are *known to work poorly* with cache
+tiering.
+
+* *RBD with replicated cache and erasure-coded base*: This is a common
+ request, but usually does not perform well. Even reasonably skewed
+ workloads still send some small writes to cold objects, and because
+ small writes are not yet supported by the erasure-coded pool, entire
+ (usually 4 MB) objects must be migrated into the cache in order to
+ satisfy a small (often 4 KB) write. Only a handful of users have
+ successfully deployed this configuration, and it only works for them
+ because their data is extremely cold (backups) and they are not in
+ any way sensitive to performance.
+
+* *RBD with replicated cache and base*: RBD with a replicated base
+ tier does better than when the base is erasure coded, but it is
+ still highly dependent on the amount of skew in the workload, and
+ very difficult to validate. The user will need to have a good
+ understanding of their workload and will need to tune the cache
+ tiering parameters carefully.
+
+
+Setting Up Pools
+================
+
+To set up cache tiering, you must have two pools. One will act as the
+backing storage and the other will act as the cache.
+
+
+Setting Up a Backing Storage Pool
+---------------------------------
+
+Setting up a backing storage pool typically involves one of two scenarios:
+
+- **Standard Storage**: In this scenario, the pool stores multiple copies
+ of an object in the Ceph Storage Cluster.
+
+- **Erasure Coding:** In this scenario, the pool uses erasure coding to
+ store data much more efficiently with a small performance tradeoff.
+
+In the standard storage scenario, you can set up a CRUSH rule to establish
+the failure domain (for example: osd, host, chassis, rack, or row). Ceph OSD
+Daemons perform optimally when all storage drives in the rule are of the
+same size, speed (both RPMs and throughput) and type. See `CRUSH Maps`_
+for details on creating a rule. Once you have created a rule, create
+a backing storage pool.
+
+In the erasure coding scenario, the pool creation arguments will generate the
+appropriate rule automatically. See `Create a Pool`_ for details.
+
+In subsequent examples, we will refer to the backing storage pool
+as ``cold-storage``.
+
+
+Setting Up a Cache Pool
+-----------------------
+
+Setting up a cache pool follows the same procedure as the standard storage
+scenario, but with this difference: the drives for the cache tier are typically
+high performance drives that reside in their own servers and have their own
+CRUSH rule. When setting up such a rule, it should take account of the hosts
+that have the high performance drives while omitting the hosts that don't. See
+:ref:`CRUSH Device Class<crush-map-device-class>` for details.
+
+
+In subsequent examples, we will refer to the cache pool as ``hot-storage`` and
+the backing pool as ``cold-storage``.
+
+For cache tier configuration and default values, see
+`Pools - Set Pool Values`_.
+
+
+Creating a Cache Tier
+=====================
+
+Setting up a cache tier involves associating a backing storage pool with
+a cache pool:
+
+.. prompt:: bash $
+
+ ceph osd tier add {storagepool} {cachepool}
+
+For example:
+
+.. prompt:: bash $
+
+ ceph osd tier add cold-storage hot-storage
+
+To set the cache mode, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd tier cache-mode {cachepool} {cache-mode}
+
+For example:
+
+.. prompt:: bash $
+
+ ceph osd tier cache-mode hot-storage writeback
+
+The cache tiers overlay the backing storage tier, so they require one
+additional step: you must direct all client traffic from the storage pool to
+the cache pool. To direct client traffic directly to the cache pool, execute
+the following:
+
+.. prompt:: bash $
+
+ ceph osd tier set-overlay {storagepool} {cachepool}
+
+For example:
+
+.. prompt:: bash $
+
+ ceph osd tier set-overlay cold-storage hot-storage
+
+
+Configuring a Cache Tier
+========================
+
+Cache tiers have several configuration options. You may set
+cache tier configuration options with the following usage:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} {key} {value}
+
+See `Pools - Set Pool Values`_ for details.
+
+
+Target Size and Type
+--------------------
+
+Ceph's production cache tiers use a `Bloom Filter`_ for the ``hit_set_type``:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} hit_set_type bloom
+
+For example:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage hit_set_type bloom
+
+The ``hit_set_count`` and ``hit_set_period`` define how many such HitSets to
+store, and how much time each HitSet should cover:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} hit_set_count 12
+ ceph osd pool set {cachepool} hit_set_period 14400
+ ceph osd pool set {cachepool} target_max_bytes 1000000000000
+
+.. note:: A larger ``hit_set_count`` results in more RAM consumed by
+ the ``ceph-osd`` process.
+
+Binning accesses over time allows Ceph to determine whether a Ceph client
+accessed an object at least once, or more than once over a time period
+("age" vs "temperature").
+
+The ``min_read_recency_for_promote`` setting defines how many HitSets are
+checked for the existence of an object when a read operation is handled. The
+result of this check is used to decide whether to promote the object
+asynchronously. Its value should be between 0 and ``hit_set_count``. If it is
+set to 0, the object is always promoted. If it is set to 1, only the current
+HitSet is checked, and the object is promoted only if it is found there. For
+higher values, that exact number of HitSets (the current HitSet plus archived
+HitSets) is checked, and the object is promoted if it is found in any of the
+most recent ``min_read_recency_for_promote`` HitSets.
+
+A similar parameter can be set for the write operation, which is
+``min_write_recency_for_promote``:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} min_read_recency_for_promote 2
+ ceph osd pool set {cachepool} min_write_recency_for_promote 2
+
+.. note:: The longer the period and the higher the
+ ``min_read_recency_for_promote`` and
+   ``min_write_recency_for_promote`` values, the more RAM the ``ceph-osd``
+   daemon consumes. In particular, when the agent is actively flushing
+   or evicting cache objects, all ``hit_set_count`` HitSets are loaded
+ into RAM.
+
+
+Cache Sizing
+------------
+
+The cache tiering agent performs two main functions:
+
+- **Flushing:** The agent identifies modified (or dirty) objects and forwards
+ them to the storage pool for long-term storage.
+
+- **Evicting:** The agent identifies unmodified (or clean) objects and
+  evicts the least recently used among them from the cache.
+
+
+Absolute Sizing
+~~~~~~~~~~~~~~~
+
+The cache tiering agent can flush or evict objects based upon the total number
+of bytes or the total number of objects. To specify a maximum number of bytes,
+execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} target_max_bytes {#bytes}
+
+For example, to flush or evict at 1 TB, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage target_max_bytes 1099511627776
+
+To specify the maximum number of objects, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} target_max_objects {#objects}
+
+For example, to flush or evict at 1M objects, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage target_max_objects 1000000
+
+.. note:: Ceph is not able to determine the size of a cache pool automatically,
+   so absolute sizing must be configured here; otherwise, flushing and evicting
+   will not work. If you specify both limits, the cache tiering agent will
+   begin flushing or evicting when either threshold is triggered.
+
+.. note:: All client requests will be blocked only when the ``target_max_bytes``
+   or ``target_max_objects`` limit has been reached.
+
+Relative Sizing
+~~~~~~~~~~~~~~~
+
+The cache tiering agent can flush or evict objects relative to the size of the
+cache pool (specified by ``target_max_bytes`` / ``target_max_objects`` in
+`Absolute Sizing`_). When the cache pool contains a certain percentage of
+modified (or dirty) objects, the cache tiering agent will flush them to the
+storage pool. To set the ``cache_target_dirty_ratio``, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} cache_target_dirty_ratio {0.0..1.0}
+
+For example, setting the value to ``0.4`` will begin flushing modified
+(dirty) objects when they reach 40% of the cache pool's capacity:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage cache_target_dirty_ratio 0.4
+
+When the dirty objects reach a certain percentage of the cache pool's
+capacity, the agent flushes dirty objects at a higher speed. To set the
+``cache_target_dirty_high_ratio``:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} cache_target_dirty_high_ratio {0.0..1.0}
+
+For example, setting the value to ``0.6`` will begin aggressively flushing
+dirty objects when they reach 60% of the cache pool's capacity. This value
+should be set between the ``cache_target_dirty_ratio`` and the
+``cache_target_full_ratio`` values:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage cache_target_dirty_high_ratio 0.6
+
+When the cache pool reaches a certain percentage of its capacity, the cache
+tiering agent will evict objects to maintain free capacity. To set the
+``cache_target_full_ratio``, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} cache_target_full_ratio {0.0..1.0}
+
+For example, setting the value to ``0.8`` will begin evicting unmodified
+(clean) objects when they reach 80% of the cache pool's capacity:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage cache_target_full_ratio 0.8
+
+
+Cache Age
+---------
+
+You can specify the minimum age of an object before the cache tiering agent
+flushes a recently modified (or dirty) object to the backing storage pool:
+
+.. prompt:: bash $
+
+ ceph osd pool set {cachepool} cache_min_flush_age {#seconds}
+
+For example, to flush modified (or dirty) objects after 10 minutes, execute the
+following:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage cache_min_flush_age 600
+
+You can specify the minimum age of an object before it will be evicted from the
+cache tier:
+
+.. prompt:: bash $
+
+   ceph osd pool set {cachepool} cache_min_evict_age {#seconds}
+
+For example, to evict objects after 30 minutes, execute the following:
+
+.. prompt:: bash $
+
+ ceph osd pool set hot-storage cache_min_evict_age 1800
+
+
+Removing a Cache Tier
+=====================
+
+Removing a cache tier differs depending on whether it is a writeback
+cache or a read-only cache.
+
+
+Removing a Read-Only Cache
+--------------------------
+
+Since a read-only cache does not have modified data, you can disable
+and remove it without losing any recent changes to objects in the cache.
+
+#. Change the cache mode to ``none`` to disable the cache:
+
+   .. prompt:: bash $
+
+ ceph osd tier cache-mode {cachepool} none
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph osd tier cache-mode hot-storage none
+
+#. Remove the cache pool from the backing pool:
+
+ .. prompt:: bash $
+
+ ceph osd tier remove {storagepool} {cachepool}
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph osd tier remove cold-storage hot-storage
+
+
+Removing a Writeback Cache
+--------------------------
+
+Since a writeback cache may have modified data, you must take steps to ensure
+that you do not lose any recent changes to objects in the cache before you
+disable and remove it.
+
+
+#. Change the cache mode to ``proxy`` so that new and modified objects will
+   be flushed to the backing storage pool:
+
+ .. prompt:: bash $
+
+ ceph osd tier cache-mode {cachepool} proxy
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph osd tier cache-mode hot-storage proxy
+
+
+#. Ensure that the cache pool has been flushed. This may take a few minutes:
+
+ .. prompt:: bash $
+
+ rados -p {cachepool} ls
+
+ If the cache pool still has objects, you can flush them manually.
+ For example:
+
+ .. prompt:: bash $
+
+ rados -p {cachepool} cache-flush-evict-all
+
+
+#. Remove the overlay so that clients will not direct traffic to the cache:
+
+ .. prompt:: bash $
+
+      ceph osd tier remove-overlay {storagepool}
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph osd tier remove-overlay cold-storage
+
+
+#. Finally, remove the cache tier pool from the backing storage pool:
+
+ .. prompt:: bash $
+
+ ceph osd tier remove {storagepool} {cachepool}
+
+ For example:
+
+ .. prompt:: bash $
+
+ ceph osd tier remove cold-storage hot-storage
+
+
+.. _Create a Pool: ../pools#create-a-pool
+.. _Pools - Set Pool Values: ../pools#set-pool-values
+.. _Bloom Filter: https://en.wikipedia.org/wiki/Bloom_filter
+.. _CRUSH Maps: ../crush-map
+.. _Absolute Sizing: #absolute-sizing
diff --git a/doc/rados/operations/change-mon-elections.rst b/doc/rados/operations/change-mon-elections.rst
new file mode 100644
index 000000000..7418ea363
--- /dev/null
+++ b/doc/rados/operations/change-mon-elections.rst
@@ -0,0 +1,100 @@
+.. _changing_monitor_elections:
+
+=======================================
+Configuring Monitor Election Strategies
+=======================================
+
+By default, the monitors are in ``classic`` mode. We recommend staying in this
+mode unless you have a very specific reason.
+
+If you want to switch modes BEFORE constructing the cluster, change the ``mon
+election default strategy`` option. This option takes an integer value:
+
+* ``1`` for ``classic``
+* ``2`` for ``disallow``
+* ``3`` for ``connectivity``
+
+After your cluster has started running, you can change strategies by running a
+command of the following form:
+
+.. prompt:: bash $
+
+   ceph mon set election_strategy {classic|disallow|connectivity}
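+
+For example, to switch a running cluster to the ``connectivity`` strategy:
+
+.. prompt:: bash $
+
+   ceph mon set election_strategy connectivity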
+
+Choosing a mode
+===============
+
+The modes other than ``classic`` provide specific features. We recommend staying
+in ``classic`` mode if you don't need these extra features because it is the
+simplest mode.
+
+.. _rados_operations_disallow_mode:
+
+Disallow Mode
+=============
+
+The ``disallow`` mode allows you to mark monitors as disallowed. Disallowed
+monitors participate in the quorum and serve clients, but cannot be elected
+leader. You might want to use this mode for monitors that are far away from
+clients.
+
+To disallow a monitor from being elected leader, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph mon add disallowed_leader {name}
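+
+For example, to disallow a hypothetical monitor named ``e`` from being
+elected leader:
+
+.. prompt:: bash $
+
+   ceph mon add disallowed_leader e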
+
+To remove a monitor from the disallowed list and allow it to be elected leader,
+run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph mon rm disallowed_leader {name}
+
+To see the list of disallowed leaders, examine the output of the following
+command:
+
+.. prompt:: bash $
+
+ ceph mon dump
+
+Connectivity Mode
+=================
+
+The ``connectivity`` mode evaluates connection scores that are provided by each
+monitor for its peers and elects the monitor with the highest score. This mode
+is designed to handle network partitioning (also called *net-splits*): network
+partitioning might occur if your cluster is stretched across multiple data
+centers or otherwise has a non-uniform or unbalanced network topology.
+
+The ``connectivity`` mode also supports disallowing monitors from being elected
+leader by using the same commands that were presented in :ref:`Disallow Mode <rados_operations_disallow_mode>`.
+
+Examining connectivity scores
+=============================
+
+The monitors maintain connection scores even if they aren't in ``connectivity``
+mode. To examine a specific monitor's connection scores, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph daemon mon.{name} connection scores dump
+
+Scores for an individual connection range from ``0`` to ``1`` inclusive and
+include whether the connection is considered alive or dead (as determined by
+whether it returned its latest ping before timeout).
+
+Connectivity scores are expected to remain valid. However, if during
+troubleshooting you determine that these scores have for some reason become
+invalid, drop the history and reset the scores by running a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph daemon mon.{name} connection scores reset
+
+Resetting connectivity scores carries little risk: monitors will still quickly
+determine whether a connection is alive or dead and trend back to the previous
+scores if those scores were accurate. Nevertheless, resetting scores ought to
+be unnecessary and it is not recommended unless advised by your support team
+or by a developer.
diff --git a/doc/rados/operations/control.rst b/doc/rados/operations/control.rst
new file mode 100644
index 000000000..033f831cd
--- /dev/null
+++ b/doc/rados/operations/control.rst
@@ -0,0 +1,665 @@
+.. index:: control, commands
+
+==================
+ Control Commands
+==================
+
+
+Monitor Commands
+================
+
+To issue monitor commands, use the ``ceph`` utility:
+
+.. prompt:: bash $
+
+ ceph [-m monhost] {command}
+
+In most cases, monitor commands have the following form:
+
+.. prompt:: bash $
+
+ ceph {subsystem} {command}
+
+
+System Commands
+===============
+
+To display the current cluster status, run the following commands:
+
+.. prompt:: bash $
+
+ ceph -s
+ ceph status
+
+To display a running summary of cluster status and major events, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph -w
+
+To display the monitor quorum, including which monitors are participating and
+which one is the leader, run the following commands:
+
+.. prompt:: bash $
+
+ ceph mon stat
+ ceph quorum_status
+
+To query the status of a single monitor, including whether it is in the quorum,
+run the following command:
+
+.. prompt:: bash $
+
+ ceph tell mon.[id] mon_status
+
+Here the value of ``[id]`` can be found by consulting the output of ``ceph
+-s``.
+
+
+Authentication Subsystem
+========================
+
+To add an OSD keyring for a specific OSD, run the following command:
+
+.. prompt:: bash $
+
+ ceph auth add {osd} {--in-file|-i} {path-to-osd-keyring}
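+
+For example, to add the keyring for ``osd.0`` from its conventional default
+location (the path shown here is an assumption and may differ on your
+system):
+
+.. prompt:: bash $
+
+   ceph auth add osd.0 -i /var/lib/ceph/osd/ceph-0/keyring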
+
+To list the cluster's keys and their capabilities, run the following command:
+
+.. prompt:: bash $
+
+ ceph auth ls
+
+
+Placement Group Subsystem
+=========================
+
+To display the statistics for all placement groups (PGs), run the following
+command:
+
+.. prompt:: bash $
+
+ ceph pg dump [--format {format}]
+
+Here the valid formats are ``plain`` (default), ``json``, ``json-pretty``,
+``xml``, and ``xml-pretty``. When implementing monitoring tools and other
+tools, it is best to use the ``json`` format. JSON parsing is more
+deterministic than the ``plain`` format (which is more human readable), and the
+layout is much more consistent from release to release. The ``jq`` utility is
+very useful for extracting data from JSON output.
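+
+As a minimal sketch, the following prints the ID and state of every PG. This
+assumes a release in which the JSON output is wrapped in a ``pg_map`` object;
+the exact field layout may vary between releases:
+
+.. prompt:: bash $
+
+   ceph pg dump --format json | jq -r '.pg_map.pg_stats[] | "\(.pgid) \(.state)"'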
+
+To display the statistics for all PGs stuck in a specified state, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph pg dump_stuck inactive|unclean|stale|undersized|degraded [--format {format}] [-t|--threshold {seconds}]
+
+Here ``--format`` may be ``plain`` (default), ``json``, ``json-pretty``,
+``xml``, or ``xml-pretty``.
+
+The ``--threshold`` argument determines the time interval (in seconds) for a PG
+to be considered ``stuck`` (default: 300).
+
+PGs might be stuck in any of the following states:
+
+**Inactive**
+
+ PGs are unable to process reads or writes because they are waiting for an
+ OSD that has the most up-to-date data to return to an ``up`` state.
+
+
+**Unclean**
+
+ PGs contain objects that have not been replicated the desired number of
+ times. These PGs have not yet completed the process of recovering.
+
+
+**Stale**
+
+ PGs are in an unknown state, because the OSDs that host them have not
+ reported to the monitor cluster for a certain period of time (specified by
+ the ``mon_osd_report_timeout`` configuration setting).
+
+
+To return ``lost`` objects to their prior state, either by reverting each
+object to its previous version or by deleting it (because it was just
+created and has no previous version), run the following command:
+
+.. prompt:: bash $
+
+ ceph pg {pgid} mark_unfound_lost revert|delete
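+
+For example, to revert the unfound objects in a hypothetical PG ``2.5``:
+
+.. prompt:: bash $
+
+   ceph pg 2.5 mark_unfound_lost revert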
+
+
+.. _osd-subsystem:
+
+OSD Subsystem
+=============
+
+To query OSD subsystem status, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd stat
+
+To write a copy of the most recent OSD map to a file (see :ref:`osdmaptool
+<osdmaptool>`), run the following command:
+
+.. prompt:: bash $
+
+ ceph osd getmap -o file
+
+To write a copy of the CRUSH map from the most recent OSD map to a file, run
+the following command:
+
+.. prompt:: bash $
+
+ ceph osd getcrushmap -o file
+
+Note that this command is functionally equivalent to the following two
+commands:
+
+.. prompt:: bash $
+
+ ceph osd getmap -o /tmp/osdmap
+ osdmaptool /tmp/osdmap --export-crush file
+
+To dump the OSD map, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd dump [--format {format}]
+
+The ``--format`` option accepts the following arguments: ``plain`` (default),
+``json``, ``json-pretty``, ``xml``, and ``xml-pretty``. As noted above, JSON is
+the recommended format for tools, scripting, and other forms of automation.
+
+To dump the OSD map as a tree that lists one OSD per line and displays
+information about the weights and states of the OSDs, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph osd tree [--format {format}]
+
+To find out where a specific RADOS object is stored in the system, run a
+command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd map <pool-name> <object-name>
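+
+For example, to look up the placement of a hypothetical object ``myobject``
+in a pool named ``mypool``:
+
+.. prompt:: bash $
+
+   ceph osd map mypool myobject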
+
+To add a new OSD to a specific CRUSH location, or to move an existing OSD to
+a new location, specifying the OSD's ID or name together with its CRUSH
+weight, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush set {id} {weight} [{loc1} [{loc2} ...]]
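+
+For example, the following command (with a hypothetical host name) places
+``osd.0`` under the host ``node1`` in the ``default`` root, with a CRUSH
+weight of ``1.0``:
+
+.. prompt:: bash $
+
+   ceph osd crush set osd.0 1.0 root=default host=node1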
+
+To remove an existing OSD from the CRUSH map, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush remove {name}
+
+To remove an existing bucket from the CRUSH map, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush remove {bucket-name}
+
+To move an existing bucket from one position in the CRUSH hierarchy to another,
+run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush move {id} {loc1} [{loc2} ...]
+
+To set the CRUSH weight of a specific OSD (specified by ``{name}``) to
+``{weight}``, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush reweight {name} {weight}
+
+To mark an OSD as ``lost``, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd lost {id} [--yes-i-really-mean-it]
+
+.. warning::
+ This could result in permanent data loss. Use with caution!
+
+To create a new OSD, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd create [{uuid}]
+
+If no UUID is given as part of this command, the UUID will be set automatically
+when the OSD starts up.
+
+To remove one or more specific OSDs, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd rm [{id}...]
+
+To display the current ``max_osd`` parameter in the OSD map, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph osd getmaxosd
+
+To import a specific CRUSH map, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd setcrushmap -i file
+
+To set the ``max_osd`` parameter in the OSD map, run the following command:
+
+.. prompt:: bash $
+
+   ceph osd setmaxosd {num}
+
+The parameter has a default value of 10000. Most operators will never need to
+adjust it.
+
+To mark a specific OSD ``down``, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd down {osd-num}
+
+To mark a specific OSD ``out`` (so that no data will be allocated to it), run
+the following command:
+
+.. prompt:: bash $
+
+ ceph osd out {osd-num}
+
+To mark a specific OSD ``in`` (so that data will be allocated to it), run the
+following command:
+
+.. prompt:: bash $
+
+ ceph osd in {osd-num}
+
+By using the "pause flags" in the OSD map, you can pause or unpause I/O
+requests. If the flags are set, then no I/O requests will be sent to any OSD.
+When the flags are cleared, then pending I/O requests will be resent. To set or
+clear pause flags, run one of the following commands:
+
+.. prompt:: bash $
+
+ ceph osd pause
+ ceph osd unpause
+
+You can assign an override or ``reweight`` weight value to a specific OSD if
+the normal CRUSH distribution seems to be suboptimal. The weight of an OSD
+helps determine the extent of its I/O requests and data storage: two OSDs with
+the same weight will receive approximately the same number of I/O requests and
+store approximately the same amount of data. The ``ceph osd reweight`` command
+assigns an override weight to an OSD. The weight value is in the range 0 to 1,
+and the command forces CRUSH to relocate a certain amount (1 - ``weight``) of
+the data that would otherwise be on this OSD. The command does not change the
+weights of the buckets above the OSD in the CRUSH map. Using the command is
+merely a corrective measure: for example, if one of your OSDs is at 90% and the
+others are at 50%, you could reduce the outlier weight to correct this
+imbalance. To assign an override weight to a specific OSD, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph osd reweight {osd-num} {weight}
+
+.. note:: Any assigned override reweight value will conflict with the balancer.
+ This means that if the balancer is in use, all override reweight values
+ should be ``1.0000`` in order to avoid suboptimal cluster behavior.
+
+A cluster's OSDs can be reweighted in order to maintain balance if some OSDs
+are being disproportionately utilized. Note that override or ``reweight``
+weights have values relative to one another that default to 1.00000; their
+values are not absolute, and these weights must be distinguished from CRUSH
+weights (which reflect the absolute capacity of a bucket, as measured in TiB).
+To reweight OSDs by utilization, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd reweight-by-utilization [threshold [max_change [max_osds]]] [--no-increasing]
+
+By default, this command adjusts the override weight of OSDs whose utilization
+deviates ±20% from the average utilization, but you can specify a different
+percentage in the ``threshold`` argument.
+
+To limit the increment by which any OSD's reweight is to be changed, use the
+``max_change`` argument (default: 0.05). To limit the number of OSDs that are
+to be adjusted, use the ``max_osds`` argument (default: 4). Increasing these
+variables can accelerate the reweighting process, but perhaps at the cost of
+slower client operations (as a result of the increase in data movement).
+
+You can test the ``osd reweight-by-utilization`` command before running it. To
+find out which and how many PGs and OSDs will be affected by a specific use of
+the ``osd reweight-by-utilization`` command, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd test-reweight-by-utilization [threshold [max_change max_osds]] [--no-increasing]
+
+The ``--no-increasing`` option can be added to the ``reweight-by-utilization``
+and ``test-reweight-by-utilization`` commands in order to prevent any override
+weights that are currently less than 1.00000 from being increased. This option
+can be useful in certain circumstances: for example, when you are hastily
+balancing in order to remedy ``full`` or ``nearfull`` OSDs, or when there are
+OSDs being evacuated or slowly brought into service.
+
+Operators of deployments that utilize Nautilus or newer (or later revisions of
+Luminous and Mimic) and that have no pre-Luminous clients will likely instead
+want to enable the ``balancer`` module for ``ceph-mgr``.
+
+The blocklist can be modified by adding or removing an IP address or a CIDR
+range. If an address is blocklisted, it will be unable to connect to any OSD.
+If an OSD is contained within an IP address or CIDR range that has been
+blocklisted, the OSD will be unable to perform operations on its peers when it
+acts as a client: such blocked operations include tiering and copy-from
+functionality. To add an IP address or CIDR range to the blocklist, or to
+remove one from the blocklist, run one of the following commands:
+
+.. prompt:: bash $
+
+ ceph osd blocklist ["range"] add ADDRESS[:source_port][/netmask_bits] [TIME]
+ ceph osd blocklist ["range"] rm ADDRESS[:source_port][/netmask_bits]
+
+If you add something to the blocklist with the above ``add`` command, you can
+use the ``TIME`` keyword to specify the length of time (in seconds) that it
+will remain on the blocklist (default: one hour). To add or remove a CIDR
+range, use the ``range`` keyword in the above commands.
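+
+For example, to blocklist a single (hypothetical) client address for two
+hours, and to blocklist an entire CIDR range:
+
+.. prompt:: bash $
+
+   ceph osd blocklist add 192.168.1.123 7200
+   ceph osd blocklist range add 192.168.1.0/24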
+
+Note that these commands are useful primarily in failure testing. Under normal
+conditions, blocklists are maintained automatically and do not need any manual
+intervention.
+
+To create or delete a snapshot of a specific storage pool, run one of the
+following commands:
+
+.. prompt:: bash $
+
+ ceph osd pool mksnap {pool-name} {snap-name}
+ ceph osd pool rmsnap {pool-name} {snap-name}
+
+To create, delete, or rename a specific storage pool, run one of the following
+commands:
+
+.. prompt:: bash $
+
+ ceph osd pool create {pool-name} [pg_num [pgp_num]]
+ ceph osd pool delete {pool-name} [{pool-name} --yes-i-really-really-mean-it]
+ ceph osd pool rename {old-name} {new-name}
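+
+For example, to create a hypothetical pool named ``mypool`` with 128 PGs and
+then rename it:
+
+.. prompt:: bash $
+
+   ceph osd pool create mypool 128
+   ceph osd pool rename mypool mypool-new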
+
+To change a pool setting, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set {pool-name} {field} {value}
+
+The following are valid fields:
+
+ * ``size``: The number of copies of data in the pool.
+ * ``pg_num``: The PG number.
+ * ``pgp_num``: The effective number of PGs when calculating placement.
+ * ``crush_rule``: The rule number for mapping placement.
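+
+For example, to set the replica count of a hypothetical pool named ``mypool``
+to three copies:
+
+.. prompt:: bash $
+
+   ceph osd pool set mypool size 3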
+
+To retrieve the value of a pool setting, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool get {pool-name} {field}
+
+Valid fields are:
+
+ * ``pg_num``: The PG number.
+ * ``pgp_num``: The effective number of PGs when calculating placement.
+
+To send a scrub command to a specific OSD, or to all OSDs (by using ``*``), run
+the following command:
+
+.. prompt:: bash $
+
+ ceph osd scrub {osd-num}
+
+To send a repair command to a specific OSD, or to all OSDs (by using ``*``),
+run the following command:
+
+.. prompt:: bash $
+
+   ceph osd repair {osd-num}
+
+You can run a simple throughput benchmark test against a specific OSD. This
+test writes a total size of ``TOTAL_DATA_BYTES`` (default: 1 GB) incrementally,
+in multiple write requests that each have a size of ``BYTES_PER_WRITE``
+(default: 4 MB). The test is not destructive and it will not overwrite existing
+live OSD data, but it might temporarily affect the performance of clients that
+are concurrently accessing the OSD. To launch this benchmark test, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph tell osd.N bench [TOTAL_DATA_BYTES] [BYTES_PER_WRITE]
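+
+For example, to benchmark ``osd.0`` by writing 100 MB in 4 MB write requests
+(values chosen here purely for illustration):
+
+.. prompt:: bash $
+
+   ceph tell osd.0 bench 104857600 4194304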
+
+To clear the caches of a specific OSD during the interval between one benchmark
+run and another, run the following command:
+
+.. prompt:: bash $
+
+ ceph tell osd.N cache drop
+
+To retrieve the cache statistics of a specific OSD, run the following command:
+
+.. prompt:: bash $
+
+ ceph tell osd.N cache status
+
+MDS Subsystem
+=============
+
+To change the configuration parameters of a running metadata server, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph tell mds.{mds-id} config set {setting} {value}
+
+For example, to enable debug messages, run the following command:
+
+.. prompt:: bash $
+
+   ceph tell mds.0 config set debug_ms 1
+
+To display the status of all metadata servers, run the following command:
+
+.. prompt:: bash $
+
+   ceph mds stat
+
+To mark the active metadata server as failed (and to trigger failover to a
+standby if a standby is present), run the following command:
+
+.. prompt:: bash $
+
+   ceph mds fail 0
+
+.. todo:: ``ceph mds`` subcommands missing docs: set, dump, getmap, stop, setmap
+
+
+Mon Subsystem
+=============
+
+To display monitor statistics, run the following command:
+
+.. prompt:: bash $
+
+ ceph mon stat
+
+This command returns output similar to the following:
+
+::
+
+ e2: 3 mons at {a=127.0.0.1:40000/0,b=127.0.0.1:40001/0,c=127.0.0.1:40002/0}, election epoch 6, quorum 0,1,2 a,b,c
+
+There is a ``quorum`` list at the end of the output. It lists those monitor
+nodes that are part of the current quorum.
+
+To retrieve this information in a more direct way, run the following command:
+
+.. prompt:: bash $
+
+ ceph quorum_status -f json-pretty
+
+This command returns output similar to the following:
+
+.. code-block:: javascript
+
+ {
+ "election_epoch": 6,
+ "quorum": [
+ 0,
+ 1,
+ 2
+ ],
+ "quorum_names": [
+ "a",
+ "b",
+ "c"
+ ],
+ "quorum_leader_name": "a",
+ "monmap": {
+ "epoch": 2,
+ "fsid": "ba807e74-b64f-4b72-b43f-597dfe60ddbc",
+ "modified": "2016-12-26 14:42:09.288066",
+ "created": "2016-12-26 14:42:03.573585",
+ "features": {
+ "persistent": [
+ "kraken"
+ ],
+ "optional": []
+ },
+ "mons": [
+ {
+ "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:40000\/0",
+ "public_addr": "127.0.0.1:40000\/0"
+ },
+ {
+ "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:40001\/0",
+ "public_addr": "127.0.0.1:40001\/0"
+ },
+ {
+ "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:40002\/0",
+ "public_addr": "127.0.0.1:40002\/0"
+ }
+ ]
+ }
+ }
+
+
+This command will block until a quorum is reached.
+
+To see the status of a specific monitor, run the following command:
+
+.. prompt:: bash $
+
+ ceph tell mon.[name] mon_status
+
+Here the value of ``[name]`` can be found by consulting the output of the
+``ceph quorum_status`` command. This command returns output similar to the
+following:
+
+::
+
+ {
+ "name": "b",
+ "rank": 1,
+ "state": "peon",
+ "election_epoch": 6,
+ "quorum": [
+ 0,
+ 1,
+ 2
+ ],
+ "features": {
+ "required_con": "9025616074522624",
+ "required_mon": [
+ "kraken"
+ ],
+ "quorum_con": "1152921504336314367",
+ "quorum_mon": [
+ "kraken"
+ ]
+ },
+ "outside_quorum": [],
+ "extra_probe_peers": [],
+ "sync_provider": [],
+ "monmap": {
+ "epoch": 2,
+ "fsid": "ba807e74-b64f-4b72-b43f-597dfe60ddbc",
+ "modified": "2016-12-26 14:42:09.288066",
+ "created": "2016-12-26 14:42:03.573585",
+ "features": {
+ "persistent": [
+ "kraken"
+ ],
+ "optional": []
+ },
+ "mons": [
+ {
+ "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:40000\/0",
+ "public_addr": "127.0.0.1:40000\/0"
+ },
+ {
+ "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:40001\/0",
+ "public_addr": "127.0.0.1:40001\/0"
+ },
+ {
+ "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:40002\/0",
+ "public_addr": "127.0.0.1:40002\/0"
+ }
+ ]
+ }
+ }
+
+To see a dump of the monitor state, run the following command:
+
+.. prompt:: bash $
+
+ ceph mon dump
+
+This command returns output similar to the following:
+
+::
+
+ dumped monmap epoch 2
+ epoch 2
+ fsid ba807e74-b64f-4b72-b43f-597dfe60ddbc
+ last_changed 2016-12-26 14:42:09.288066
+ created 2016-12-26 14:42:03.573585
+ 0: 127.0.0.1:40000/0 mon.a
+ 1: 127.0.0.1:40001/0 mon.b
+ 2: 127.0.0.1:40002/0 mon.c
diff --git a/doc/rados/operations/crush-map-edits.rst b/doc/rados/operations/crush-map-edits.rst
new file mode 100644
index 000000000..46a4a4f74
--- /dev/null
+++ b/doc/rados/operations/crush-map-edits.rst
@@ -0,0 +1,746 @@
+Manually editing the CRUSH Map
+==============================
+
+.. note:: Manually editing the CRUSH map is an advanced administrator
+ operation. For the majority of installations, CRUSH changes can be
+ implemented via the Ceph CLI and do not require manual CRUSH map edits. If
+ you have identified a use case where manual edits *are* necessary with a
+ recent Ceph release, consider contacting the Ceph developers at dev@ceph.io
+ so that future versions of Ceph do not have this problem.
+
+To edit an existing CRUSH map, carry out the following procedure:
+
+#. `Get the CRUSH map`_.
+#. `Decompile`_ the CRUSH map.
+#. Edit at least one of the following sections: `Devices`_, `Buckets`_, and
+ `Rules`_. Use a text editor for this task.
+#. `Recompile`_ the CRUSH map.
+#. `Set the CRUSH map`_.
+
+For details on setting the CRUSH map rule for a specific pool, see `Set Pool
+Values`_.
+
+.. _Get the CRUSH map: #getcrushmap
+.. _Decompile: #decompilecrushmap
+.. _Devices: #crushmapdevices
+.. _Buckets: #crushmapbuckets
+.. _Rules: #crushmaprules
+.. _Recompile: #compilecrushmap
+.. _Set the CRUSH map: #setcrushmap
+.. _Set Pool Values: ../pools#setpoolvalues
+
+.. _getcrushmap:
+
+Get the CRUSH Map
+-----------------
+
+To get the CRUSH map for your cluster, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd getcrushmap -o {compiled-crushmap-filename}
+
+Ceph outputs (``-o``) a compiled CRUSH map to the filename that you have
+specified. Because the CRUSH map is in a compiled form, you must first
+decompile it before you can edit it.
+
+.. _decompilecrushmap:
+
+Decompile the CRUSH Map
+-----------------------
+
+To decompile the CRUSH map, run a command of the following form:
+
+.. prompt:: bash $
+
+ crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
+
+.. _compilecrushmap:
+
+Recompile the CRUSH Map
+-----------------------
+
+To compile the CRUSH map, run a command of the following form:
+
+.. prompt:: bash $
+
+ crushtool -c {decompiled-crushmap-filename} -o {compiled-crushmap-filename}
+
+.. _setcrushmap:
+
+Set the CRUSH Map
+-----------------
+
+To set the CRUSH map for your cluster, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd setcrushmap -i {compiled-crushmap-filename}
+
+Ceph loads (``-i``) a compiled CRUSH map from the filename that you have
+specified.
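+
+Putting these steps together, a typical editing session looks something like
+the following (the filenames here are arbitrary):
+
+.. prompt:: bash $
+
+   ceph osd getcrushmap -o crushmap.bin
+   crushtool -d crushmap.bin -o crushmap.txt
+   vi crushmap.txt
+   crushtool -c crushmap.txt -o crushmap-new.bin
+   ceph osd setcrushmap -i crushmap-new.bin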
+
+Sections
+--------
+
+A CRUSH map has six main sections:
+
+#. **tunables:** The preamble at the top of the map describes any *tunables*
+ that are not a part of legacy CRUSH behavior. These tunables correct for old
+ bugs, optimizations, or other changes that have been made over the years to
+ improve CRUSH's behavior.
+
+#. **devices:** Devices are individual OSDs that store data.
+
+#. **types**: Bucket ``types`` define the types of buckets that are used in
+ your CRUSH hierarchy.
+
+#. **buckets:** Buckets consist of a hierarchical aggregation of storage
+ locations (for example, rows, racks, chassis, hosts) and their assigned
+ weights. After the bucket ``types`` have been defined, the CRUSH map defines
+ each node in the hierarchy, its type, and which devices or other nodes it
+ contains.
+
+#. **rules:** Rules define policy about how data is distributed across
+ devices in the hierarchy.
+
+#. **choose_args:** ``choose_args`` are alternative weights associated with
+ the hierarchy that have been adjusted in order to optimize data placement. A
+ single ``choose_args`` map can be used for the entire cluster, or a number
+ of ``choose_args`` maps can be created such that each map is crafted for a
+ particular pool.
+
+
+.. _crushmapdevices:
+
+CRUSH-Map Devices
+-----------------
+
+Devices are individual OSDs that store data. In this section, there is usually
+one device defined for each OSD daemon in your cluster. Devices are identified
+by an ``id`` (a non-negative integer) and a ``name`` (usually ``osd.N``, where
+``N`` is the device's ``id``).
+
+
+.. _crush-map-device-class:
+
+A device can also have a *device class* associated with it: for example,
+``hdd`` or ``ssd``. Device classes make it possible for devices to be targeted
+by CRUSH rules. This means that device classes allow CRUSH rules to select only
+OSDs that match certain characteristics. For example, you might want an RBD
+pool associated only with SSDs and a different RBD pool associated only with
+HDDs.
+
+To see a list of devices, run the following command:
+
+.. prompt:: bash #
+
+ ceph device ls
+
+The output of this command takes the following form:
+
+::
+
+ device {num} {osd.name} [class {class}]
+
+For example:
+
+.. prompt:: bash #
+
+ ceph device ls
+
+::
+
+ device 0 osd.0 class ssd
+ device 1 osd.1 class hdd
+ device 2 osd.2
+ device 3 osd.3
+
+In most cases, each device maps to a corresponding ``ceph-osd`` daemon. This
+daemon might map to a single storage device, a pair of devices (for example,
+one for data and one for a journal or metadata), or in some cases a small RAID
+device or a partition of a larger storage device.
+
+
+CRUSH-Map Bucket Types
+----------------------
+
+The second list in the CRUSH map defines 'bucket' types. Buckets facilitate a
+hierarchy of nodes and leaves. Node buckets (also known as non-leaf buckets)
+typically represent physical locations in a hierarchy. Nodes aggregate other
+nodes or leaves. Leaf buckets represent ``ceph-osd`` daemons and their
+corresponding storage media.
+
+.. tip:: In the context of CRUSH, the term "bucket" is used to refer to
+ a node in the hierarchy (that is, to a location or a piece of physical
+ hardware). In the context of RADOS Gateway APIs, however, the term
+ "bucket" has a different meaning.
+
+To add a bucket type to the CRUSH map, create a new line under the list of
+bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name.
+By convention, there is exactly one leaf bucket type and it is ``type 0``;
+however, you may give the leaf bucket any name you like (for example: ``osd``,
+``disk``, ``drive``, ``storage``)::
+
+ # types
+ type {num} {bucket-name}
+
+For example::
+
+ # types
+ type 0 osd
+ type 1 host
+ type 2 chassis
+ type 3 rack
+ type 4 row
+ type 5 pdu
+ type 6 pod
+ type 7 room
+ type 8 datacenter
+ type 9 zone
+ type 10 region
+ type 11 root
+
+.. _crushmapbuckets:
+
+CRUSH-Map Bucket Hierarchy
+--------------------------
+
+The CRUSH algorithm distributes data objects among storage devices according to
+a per-device weight value, approximating a uniform probability distribution.
+CRUSH distributes objects and their replicas according to the hierarchical
+cluster map you define. The CRUSH map represents the available storage devices
+and the logical elements that contain them.
+
+To map placement groups (PGs) to OSDs across failure domains, a CRUSH map
+defines a hierarchical list of bucket types under ``#types`` in the generated
+CRUSH map. The purpose of creating a bucket hierarchy is to segregate the leaf
+nodes according to their failure domains (for example: hosts, chassis, racks,
+power distribution units, pods, rows, rooms, and data centers). With the
+exception of the leaf nodes that represent OSDs, the hierarchy is arbitrary and
+you may define it according to your own needs.
+
+We recommend adapting your CRUSH map to your preferred hardware-naming
+conventions and using bucket names that clearly reflect the physical
+hardware. Clear naming practice can make it easier to administer the cluster
+and easier to troubleshoot problems when OSDs malfunction (or other hardware
+malfunctions) and the administrator needs access to physical hardware.
+
+
+In the following example, the bucket hierarchy has a leaf bucket named ``osd``
+and two node buckets named ``host`` and ``rack``:
+
+.. ditaa::
+ +-----------+
+ | {o}rack |
+ | Bucket |
+ +-----+-----+
+ |
+ +---------------+---------------+
+ | |
+ +-----+-----+ +-----+-----+
+ | {o}host | | {o}host |
+ | Bucket | | Bucket |
+ +-----+-----+ +-----+-----+
+ | |
+ +-------+-------+ +-------+-------+
+ | | | |
+ +-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+
+ | osd | | osd | | osd | | osd |
+ | Bucket | | Bucket | | Bucket | | Bucket |
+ +-----------+ +-----------+ +-----------+ +-----------+
+
+.. note:: The higher-numbered ``rack`` bucket type aggregates the
+ lower-numbered ``host`` bucket type.
+
+Because leaf nodes reflect storage devices that have already been declared
+under the ``#devices`` list at the beginning of the CRUSH map, there is no need
+to declare them as bucket instances. The second-lowest bucket type in your
+hierarchy is typically used to aggregate the devices (that is, the
+second-lowest bucket type is usually the computer that contains the storage
+media and, such as ``node``, ``computer``, ``server``, ``host``, or
+``machine``). In high-density environments, it is common to have multiple hosts
+or nodes in a single chassis (for example, in the cases of blades or twins). It
+is important to anticipate the potential consequences of chassis failure -- for
+example, during the replacement of a chassis in case of a node failure, the
+chassis's hosts or nodes (and their associated OSDs) will be in a ``down``
+state.
+
+To declare a bucket instance, do the following: specify its type, give it a
+unique name (an alphanumeric string), assign it a unique ID expressed as a
+negative integer (this is optional), assign it a weight relative to the total
+capacity and capability of the item(s) in the bucket, assign it a bucket
+algorithm (usually ``straw2``), and specify the bucket algorithm's hash
+(usually ``0``, a setting that reflects the hash algorithm ``rjenkins1``). A
+bucket may have one or more items. The items may consist of node buckets or
+leaves. Items may have a weight that reflects the relative weight of the item.
+
+To declare a node bucket, use the following syntax::
+
+ [bucket-type] [bucket-name] {
+ id [a unique negative numeric ID]
+ weight [the relative capacity/capability of the item(s)]
+ alg [the bucket type: uniform | list | tree | straw | straw2 ]
+ hash [the hash type: 0 by default]
+ item [item-name] weight [weight]
+ }
+
+For example, in the above diagram, two host buckets (referred to in the
+declaration below as ``node1`` and ``node2``) and one rack bucket (referred to
+in the declaration below as ``rack1``) are defined. The OSDs are declared as
+items within the host buckets::
+
+ host node1 {
+ id -1
+ alg straw2
+ hash 0
+ item osd.0 weight 1.00
+ item osd.1 weight 1.00
+ }
+
+ host node2 {
+ id -2
+ alg straw2
+ hash 0
+ item osd.2 weight 1.00
+ item osd.3 weight 1.00
+ }
+
+ rack rack1 {
+ id -3
+ alg straw2
+ hash 0
+ item node1 weight 2.00
+ item node2 weight 2.00
+ }
+
+.. note:: In this example, the rack bucket does not contain any OSDs. Instead,
+ it contains lower-level host buckets and includes the sum of their weight in
+ the item entry.
+
+
+.. topic:: Bucket Types
+
+ Ceph supports five bucket types. Each bucket type provides a balance between
+ performance and reorganization efficiency, and each is different from the
+ others. If you are unsure of which bucket type to use, use the ``straw2``
+ bucket. For a more technical discussion of bucket types than is offered
+ here, see **Section 3.4** of `CRUSH - Controlled, Scalable, Decentralized
+ Placement of Replicated Data`_.
+
+ The bucket types are as follows:
+
+ #. **uniform**: Uniform buckets aggregate devices that have **exactly**
+ the same weight. For example, when hardware is commissioned or
+ decommissioned, it is often done in sets of machines that have exactly
+ the same physical configuration (this can be the case, for example,
+ after bulk purchases). When storage devices have exactly the same
+ weight, you may use the ``uniform`` bucket type, which allows CRUSH to
+ map replicas into uniform buckets in constant time. If your devices have
+ non-uniform weights, you should not use the uniform bucket algorithm.
+
+ #. **list**: List buckets aggregate their content as linked lists. The
+ behavior of list buckets is governed by the :abbr:`RUSH (Replication
+ Under Scalable Hashing)`:sub:`P` algorithm. In the behavior of this
+ bucket type, an object is either relocated to the newest device in
+ accordance with an appropriate probability, or it remains on the older
+ devices as before. This results in optimal data migration when items are
+ added to the bucket. The removal of items from the middle or the tail of
+ the list, however, can result in a significant amount of unnecessary
+ data movement. This means that list buckets are most suitable for
+ circumstances in which they **never shrink or very rarely shrink**.
+
+ #. **tree**: Tree buckets use a binary search tree. They are more efficient
+ at dealing with buckets that contain many items than are list buckets.
+ The behavior of tree buckets is governed by the :abbr:`RUSH (Replication
+ Under Scalable Hashing)`:sub:`R` algorithm. Tree buckets reduce the
+      placement time to O(log\ :sub:`n`). This means that tree buckets are
+ suitable for managing large sets of devices or nested buckets.
+
+ #. **straw**: Straw buckets allow all items in the bucket to "compete"
+ against each other for replica placement through a process analogous to
+ drawing straws. This is different from the behavior of list buckets and
+ tree buckets, which use a divide-and-conquer strategy that either gives
+ certain items precedence (for example, those at the beginning of a list)
+ or obviates the need to consider entire subtrees of items. Such an
+ approach improves the performance of the replica placement process, but
+ can also introduce suboptimal reorganization behavior when the contents
+      of a bucket change due to an addition, a removal, or the re-weighting of an
+ item.
+
+   #. **straw2**: Straw2 buckets improve on Straw by correctly avoiding
+      any data movement between items when neighbor weights change. For
+      example, if the weight of a given item changes (including during the
+      operations of adding it to the cluster or removing it from the
+      cluster), there will be data movement to or from only that item.
+      Neighbor weights are not taken into account.
+
+
+.. topic:: Hash
+
+ Each bucket uses a hash algorithm. As of Reef, Ceph supports the
+ ``rjenkins1`` algorithm. To select ``rjenkins1`` as the hash algorithm,
+ enter ``0`` as your hash setting.
+
+.. _weightingbucketitems:
+
+.. topic:: Weighting Bucket Items
+
+ Ceph expresses bucket weights as doubles, which allows for fine-grained
+ weighting. A weight is the relative difference between device capacities. We
+ recommend using ``1.00`` as the relative weight for a 1 TB storage device.
+ In such a scenario, a weight of ``0.50`` would represent approximately 500
+ GB, and a weight of ``3.00`` would represent approximately 3 TB. Buckets
+ higher in the CRUSH hierarchy have a weight that is the sum of the weight of
+ the leaf items aggregated by the bucket.
+
+
+.. _crushmaprules:
+
+CRUSH Map Rules
+---------------
+
+CRUSH maps contain rules that determine data placement for a pool: these are
+called "CRUSH rules". The default CRUSH map has one rule for each pool. If you
+are running a large cluster, you might create many pools and each of those
+pools might have its own non-default CRUSH rule.
+
+
+.. note:: In most cases, there is no need to modify the default rule. When a
+ new pool is created, by default the rule will be set to the value ``0``
+ (which indicates the default CRUSH rule, which has the numeric ID ``0``).
+
+CRUSH rules define policy that governs how data is distributed across the devices in
+the hierarchy. The rules define placement as well as replication strategies or
+distribution policies that allow you to specify exactly how CRUSH places data
+replicas. For example, you might create one rule selecting a pair of targets for
+two-way mirroring, another rule for selecting three targets in two different data
+centers for three-way replication, and yet another rule for erasure coding across
+six storage devices. For a detailed discussion of CRUSH rules, see **Section 3.2**
+of `CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_.
+
+A rule takes the following form::
+
+ rule <rulename> {
+
+ id [a unique integer ID]
+ type [replicated|erasure]
+ step take <bucket-name> [class <device-class>]
+ step [choose|chooseleaf] [firstn|indep] <N> type <bucket-type>
+ step emit
+ }
+
+
+``id``
+ :Description: A unique integer that identifies the rule.
+ :Purpose: A component of the rule mask.
+ :Type: Integer
+ :Required: Yes
+ :Default: 0
+
+
+``type``
+ :Description: Denotes the type of replication strategy to be enforced by the
+ rule.
+ :Purpose: A component of the rule mask.
+ :Type: String
+ :Required: Yes
+ :Default: ``replicated``
+ :Valid Values: ``replicated`` or ``erasure``
+
+
+``step take <bucket-name> [class <device-class>]``
+ :Description: Takes a bucket name and iterates down the tree. If
+ the ``device-class`` argument is specified, the argument must
+ match a class assigned to OSDs within the cluster. Only
+ devices belonging to the class are included.
+ :Purpose: A component of the rule.
+ :Required: Yes
+ :Example: ``step take data``
+
+
+
+``step choose firstn {num} type {bucket-type}``
+ :Description: Selects ``num`` buckets of the given type from within the
+ current bucket. ``{num}`` is usually the number of replicas in
+ the pool (in other words, the pool size).
+
+                - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (as many buckets as are available).
+                - If ``pool-num-replicas > {num} > 0``, choose that many buckets.
+                - If ``{num} < 0``, choose ``pool-num-replicas + {num}`` buckets (that is, the pool size minus the absolute value of ``{num}``).
+
+ :Purpose: A component of the rule.
+ :Prerequisite: Follows ``step take`` or ``step choose``.
+ :Example: ``step choose firstn 1 type row``
+
+
+``step chooseleaf firstn {num} type {bucket-type}``
+ :Description: Selects a set of buckets of the given type and chooses a leaf
+ node (that is, an OSD) from the subtree of each bucket in that set of buckets. The
+ number of buckets in the set is usually the number of replicas in
+ the pool (in other words, the pool size).
+
+                - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (as many buckets as are available).
+                - If ``pool-num-replicas > {num} > 0``, choose that many buckets.
+                - If ``{num} < 0``, choose ``pool-num-replicas + {num}`` buckets (that is, the pool size minus the absolute value of ``{num}``).
+ :Purpose: A component of the rule. Using ``chooseleaf`` obviates the need to select a device in a separate step.
+ :Prerequisite: Follows ``step take`` or ``step choose``.
+ :Example: ``step chooseleaf firstn 0 type row``
+
+
+``step emit``
+ :Description: Outputs the current value on the top of the stack and empties
+ the stack. Typically used
+ at the end of a rule, but may also be used to choose from different
+ trees in the same rule.
+
+ :Purpose: A component of the rule.
+ :Prerequisite: Follows ``step choose``.
+ :Example: ``step emit``
+
+.. important:: A single CRUSH rule can be assigned to multiple pools, but
+ a single pool cannot have multiple CRUSH rules.
+
+``firstn`` or ``indep``
+
+ :Description: Determines which replacement strategy CRUSH uses when items (OSDs)
+ are marked ``down`` in the CRUSH map. When this rule is used
+ with replicated pools, ``firstn`` is used. When this rule is
+ used with erasure-coded pools, ``indep`` is used.
+
+ Suppose that a PG is stored on OSDs 1, 2, 3, 4, and 5 and then
+ OSD 3 goes down.
+
+ When in ``firstn`` mode, CRUSH simply adjusts its calculation
+ to select OSDs 1 and 2, then selects 3 and discovers that 3 is
+ down, retries and selects 4 and 5, and finally goes on to
+ select a new OSD: OSD 6. The final CRUSH mapping
+ transformation is therefore 1, 2, 3, 4, 5 → 1, 2, 4, 5, 6.
+
+ However, if you were storing an erasure-coded pool, the above
+ sequence would have changed the data that is mapped to OSDs 4,
+ 5, and 6. The ``indep`` mode attempts to avoid this unwanted
+ consequence. When in ``indep`` mode, CRUSH can be expected to
+ select 3, discover that 3 is down, retry, and select 6. The
+ final CRUSH mapping transformation is therefore 1, 2, 3, 4, 5
+ → 1, 2, 6, 4, 5.
+
+.. _crush-reclassify:
+
+Migrating from a legacy SSD rule to device classes
+--------------------------------------------------
+
+Prior to the Luminous release's introduction of the *device class* feature, in
+order to write rules that applied to a specialized device type (for example,
+SSD), it was necessary to manually edit the CRUSH map and maintain a parallel
+hierarchy for each device type. The device class feature provides a more
+transparent way to achieve this end.
+
+However, if your cluster is migrated from an existing manually-customized
+per-device map to new device class-based rules, all data in the system will be
+reshuffled.
+
+The ``crushtool`` utility has several commands that can transform a legacy rule
+and hierarchy and allow you to start using the new device class rules. There
+are three possible types of transformation:
+
+#. ``--reclassify-root <root-name> <device-class>``
+
+ This command examines everything under ``root-name`` in the hierarchy and
+ rewrites any rules that reference the specified root and that have the
+ form ``take <root-name>`` so that they instead have the
+ form ``take <root-name> class <device-class>``. The command also renumbers
+ the buckets in such a way that the old IDs are used for the specified
+ class's "shadow tree" and as a result no data movement takes place.
+
+ For example, suppose you have the following as an existing rule::
+
+ rule replicated_rule {
+ id 0
+ type replicated
+ step take default
+ step chooseleaf firstn 0 type rack
+ step emit
+ }
+
+ If the root ``default`` is reclassified as class ``hdd``, the new rule will
+ be as follows::
+
+ rule replicated_rule {
+ id 0
+ type replicated
+ step take default class hdd
+ step chooseleaf firstn 0 type rack
+ step emit
+ }
+
+#. ``--set-subtree-class <bucket-name> <device-class>``
+
+ This command marks every device in the subtree that is rooted at *bucket-name*
+ with the specified device class.
+
+ This command is typically used in conjunction with the ``--reclassify-root`` option
+ in order to ensure that all devices in that root are labeled with the
+ correct class. In certain circumstances, however, some of those devices
+ are correctly labeled with a different class and must not be relabeled. To
+ manage this difficulty, one can exclude the ``--set-subtree-class``
+ option. The remapping process will not be perfect, because the previous rule
+ had an effect on devices of multiple classes but the adjusted rules will map
+ only to devices of the specified device class. However, when there are not many
+ outlier devices, the resulting level of data movement is often within tolerable
+ limits.
+
+
+#. ``--reclassify-bucket <match-pattern> <device-class> <default-parent>``
+
+ This command allows you to merge a parallel type-specific hierarchy with the
+ normal hierarchy. For example, many users have maps that resemble the
+ following::
+
+ host node1 {
+ id -2 # do not change unnecessarily
+ # weight 109.152
+ alg straw2
+ hash 0 # rjenkins1
+ item osd.0 weight 9.096
+ item osd.1 weight 9.096
+ item osd.2 weight 9.096
+ item osd.3 weight 9.096
+ item osd.4 weight 9.096
+ item osd.5 weight 9.096
+ ...
+ }
+
+ host node1-ssd {
+ id -10 # do not change unnecessarily
+ # weight 2.000
+ alg straw2
+ hash 0 # rjenkins1
+ item osd.80 weight 2.000
+ ...
+ }
+
+ root default {
+ id -1 # do not change unnecessarily
+ alg straw2
+ hash 0 # rjenkins1
+ item node1 weight 110.967
+ ...
+ }
+
+ root ssd {
+ id -18 # do not change unnecessarily
+ # weight 16.000
+ alg straw2
+ hash 0 # rjenkins1
+ item node1-ssd weight 2.000
+ ...
+ }
+
+ This command reclassifies each bucket that matches a certain
+ pattern. The pattern can be of the form ``%suffix`` or ``prefix%``. For
+ example, in the above example, we would use the pattern
+ ``%-ssd``. For each matched bucket, the remaining portion of the
+ name (corresponding to the ``%`` wildcard) specifies the *base bucket*. All
+ devices in the matched bucket are labeled with the specified
+ device class and then moved to the base bucket. If the base bucket
+ does not exist (for example, ``node12-ssd`` exists but ``node12`` does
+ not), then it is created and linked under the specified
+ *default parent* bucket. In each case, care is taken to preserve
+ the old bucket IDs for the new shadow buckets in order to prevent data
+ movement. Any rules with ``take`` steps that reference the old
+ buckets are adjusted accordingly.
+
+
+#. ``--reclassify-bucket <bucket-name> <device-class> <base-bucket>``
+
+   The same command can also be used without a wildcard in order to map a
+   single bucket. Continuing the previous example, we want the ``ssd`` bucket
+   to be mapped to the ``default`` bucket.
+
+#. The final command to convert the map that consists of the above fragments
+ resembles the following:
+
+ .. prompt:: bash $
+
+ ceph osd getcrushmap -o original
+ crushtool -i original --reclassify \
+ --set-subtree-class default hdd \
+ --reclassify-root default hdd \
+ --reclassify-bucket %-ssd ssd default \
+ --reclassify-bucket ssd ssd default \
+ -o adjusted
+
+``--compare`` flag
+------------------
+
+A ``--compare`` flag is available to make sure that the conversion performed in
+:ref:`Migrating from a legacy SSD rule to device classes <crush-reclassify>` is
+correct. This flag tests a large sample of inputs against the CRUSH map and
+checks that the expected result is output. The options that control these
+inputs are the same as the options that apply to the ``--test`` command. For an
+illustration of how this ``--compare`` command applies to the above example,
+see the following:
+
+.. prompt:: bash $
+
+ crushtool -i original --compare adjusted
+
+::
+
+ rule 0 had 0/10240 mismatched mappings (0)
+ rule 1 had 0/10240 mismatched mappings (0)
+ maps appear equivalent
+
+If the command finds any differences, the ratio of remapped inputs is reported
+in parentheses.
+
+When you are satisfied with the adjusted map, apply it to the cluster by
+running the following command:
+
+.. prompt:: bash $
+
+ ceph osd setcrushmap -i adjusted
+
+Manually Tuning CRUSH
+---------------------
+
+If you have verified that all clients are running recent code, you can adjust
+the CRUSH tunables by extracting the CRUSH map, modifying the values, and
+reinjecting the map into the cluster. The procedure is carried out as follows:
+
+#. Extract the latest CRUSH map:
+
+ .. prompt:: bash $
+
+ ceph osd getcrushmap -o /tmp/crush
+
+#. Adjust tunables. In our tests, the following values appear to result in the
+ best behavior for both large and small clusters. The procedure requires that
+ you specify the ``--enable-unsafe-tunables`` flag in the ``crushtool``
+ command. Use this option with **extreme care**:
+
+ .. prompt:: bash $
+
+ crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
+
+#. Reinject the modified map:
+
+ .. prompt:: bash $
+
+ ceph osd setcrushmap -i /tmp/crush.new
+
+Legacy values
+-------------
+
+To set the legacy values of the CRUSH tunables, run the following command:
+
+.. prompt:: bash $
+
+ crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy
+
+The special ``--enable-unsafe-tunables`` flag is required. Be careful when
+running old versions of the ``ceph-osd`` daemon after reverting to legacy
+values, because the feature bit is not perfectly enforced.
+
+.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf
diff --git a/doc/rados/operations/crush-map.rst b/doc/rados/operations/crush-map.rst
new file mode 100644
index 000000000..39151e6d4
--- /dev/null
+++ b/doc/rados/operations/crush-map.rst
@@ -0,0 +1,1147 @@
+============
+ CRUSH Maps
+============
+
+The :abbr:`CRUSH (Controlled Replication Under Scalable Hashing)` algorithm
+computes storage locations in order to determine how to store and retrieve
+data. CRUSH allows Ceph clients to communicate with OSDs directly rather than
+through a centralized server or broker. By using an algorithmically-determined
+method of storing and retrieving data, Ceph avoids a single point of failure, a
+performance bottleneck, and a physical limit to its scalability.
+
+CRUSH uses a map of the cluster (the CRUSH map) to map data to OSDs,
+distributing the data across the cluster in accordance with configured
+replication policy and failure domains. For a detailed discussion of CRUSH, see
+`CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data`_
+
+CRUSH maps contain a list of :abbr:`OSDs (Object Storage Devices)` and a
+hierarchy of "buckets" (``host``\s, ``rack``\s) and rules that govern how CRUSH
+replicates data within the cluster's pools. By reflecting the underlying
+physical organization of the installation, CRUSH can model (and thereby
+address) the potential for correlated device failures. Some factors relevant
+to the CRUSH hierarchy include chassis, racks, physical proximity, a shared
+power source, shared networking, and failure domains. By encoding this
+information into the CRUSH map, CRUSH placement policies distribute object
+replicas across failure domains while maintaining the desired distribution. For
+example, to address the possibility of concurrent failures, it might be
+desirable to ensure that data replicas are on devices that reside in or rely
+upon different shelves, racks, power supplies, controllers, or physical
+locations.
+
+When OSDs are deployed, they are automatically added to the CRUSH map under a
+``host`` bucket that is named for the node on which the OSDs run. This
+behavior, combined with the configured CRUSH failure domain, ensures that
+replicas or erasure-code shards are distributed across hosts and that the
+failure of a single host or other kinds of failures will not affect
+availability. For larger clusters, administrators must carefully consider their
+choice of failure domain. For example, distributing replicas across racks is
+typical for mid- to large-sized clusters.
+
+
+CRUSH Location
+==============
+
+The location of an OSD within the CRUSH map's hierarchy is referred to as its
+``CRUSH location``. The specification of a CRUSH location takes the form of a
+list of key-value pairs. For example, if an OSD is in a particular row, rack,
+chassis, and host, and is also part of the 'default' CRUSH root (which is the
+case for most clusters), its CRUSH location can be specified as follows::
+
+ root=default row=a rack=a2 chassis=a2a host=a2a1
+
+.. note::
+
+ #. The order of the keys does not matter.
+ #. The key name (left of ``=``) must be a valid CRUSH ``type``. By default,
+ valid CRUSH types include ``root``, ``datacenter``, ``room``, ``row``,
+ ``pod``, ``pdu``, ``rack``, ``chassis``, and ``host``. These defined
+ types suffice for nearly all clusters, but can be customized by
+ modifying the CRUSH map.
+ #. Not all keys need to be specified. For example, by default, Ceph
+ automatically sets an ``OSD``'s location as ``root=default
+ host=HOSTNAME`` (as determined by the output of ``hostname -s``).
+
+The CRUSH location for an OSD can be modified by adding the ``crush location``
+option in ``ceph.conf``. When this option has been added, every time the OSD
+starts it verifies that it is in the correct location in the CRUSH map and
+moves itself if it is not. To disable this automatic CRUSH map management, add
+the following to the ``ceph.conf`` configuration file in the ``[osd]``
+section::
+
+ osd crush update on start = false
+
+Note that this action is unnecessary in most cases.
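+
+For example (the values here are illustrative), an OSD can be pinned to a
+particular rack by adding a line such as the following to the ``[osd]``
+section of ``ceph.conf``::
+
+    crush location = root=default rack=a2 host=a2a1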
+
+
+Custom location hooks
+---------------------
+
+A custom location hook can be used to generate a more complete CRUSH location
+on startup. The CRUSH location is determined by, in order of preference:
+
+#. A ``crush location`` option in ``ceph.conf``
+#. A default of ``root=default host=HOSTNAME`` where the hostname is determined
+ by the output of the ``hostname -s`` command
+
+A script can be written to provide additional location fields (for example,
+``rack`` or ``datacenter``) and the hook can be enabled via the following
+config option::
+
+ crush location hook = /path/to/customized-ceph-crush-location
+
+This hook is passed several arguments (see below) and must output a single
+line to ``stdout`` that contains the CRUSH location description. The
+arguments passed to the hook resemble the following::
+
+ --cluster CLUSTER --id ID --type TYPE
+
+Here the cluster name is typically ``ceph``, the ``id`` is the daemon
+identifier or (in the case of OSDs) the OSD number, and the daemon type is
+``osd``, ``mds``, ``mgr``, or ``mon``.
+
+For example, a simple hook that specifies a rack location via a value in the
+file ``/etc/rack`` might be as follows::
+
+ #!/bin/sh
+ echo "host=$(hostname -s) rack=$(cat /etc/rack) root=default"
+
+
+CRUSH structure
+===============
+
+The CRUSH map consists of (1) a hierarchy that describes the physical topology
+of the cluster and (2) a set of rules that defines data placement policy. The
+hierarchy has devices (OSDs) at the leaves and internal nodes corresponding to
+other physical features or groupings: hosts, racks, rows, data centers, and so
+on. The rules determine how replicas are placed in terms of that hierarchy (for
+example, 'three replicas in different racks').
+
+Devices
+-------
+
+Devices are individual OSDs that store data (usually one device for each
+storage drive). Devices are identified by an ``id`` (a non-negative integer)
+and a ``name`` (usually ``osd.N``, where ``N`` is the device's ``id``).
+
+In Luminous and later releases, OSDs can have a *device class* assigned (for
+example, ``hdd`` or ``ssd`` or ``nvme``), allowing them to be targeted by CRUSH
+rules. Device classes are especially useful when mixing device types within
+hosts.
+
+.. _crush_map_default_types:
+
+Types and Buckets
+-----------------
+
+"Bucket", in the context of CRUSH, is a term for any of the internal nodes in
+the hierarchy: hosts, racks, rows, and so on. The CRUSH map defines a series of
+*types* that are used to identify these nodes. Default types include:
+
+- ``osd`` (or ``device``)
+- ``host``
+- ``chassis``
+- ``rack``
+- ``row``
+- ``pdu``
+- ``pod``
+- ``room``
+- ``datacenter``
+- ``zone``
+- ``region``
+- ``root``
+
+Most clusters use only a handful of these types, and other types can be defined
+as needed.
+
+The hierarchy is built with devices (normally of type ``osd``) at the leaves
+and non-device types as the internal nodes. The root node is of type ``root``.
+For example:
+
+
+.. ditaa::
+
+ +-----------------+
+ |{o}root default |
+ +--------+--------+
+ |
+ +---------------+---------------+
+ | |
+ +------+------+ +------+------+
+ |{o}host foo | |{o}host bar |
+ +------+------+ +------+------+
+ | |
+ +-------+-------+ +-------+-------+
+ | | | |
+ +-----+-----+ +-----+-----+ +-----+-----+ +-----+-----+
+ | osd.0 | | osd.1 | | osd.2 | | osd.3 |
+ +-----------+ +-----------+ +-----------+ +-----------+
+
+
+Each node (device or bucket) in the hierarchy has a *weight* that indicates the
+relative proportion of the total data that should be stored by that device or
+hierarchy subtree. Weights are set at the leaves, indicating the size of the
+device. These weights automatically sum in an 'up the tree' direction: that is,
+the weight of the ``root`` node will be the sum of the weights of all devices
+contained under it. Weights are typically measured in tebibytes (TiB).
+
+To get a simple view of the cluster's CRUSH hierarchy, including weights, run
+the following command:
+
+.. prompt:: bash $
+
+ ceph osd tree
+
+Rules
+-----
+
+CRUSH rules define policy governing how data is distributed across the devices
+in the hierarchy. The rules define placement as well as replication strategies
+or distribution policies that allow you to specify exactly how CRUSH places
+data replicas. For example, you might create one rule selecting a pair of
+targets for two-way mirroring, another rule for selecting three targets in two
+different data centers for three-way replication, and yet another rule for
+erasure coding across six storage devices. For a detailed discussion of CRUSH
+rules, see **Section 3.2** of `CRUSH - Controlled, Scalable, Decentralized
+Placement of Replicated Data`_.
+
+CRUSH rules can be created via the command-line by specifying the *pool type*
+that they will govern (replicated or erasure coded), the *failure domain*, and
+optionally a *device class*. In rare cases, CRUSH rules must be created by
+manually editing the CRUSH map.
+
+To see the rules that are defined for the cluster, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush rule ls
+
+To view the contents of the rules, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush rule dump
+
+.. _device_classes:
+
+Device classes
+--------------
+
+Each device can optionally have a *class* assigned. By default, OSDs
+automatically set their class at startup to ``hdd``, ``ssd``, or ``nvme`` in
+accordance with the type of device they are backed by.
+
+To explicitly set the device class of one or more OSDs, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd crush set-device-class <class> <osd-name> [...]
+
+Once a device class has been set, it cannot be changed to another class until
+the old class is unset. To remove the old class of one or more OSDs, run a
+command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush rm-device-class <osd-name> [...]
+
+This restriction allows administrators to set device classes that won't be
+changed on OSD restart or by a script.
+
+To create a placement rule that targets a specific device class, run a command
+of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush rule create-replicated <rule-name> <root> <failure-domain> <class>
+
+To apply the new placement rule to a specific pool, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> crush_rule <rule-name>
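+
+As an illustration, the following hypothetical sequence creates a rule named
+``fast`` that places replicas on ``ssd`` devices with a ``host`` failure
+domain, and then assigns the rule to a hypothetical pool named ``testpool``:
+
+.. prompt:: bash $
+
+   ceph osd crush rule create-replicated fast default host ssd
+   ceph osd pool set testpool crush_rule fast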
+
+Device classes are implemented by creating one or more "shadow" CRUSH
+hierarchies. For each device class in use, there will be a shadow hierarchy
+that contains only devices of that class. CRUSH rules can then distribute data
+across the relevant shadow hierarchy. This approach is fully backward
+compatible with older Ceph clients. To view the CRUSH hierarchy with shadow
+items displayed, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd crush tree --show-shadow
+
+Some older clusters that were created before the Luminous release rely on
+manually crafted CRUSH maps to maintain per-device-type hierarchies. For these
+clusters, there is a *reclassify* tool available that can help them transition
+to device classes without triggering unwanted data movement (see
+:ref:`crush-reclassify`).
+
+Weight sets
+-----------
+
+A *weight set* is an alternative set of weights to use when calculating data
+placement. The normal weights associated with each device in the CRUSH map are
+set in accordance with the device size and indicate how much data should be
+stored where. However, because CRUSH is a probabilistic pseudorandom placement
+process, there is always some variation from this ideal distribution (in the
+same way that rolling a die sixty times will likely not result in exactly ten
+ones and ten sixes). Weight sets allow the cluster to perform numerical
+optimization based on the specifics of your cluster (for example: hierarchy,
+pools) to achieve a balanced distribution.
+
+Ceph supports two types of weight sets:
+
+#. A **compat** weight set is a single alternative set of weights for each
+ device and each node in the cluster. Compat weight sets cannot be expected
+ to correct all anomalies (for example, PGs for different pools might be of
+ different sizes and have different load levels, but are mostly treated alike
+ by the balancer). However, they have the major advantage of being *backward
+ compatible* with previous versions of Ceph. This means that even though
+ weight sets were first introduced in Luminous v12.2.z, older clients (for
+ example, Firefly) can still connect to the cluster when a compat weight set
+ is being used to balance data.
+
+#. A **per-pool** weight set is more flexible in that it allows placement to
+ be optimized for each data pool. Additionally, weights can be adjusted
+ for each position of placement, allowing the optimizer to correct for a
+ subtle skew of data toward devices with small weights relative to their
+ peers (an effect that is usually apparent only in very large clusters
+ but that can cause balancing problems).
+
+When weight sets are in use, the weights associated with each node in the
+hierarchy are visible in a separate column (labeled either as ``(compat)`` or
+as the pool name) in the output of the following command:
+
+.. prompt:: bash #
+
+ ceph osd tree
+
+If both *compat* and *per-pool* weight sets are in use, data placement for a
+particular pool will use its own per-pool weight set if present. If only
+*compat* weight sets are in use, data placement will use the compat weight set.
+If neither are in use, data placement will use the normal CRUSH weights.
+
+Although weight sets can be set up and adjusted manually, we recommend enabling
+the ``ceph-mgr`` *balancer* module to perform these tasks automatically if the
+cluster is running Luminous or a later release.
+
+Modifying the CRUSH map
+=======================
+
+.. _addosd:
+
+Adding/Moving an OSD
+--------------------
+
+.. note:: Under normal conditions, OSDs automatically add themselves to the
+ CRUSH map when they are created. The command in this section is rarely
+ needed.
+
+
+To add or move an OSD in the CRUSH map of a running cluster, run a command of
+the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush set {name} {weight} root={root} [{bucket-type}={bucket-name} ...]
+
+For details on this command's parameters, see the following:
+
+``name``
+ :Description: The full name of the OSD.
+ :Type: String
+ :Required: Yes
+ :Example: ``osd.0``
+
+
+``weight``
+ :Description: The CRUSH weight of the OSD. Normally, this is its size, as measured in terabytes (TB).
+ :Type: Double
+ :Required: Yes
+ :Example: ``2.0``
+
+
+``root``
+ :Description: The root node of the CRUSH hierarchy in which the OSD resides (normally ``default``).
+ :Type: Key-value pair.
+ :Required: Yes
+ :Example: ``root=default``
+
+
+``bucket-type``
+ :Description: The OSD's location in the CRUSH hierarchy.
+ :Type: Key-value pairs.
+ :Required: No
+ :Example: ``datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1``
+
+In the following example, the command adds ``osd.0`` to the hierarchy, or moves
+``osd.0`` from a previous location:
+
+.. prompt:: bash $
+
+ ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1
+
+
+Adjusting OSD weight
+--------------------
+
+.. note:: Under normal conditions, OSDs automatically add themselves to the
+ CRUSH map with the correct weight when they are created. The command in this
+ section is rarely needed.
+
+To adjust an OSD's CRUSH weight in a running cluster, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd crush reweight {name} {weight}
+
+For details on this command's parameters, see the following:
+
+``name``
+ :Description: The full name of the OSD.
+ :Type: String
+ :Required: Yes
+ :Example: ``osd.0``
+
+
+``weight``
+ :Description: The CRUSH weight of the OSD.
+ :Type: Double
+ :Required: Yes
+ :Example: ``2.0``
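+
+In the following example, which combines the example values above, the
+command sets the CRUSH weight of ``osd.0`` to ``2.0``:
+
+.. prompt:: bash $
+
+   ceph osd crush reweight osd.0 2.0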
+
+
+.. _removeosd:
+
+Removing an OSD
+---------------
+
+.. note:: OSDs are normally removed from the CRUSH map by the ``ceph osd
+   purge`` command. The command in this section is rarely needed.
+
+To remove an OSD from the CRUSH map of a running cluster, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd crush remove {name}
+
+For details on the ``name`` parameter, see the following:
+
+``name``
+ :Description: The full name of the OSD.
+ :Type: String
+ :Required: Yes
+ :Example: ``osd.0``
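+
+In the following example, the command removes ``osd.0`` from the CRUSH map:
+
+.. prompt:: bash $
+
+   ceph osd crush remove osd.0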
+
+
+Adding a CRUSH Bucket
+---------------------
+
+.. note:: Buckets are implicitly created when an OSD is added and the command
+ that creates it specifies a ``{bucket-type}={bucket-name}`` as part of the
+ OSD's location (provided that a bucket with that name does not already
+ exist). The command in this section is typically used when manually
+ adjusting the structure of the hierarchy after OSDs have already been
+ created. One use of this command is to move a series of hosts to a new
+ rack-level bucket. Another use of this command is to add new ``host``
+   buckets (OSD nodes) to a dummy ``root`` so that the buckets don't receive
+   any data until they are ready. When they are ready, move the buckets to the
+   ``default`` root or to any other root as described below.
+
+To add a bucket in the CRUSH map of a running cluster, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd crush add-bucket {bucket-name} {bucket-type}
+
+For details on this command's parameters, see the following:
+
+``bucket-name``
+ :Description: The full name of the bucket.
+ :Type: String
+ :Required: Yes
+ :Example: ``rack12``
+
+
+``bucket-type``
+ :Description: The type of the bucket. This type must already exist in the CRUSH hierarchy.
+ :Type: String
+ :Required: Yes
+ :Example: ``rack``
+
+In the following example, the command adds the ``rack12`` bucket to the hierarchy:
+
+.. prompt:: bash $
+
+ ceph osd crush add-bucket rack12 rack
+
+Moving a Bucket
+---------------
+
+To move a bucket to a different location or position in the CRUSH map
+hierarchy, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
+
+For details on this command's parameters, see the following:
+
+``bucket-name``
+ :Description: The name of the bucket that you are moving.
+ :Type: String
+ :Required: Yes
+ :Example: ``foo-bar-1``
+
+``bucket-type``
+ :Description: The bucket's new location in the CRUSH hierarchy.
+ :Type: Key-value pairs.
+ :Required: No
+ :Example: ``datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1``
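+
+In the following example, which uses the example values above, the command
+moves the ``foo-bar-1`` bucket to a new position in the hierarchy:
+
+.. prompt:: bash $
+
+   ceph osd crush move foo-bar-1 datacenter=dc1 room=room1 row=foo rack=bar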
+
+Removing a Bucket
+-----------------
+
+To remove a bucket from the CRUSH hierarchy, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph osd crush remove {bucket-name}
+
+.. note:: A bucket must already be empty before it is removed from the CRUSH
+ hierarchy. In other words, there must not be OSDs or any other CRUSH buckets
+ within it.
+
+For details on the ``bucket-name`` parameter, see the following:
+
+``bucket-name``
+ :Description: The name of the bucket that is being removed.
+ :Type: String
+ :Required: Yes
+ :Example: ``rack12``
+
+In the following example, the command removes the ``rack12`` bucket from the
+hierarchy:
+
+.. prompt:: bash $
+
+ ceph osd crush remove rack12
+
+Creating a compat weight set
+----------------------------
+
+.. note:: Normally this action is done automatically if needed by the
+ ``balancer`` module (provided that the module is enabled).
+
+To create a *compat* weight set, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set create-compat
+
+To adjust the weights of the compat weight set, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set reweight-compat {name} {weight}
+
+To destroy the compat weight set, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set rm-compat
+
+Creating per-pool weight sets
+-----------------------------
+
+To create a weight set for a specific pool, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set create {pool-name} {mode}
+
+.. note:: Per-pool weight sets can be used only if all servers and daemons are
+ running Luminous v12.2.z or a later release.
+
+For details on this command's parameters, see the following:
+
+``pool-name``
+ :Description: The name of a RADOS pool.
+ :Type: String
+ :Required: Yes
+ :Example: ``rbd``
+
+``mode``
+ :Description: Either ``flat`` or ``positional``. A *flat* weight set
+ assigns a single weight to all devices or buckets. A
+ *positional* weight set has a potentially different
+ weight for each position in the resulting placement
+ mapping. For example: if a pool has a replica count of
+ ``3``, then a positional weight set will have three
+ weights for each device and bucket.
+ :Type: String
+ :Required: Yes
+ :Example: ``flat``
+
+To adjust the weight of an item in a weight set, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set reweight {pool-name} {item-name} {weight [...]}
+
+To list existing weight sets, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set ls
+
+To remove a weight set, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush weight-set rm {pool-name}
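+
+As a hypothetical example (assuming a pool named ``rbd`` with a replica count
+of ``3``), the following sequence creates a positional weight set and then
+slightly reduces the first-position weight of ``osd.0``:
+
+.. prompt:: bash $
+
+   ceph osd crush weight-set create rbd positional
+   ceph osd crush weight-set reweight rbd osd.0 0.9 1.0 1.0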
+
+
+Creating a rule for a replicated pool
+-------------------------------------
+
+When you create a CRUSH rule for a replicated pool, there is an important
+decision to make: selecting a failure domain. For example, if you select a
+failure domain of ``host``, then CRUSH will ensure that each replica of the
+data is stored on a unique host. Alternatively, if you select a failure domain
+of ``rack``, then each replica of the data will be stored in a different rack.
+Your selection of failure domain should be guided by the size of your cluster
+and by its CRUSH topology.
+
+The entire cluster hierarchy is typically nested beneath a root node that is
+named ``default``. If you have customized your hierarchy, you might want to
+create a rule nested beneath some other node in the hierarchy. In creating
+this rule for the customized hierarchy, the node type doesn't matter, and in
+particular the rule does not have to be nested beneath a ``root`` node.
+
+It is possible to create a rule that restricts data placement to a specific
+*class* of device. By default, Ceph OSDs automatically classify themselves as
+either ``hdd`` or ``ssd`` in accordance with the underlying type of device
+being used. These device classes can be customized. One might set the ``device
+class`` of OSDs to ``nvme`` to distinguish them from SATA SSDs, or one might set
+them to something arbitrary like ``ssd-testing`` or ``ssd-ethel`` so that rules
+and pools may be flexibly constrained to use (or avoid using) specific subsets
+of OSDs based on specific requirements.
+
+To create a rule for a replicated pool, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush rule create-replicated {name} {root} {failure-domain-type} [{class}]
+
+For details on this command's parameters, see the following:
+
+``name``
+ :Description: The name of the rule.
+ :Type: String
+ :Required: Yes
+ :Example: ``rbd-rule``
+
+``root``
+ :Description: The name of the CRUSH hierarchy node under which data is to be placed.
+ :Type: String
+ :Required: Yes
+ :Example: ``default``
+
+``failure-domain-type``
+ :Description: The type of CRUSH nodes used for the replicas of the failure domain.
+ :Type: String
+ :Required: Yes
+ :Example: ``rack``
+
+``class``
+ :Description: The device class on which data is to be placed.
+ :Type: String
+ :Required: No
+ :Example: ``ssd``
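+
+In the following example, which combines the example values above, the
+command creates a rule named ``rbd-rule`` that places each replica in a
+distinct ``rack``, using only ``ssd`` devices under the ``default`` root:
+
+.. prompt:: bash $
+
+   ceph osd crush rule create-replicated rbd-rule default rack ssd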
+
+Creating a rule for an erasure-coded pool
+-----------------------------------------
+
+For an erasure-coded pool, similar decisions need to be made: what the failure
+domain is, which node in the hierarchy data will be placed under (usually
+``default``), and whether placement is restricted to a specific device class.
+However, erasure-code pools are created in a different way: there is a need to
+construct them carefully with reference to the erasure code plugin in use. For
+this reason, these decisions must be incorporated into the **erasure-code
+profile**. A CRUSH rule will then be created from the erasure-code profile,
+either explicitly or automatically when the profile is used to create a pool.
+
+To list the erasure-code profiles, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile ls
+
+To view a specific existing profile, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile get {profile-name}
+
+Under normal conditions, profiles should never be modified; instead, a new
+profile should be created and used when creating either a new pool or a new
+rule for an existing pool.
+
+An erasure-code profile consists of a set of key-value pairs. Most of these
+key-value pairs govern the behavior of the erasure code that encodes data in
+the pool. However, key-value pairs that begin with ``crush-`` govern the CRUSH
+rule that is created.
+
+The relevant erasure-code profile properties are as follows:
+
+ * **crush-root**: the name of the CRUSH node under which to place data
+ [default: ``default``].
+ * **crush-failure-domain**: the CRUSH bucket type used in the distribution of
+ erasure-coded shards [default: ``host``].
+ * **crush-device-class**: the device class on which to place data [default:
+ none, which means that all devices are used].
+ * **k** and **m** (and, for the ``lrc`` plugin, **l**): these determine the
+ number of erasure-code shards, affecting the resulting CRUSH rule.
+
+After a profile is defined, you can create a CRUSH rule by running a command
+of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush rule create-erasure {name} {profile-name}
+
+.. note:: When creating a new pool, it is not necessary to create the rule
+   explicitly. If only the erasure-code profile is specified and the rule
+   argument is omitted, then Ceph will create the CRUSH rule automatically.
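+
+As an illustrative sketch (the profile and rule names here are hypothetical),
+a profile and a rule derived from it might be created as follows:
+
+.. prompt:: bash $
+
+   ceph osd erasure-code-profile set myprofile k=4 m=2 crush-failure-domain=rack crush-device-class=ssd
+   ceph osd crush rule create-erasure myprofile-rule myprofile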
+
+
+Deleting rules
+--------------
+
+To delete rules that are not in use by pools, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph osd crush rule rm {rule-name}
+
+.. _crush-map-tunables:
+
+Tunables
+========
+
+The CRUSH algorithm that is used to calculate the placement of data has been
+improved over time. In order to support changes in behavior, we have provided
+users with sets of tunables that determine which legacy or optimal version of
+CRUSH is to be used.
+
+In order to use newer tunables, all Ceph clients and daemons must support the
+new major release of CRUSH. Because of this requirement, we have created
+``profiles`` that are named after the Ceph version in which they were
+introduced. For example, the ``firefly`` tunables were first supported by the
+Firefly release and do not work with older clients (for example, clients
+running Dumpling). After a cluster's tunables profile is changed from a legacy
+set to a newer or ``optimal`` set, the ``ceph-mon`` and ``ceph-osd`` options
+will prevent older clients that do not support the new CRUSH features from
+connecting to the cluster.
+
+argonaut (legacy)
+-----------------
+
+The legacy CRUSH behavior used by Argonaut and older releases works fine for
+most clusters, provided that not many OSDs have been marked ``out``.
+
+bobtail (CRUSH_TUNABLES2)
+-------------------------
+
+The ``bobtail`` tunable profile fixes the following problems:
+
+ * For hierarchies with a small number of devices in leaf buckets, some PGs
+ might map to fewer than the desired number of replicas, resulting in
+ ``undersized`` PGs. This is known to happen in the case of hierarchies with
+ ``host`` nodes that have a small number of OSDs (1 to 3) nested beneath each
+ host.
+
+ * For large clusters, a small percentage of PGs might map to fewer than the
+ desired number of OSDs. This is known to happen when there are multiple
+   hierarchy layers in use (for example, ``row``, ``rack``, ``host``,
+ ``osd``).
+
+ * When one or more OSDs are marked ``out``, data tends to be redistributed
+ to nearby OSDs instead of across the entire hierarchy.
+
+The tunables introduced in the Bobtail release are as follows:
+
+ * ``choose_local_tries``: Number of local retries. The legacy value is ``2``,
+ and the optimal value is ``0``.
+
+ * ``choose_local_fallback_tries``: The legacy value is ``5``, and the optimal
+   value is ``0``.
+
+ * ``choose_total_tries``: Total number of attempts to choose an item. The
+ legacy value is ``19``, but subsequent testing indicates that a value of
+ ``50`` is more appropriate for typical clusters. For extremely large
+ clusters, an even larger value might be necessary.
+
+ * ``chooseleaf_descend_once``: Whether a recursive ``chooseleaf`` attempt will
+ retry, or try only once and allow the original placement to retry. The
+ legacy default is ``0``, and the optimal value is ``1``.
+
+Migration impact:
+
+ * Moving from the ``argonaut`` tunables to the ``bobtail`` tunables triggers a
+ moderate amount of data movement. Use caution on a cluster that is already
+ populated with data.
+
+firefly (CRUSH_TUNABLES3)
+-------------------------
+
+chooseleaf_vary_r
+~~~~~~~~~~~~~~~~~
+
+The ``firefly`` tunable profile fixes a problem with ``chooseleaf`` CRUSH step
+behavior. This problem arose when a large fraction of OSDs were marked
+``out``, which resulted in PG mappings with too few OSDs.
+
+This profile was introduced in the Firefly release, and adds a new tunable as follows:
+
+ * ``chooseleaf_vary_r``: Whether a recursive chooseleaf attempt will start
+ with a non-zero value of ``r``, as determined by the number of attempts the
+ parent has already made. The legacy default value is ``0``, but with this
+ value CRUSH is sometimes unable to find a mapping. The optimal value (in
+ terms of computational cost and correctness) is ``1``.
+
+Migration impact:
+
+ * For existing clusters that store a great deal of data, changing this tunable
+ from ``0`` to ``1`` will trigger a large amount of data migration; a value
+ of ``4`` or ``5`` will allow CRUSH to still find a valid mapping and will
+ cause less data to move.
+
+straw_calc_version tunable
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There were problems with the internal weights calculated and stored in the
+CRUSH map for ``straw`` algorithm buckets. When there were buckets with a CRUSH
+weight of ``0`` or with a mix of different and unique weights, CRUSH would
+distribute data incorrectly (that is, not in proportion to the weights).
+
+This tunable, introduced in the Firefly release, is as follows:
+
+ * ``straw_calc_version``: A value of ``0`` preserves the old, broken
+ internal-weight calculation; a value of ``1`` fixes the problem.
+
+Migration impact:
+
+ * Changing this tunable to a value of ``1`` and then adjusting a straw bucket
+ (either by adding, removing, or reweighting an item or by using the
+ reweight-all command) can trigger a small to moderate amount of data
+ movement provided that the cluster has hit one of the problematic
+ conditions.
+
+This tunable option is notable in that it has absolutely no impact on the
+required kernel version on the client side.
+
+hammer (CRUSH_V4)
+-----------------
+
+Simply changing to the ``hammer`` tunable profile does not affect the mapping
+produced by existing CRUSH maps. However:
+
+ * There is a new bucket algorithm supported: ``straw2``. This new algorithm
+ fixes several limitations in the original ``straw``. More specifically, the
+ old ``straw`` buckets would change some mappings that should not have
+ changed when a weight was adjusted, while ``straw2`` achieves the original
+ goal of changing mappings only to or from the bucket item whose weight has
+ changed.
+
+ * The ``straw2`` type is the default type for any newly created buckets.
+
+Migration impact:
+
+ * Changing a bucket type from ``straw`` to ``straw2`` will trigger a small
+ amount of data movement, depending on how much the bucket items' weights
+ vary from each other. When the weights are all the same no data will move,
+ and the more variance there is in the weights the more movement there will
+ be.
+
+jewel (CRUSH_TUNABLES5)
+-----------------------
+
+The ``jewel`` tunable profile improves the overall behavior of CRUSH. As a
+result, significantly fewer mappings change when an OSD is marked ``out`` of
+the cluster. This improvement results in significantly less data movement.
+
+The new tunable introduced in the Jewel release is as follows:
+
+ * ``chooseleaf_stable``: Determines whether a recursive chooseleaf attempt
+ will use a better value for an inner loop that greatly reduces the number of
+ mapping changes when an OSD is marked ``out``. The legacy value is ``0``,
+ and the new value of ``1`` uses the new approach.
+
+Migration impact:
+
+ * Changing this value on an existing cluster will result in a very large
+ amount of data movement because nearly every PG mapping is likely to change.
+
+Client versions that support CRUSH_TUNABLES2
+--------------------------------------------
+
+ * v0.55 and later, including Bobtail (v0.56.x)
+ * Linux kernel version v3.9 and later (for the CephFS and RBD kernel clients)
+
+Client versions that support CRUSH_TUNABLES3
+--------------------------------------------
+
+ * v0.78 (Firefly) and later
+ * Linux kernel version v3.15 and later (for the CephFS and RBD kernel clients)
+
+Client versions that support CRUSH_V4
+-------------------------------------
+
+ * v0.94 (Hammer) and later
+ * Linux kernel version v4.1 and later (for the CephFS and RBD kernel clients)
+
+Client versions that support CRUSH_TUNABLES5
+--------------------------------------------
+
+ * v10.0.2 (Jewel) and later
+ * Linux kernel version v4.5 and later (for the CephFS and RBD kernel clients)
+
+"Non-optimal tunables" warning
+------------------------------
+
+In v0.74 and later versions, Ceph will raise a health check ("HEALTH_WARN crush
+map has non-optimal tunables") if any of the current CRUSH tunables have
+non-optimal values: that is, if any fail to have the optimal values from the
+:ref:`default profile
+<rados_operations_crush_map_default_profile_definition>`. There are two
+different ways to silence the alert:
+
+1. Adjust the CRUSH tunables on the existing cluster so as to render them
+ optimal. Making this adjustment will trigger some data movement
+ (possibly as much as 10%). This approach is generally preferred to the
+ other approach, but special care must be taken in situations where
+ data movement might affect performance: for example, in production clusters.
+ To enable optimal tunables, run the following command:
+
+ .. prompt:: bash $
+
+ ceph osd crush tunables optimal
+
+ There are several potential problems that might make it preferable to revert
+ to the previous values of the tunables. The new values might generate too
+ much load for the cluster to handle, the new values might unacceptably slow
+ the operation of the cluster, or there might be a client-compatibility
+ problem. Such client-compatibility problems can arise when using old-kernel
+ CephFS or RBD clients, or pre-Bobtail ``librados`` clients. To revert to
+ the previous values of the tunables, run the following command:
+
+ .. prompt:: bash $
+
+ ceph osd crush tunables legacy
+
+2. To silence the alert without making any changes to CRUSH,
+   add the following option to the ``[mon]`` section of your ``ceph.conf`` file::
+
+ mon_warn_on_legacy_crush_tunables = false
+
+ In order for this change to take effect, you will need to either restart
+ the monitors or run the following command to apply the option to the
+ monitors while they are still running:
+
+ .. prompt:: bash $
+
+ ceph tell mon.\* config set mon_warn_on_legacy_crush_tunables false
+
+
+Tuning CRUSH
+------------
+
+When making adjustments to CRUSH tunables, keep the following considerations in
+mind:
+
+ * Adjusting the values of CRUSH tunables will result in the shift of one or
+ more PGs from one storage node to another. If the Ceph cluster is already
+ storing a great deal of data, be prepared for significant data movement.
+ * When the ``ceph-osd`` and ``ceph-mon`` daemons get the updated map, they
+ immediately begin rejecting new connections from clients that do not support
+ the new feature. However, already-connected clients are effectively
+ grandfathered in, and any of these clients that do not support the new
+ feature will malfunction.
+ * If the CRUSH tunables are set to newer (non-legacy) values and subsequently
+ reverted to the legacy values, ``ceph-osd`` daemons will not be required to
+ support any of the newer CRUSH features associated with the newer
+ (non-legacy) values. However, the OSD peering process requires the
+ examination and understanding of old maps. For this reason, **if the cluster
+ has previously used non-legacy CRUSH values, do not run old versions of
+ the** ``ceph-osd`` **daemon** -- even if the latest version of the map has
+ been reverted so as to use the legacy defaults.
+
+The simplest way to adjust CRUSH tunables is to apply them in matched sets
+known as *profiles*. As of the Octopus release, Ceph supports the following
+profiles:
+
+ * ``legacy``: The legacy behavior from argonaut and earlier.
+ * ``argonaut``: The legacy values supported by the argonaut release.
+ * ``bobtail``: The values supported by the bobtail release.
+ * ``firefly``: The values supported by the firefly release.
+ * ``hammer``: The values supported by the hammer release.
+ * ``jewel``: The values supported by the jewel release.
+ * ``optimal``: The best values for the current version of Ceph.
+
+.. _rados_operations_crush_map_default_profile_definition:
+
+ * ``default``: The default values of a new cluster that has been installed
+ from scratch. These values, which depend on the current version of Ceph, are
+ hardcoded and are typically a mix of optimal and legacy values. These
+ values often correspond to the ``optimal`` profile of either the previous
+ LTS (long-term service) release or the most recent release for which most
+ users are expected to have up-to-date clients.
+
+To apply a profile to a running cluster, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd crush tunables {PROFILE}
+
+This action might trigger a great deal of data movement. Consult release notes
+and documentation before changing the profile on a running cluster. Consider
+throttling recovery and backfill parameters in order to limit the backfill
+resulting from a specific change.
+
+.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf
+
+
+Tuning Primary OSD Selection
+============================
+
+When a Ceph client reads or writes data, it first contacts the primary OSD in
+each affected PG's acting set. By default, the first OSD in the acting set is
+the primary OSD (also known as the "lead OSD"). For example, in the acting set
+``[2, 3, 4]``, ``osd.2`` is listed first and is therefore the primary OSD.
+However, sometimes it is clear that an OSD is not well suited to act as the
+lead as compared with other OSDs (for example, if the OSD has a slow drive or a
+slow controller). To prevent performance bottlenecks (especially on read
+operations) and at the same time maximize the utilization of your hardware, you
+can influence the selection of the primary OSD either by adjusting "primary
+affinity" values, or by crafting a CRUSH rule that selects OSDs that are better
+suited to act as the lead rather than other OSDs.
+
+To determine whether tuning Ceph's selection of primary OSDs will improve
+cluster performance, pool redundancy strategy must be taken into account. For
+replicated pools, this tuning can be especially useful, because by default read
+operations are served from the primary OSD of each PG. For erasure-coded pools,
+however, the speed of read operations can be increased by enabling **fast
+read** (see :ref:`pool-settings`).
+
+.. _rados_ops_primary_affinity:
+
+Primary Affinity
+----------------
+
+**Primary affinity** is a characteristic of an OSD that governs the likelihood
+that a given OSD will be selected as the primary OSD (or "lead OSD") in a given
+acting set. A primary affinity value can be any real number in the range ``0``
+to ``1``, inclusive.
+
+As an example of a common scenario in which it can be useful to adjust primary
+affinity values, let us suppose that a cluster contains a mix of drive sizes:
+for example, suppose it contains some older racks with 1.9 TB SATA SSDs and
+some newer racks with 3.84 TB SATA SSDs. The latter will on average be assigned
+twice the number of PGs and will thus serve twice the number of write and read
+operations -- they will be busier than the former. In such a scenario, you
+might make a rough assignment of primary affinity as inversely proportional to
+OSD size. Such an assignment will not be 100% optimal, but it can readily
+achieve a 15% improvement in overall read throughput by means of a more even
+utilization of SATA interface bandwidth and CPU cycles. This example is not
+merely a thought experiment meant to illustrate the theoretical benefits of
+adjusting primary affinity values; this fifteen percent improvement was
+achieved on an actual Ceph cluster.
+
+By default, every Ceph OSD has a primary affinity value of ``1``. In a cluster
+in which every OSD has this default value, all OSDs are equally likely to act
+as a primary OSD.
+
+By reducing the value of a Ceph OSD's primary affinity, you make CRUSH less
+likely to select the OSD as primary in a PG's acting set. To change the weight
+value associated with a specific OSD's primary affinity, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd primary-affinity <osd-id> <weight>
+
+The primary affinity of an OSD can be set to any real number in the range
+``[0-1]`` inclusive, where ``0`` indicates that the OSD may not be used as
+primary and ``1`` indicates that the OSD is maximally likely to be used as a
+primary. When the weight is between these extremes, its value indicates roughly
+how likely it is that CRUSH will select the OSD associated with it as a
+primary.
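+
+For example, to make a hypothetical ``osd.2`` roughly half as likely as its
+peers to be selected as the primary, you might run:
+
+.. prompt:: bash $
+
+   ceph osd primary-affinity osd.2 0.5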
+
+The process by which CRUSH selects the lead OSD is not a mere function of a
+simple probability determined by relative affinity values. Nevertheless,
+measurable results can be achieved even with first-order approximations of
+desirable primary affinity values.
+
+
+Custom CRUSH Rules
+------------------
+
+Some clusters balance cost and performance by mixing SSDs and HDDs in the same
+replicated pool. By setting the primary affinity of HDD OSDs to ``0``,
+operations will be directed to an SSD OSD in each acting set. Alternatively,
+you can define a CRUSH rule that always selects an SSD OSD as the primary OSD
+and then selects HDDs for the remaining OSDs. Given this rule, each PG's acting
+set will contain an SSD OSD as the primary and have the remaining OSDs on HDDs.
+
+For example, see the following CRUSH rule::
+
+ rule mixed_replicated_rule {
+ id 11
+ type replicated
+ step take default class ssd
+ step chooseleaf firstn 1 type host
+ step emit
+ step take default class hdd
+ step chooseleaf firstn 0 type host
+ step emit
+ }
+
+This rule chooses an SSD as the first OSD. For an ``N``-times replicated pool,
+this rule selects ``N+1`` OSDs in order to guarantee that ``N`` copies are on
+different hosts, because the first SSD OSD might be colocated with any of the
+``N`` HDD OSDs.
+
+To avoid this extra storage requirement, you might place SSDs and HDDs in
+different hosts. However, taking this approach means that all client requests
+will be received by hosts with SSDs. For this reason, it might be advisable to
+have faster CPUs for SSD OSDs and more modest CPUs for HDD OSDs, since the
+latter will under normal circumstances perform only recovery operations. Here
+the CRUSH roots ``ssd_hosts`` and ``hdd_hosts`` are under a strict requirement
+not to contain any of the same servers, as seen in the following CRUSH rule::
+
+ rule mixed_replicated_rule_two {
+ id 1
+ type replicated
+ step take ssd_hosts class ssd
+ step chooseleaf firstn 1 type host
+ step emit
+ step take hdd_hosts class hdd
+ step chooseleaf firstn -1 type host
+ step emit
+ }
+
+.. note:: If a primary SSD OSD fails, then requests to the associated PG will
+ be temporarily served from a slower HDD OSD until the PG's data has been
+ replicated onto the replacement primary SSD OSD.
+
+
diff --git a/doc/rados/operations/data-placement.rst b/doc/rados/operations/data-placement.rst
new file mode 100644
index 000000000..3d3be65ec
--- /dev/null
+++ b/doc/rados/operations/data-placement.rst
@@ -0,0 +1,47 @@
+=========================
+ Data Placement Overview
+=========================
+
+Ceph stores, replicates, and rebalances data objects across a RADOS cluster
+dynamically. Because different users store objects in different pools for
+different purposes on many OSDs, Ceph operations require a certain amount of
+data-placement planning. The main data-placement planning concepts in Ceph
+include:
+
+- **Pools:** Ceph stores data within pools, which are logical groups used for
+ storing objects. Pools manage the number of placement groups, the number of
+ replicas, and the CRUSH rule for the pool. To store data in a pool, it is
+ necessary to be an authenticated user with permissions for the pool. Ceph is
+ able to make snapshots of pools. For additional details, see `Pools`_.
+
+- **Placement Groups:** Ceph maps objects to placement groups. Placement
+ groups (PGs) are shards or fragments of a logical object pool that place
+ objects as a group into OSDs. Placement groups reduce the amount of
+ per-object metadata that is necessary for Ceph to store the data in OSDs. A
+ greater number of placement groups (for example, 100 PGs per OSD as compared
+ with 50 PGs per OSD) leads to better balancing. For additional details, see
+ :ref:`placement groups`.
+
+- **CRUSH Maps:** CRUSH plays a major role in allowing Ceph to scale while
+ avoiding certain pitfalls, such as performance bottlenecks, limitations to
+ scalability, and single points of failure. CRUSH maps provide the physical
+ topology of the cluster to the CRUSH algorithm, so that it can determine both
+ (1) where the data for an object and its replicas should be stored and (2)
+ how to store that data across failure domains so as to improve data safety.
+ For additional details, see `CRUSH Maps`_.
+
+- **Balancer:** The balancer is a feature that automatically optimizes the
+ distribution of placement groups across devices in order to achieve a
+ balanced data distribution, in order to maximize the amount of data that can
+ be stored in the cluster, and in order to evenly distribute the workload
+ across OSDs.
+
+It is possible to use the default values for each of the above components.
+Default values are recommended for a test cluster's initial setup. However,
+when planning a large Ceph cluster, values should be customized for
+data-placement operations with reference to the different roles played by
+pools, placement groups, and CRUSH.
+
+.. _Pools: ../pools
+.. _CRUSH Maps: ../crush-map
+.. _Balancer: ../balancer
diff --git a/doc/rados/operations/devices.rst b/doc/rados/operations/devices.rst
new file mode 100644
index 000000000..f92f622d5
--- /dev/null
+++ b/doc/rados/operations/devices.rst
@@ -0,0 +1,227 @@
+.. _devices:
+
+Device Management
+=================
+
+Device management allows Ceph to address hardware failure. Ceph tracks hardware
+storage devices (HDDs, SSDs) to see which devices are managed by which daemons.
+Ceph also collects health metrics about these devices. By doing so, Ceph can
+provide tools that predict hardware failure and can automatically respond to
+hardware failure.
+
+Device tracking
+---------------
+
+To see a list of the storage devices that are in use, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph device ls
+
+Alternatively, to list devices by daemon or by host, run a command of one of
+the following forms:
+
+.. prompt:: bash $
+
+ ceph device ls-by-daemon <daemon>
+ ceph device ls-by-host <host>
+
+To see information about the location of a specific device and about how the
+device is being consumed, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph device info <devid>
+
+Identifying physical devices
+----------------------------
+
+To make the replacement of failed disks easier and less error-prone, you can
+(in some cases) "blink" the drive's LEDs on hardware enclosures by running a
+command of the following form::
+
+ device light on|off <devid> [ident|fault] [--force]
+
+.. note:: Using this command to blink the lights might not work. Whether it
+ works will depend upon such factors as your kernel revision, your SES
+ firmware, or the setup of your HBA.
+
+The ``<devid>`` parameter is the device identification. To retrieve this
+information, run the following command:
+
+.. prompt:: bash $
+
+ ceph device ls
+
+The ``[ident|fault]`` parameter determines which kind of light will blink. By
+default, the `identification` light is used.
+
+.. note:: This command works only if the Cephadm or the Rook `orchestrator
+ <https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_
+ module is enabled. To see which orchestrator module is enabled, run the
+ following command:
+
+ .. prompt:: bash $
+
+ ceph orch status
+
+The command that makes the drive's LEDs blink is `lsmcli`. To customize this
+command, configure it via a Jinja2 template by running commands of the
+following forms::
+
+ ceph config-key set mgr/cephadm/blink_device_light_cmd "<template>"
+ ceph config-key set mgr/cephadm/<host>/blink_device_light_cmd "lsmcli local-disk-{{ ident_fault }}-led-{{'on' if on else 'off'}} --path '{{ path or dev }}'"
+
+The following arguments can be used to customize the Jinja2 template:
+
+* ``on``
+ A boolean value.
+* ``ident_fault``
+ A string that contains `ident` or `fault`.
+* ``dev``
+ A string that contains the device ID: for example, `SanDisk_X400_M.2_2280_512GB_162924424784`.
+* ``path``
+ A string that contains the device path: for example, `/dev/sda`.
+
+.. _enabling-monitoring:
+
+Enabling monitoring
+-------------------
+
+Ceph can also monitor the health metrics associated with your device. For
+example, SATA drives implement a standard called SMART that provides a wide
+range of internal metrics about the device's usage and health (for example: the
+number of hours powered on, the number of power cycles, the number of
+unrecoverable read errors). Other device types such as SAS and NVMe present a
+similar set of metrics (via slightly different standards). All of these
+metrics can be collected by Ceph via the ``smartctl`` tool.
+
+You can enable or disable health monitoring by running one of the following
+commands:
+
+.. prompt:: bash $
+
+ ceph device monitoring on
+ ceph device monitoring off
+
+Scraping
+--------
+
+If monitoring is enabled, device metrics will be scraped automatically at
+regular intervals. To configure that interval, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>
+
+By default, device metrics are scraped once every 24 hours.
+
+To manually scrape all devices, run the following command:
+
+.. prompt:: bash $
+
+ ceph device scrape-health-metrics
+
+To scrape a single device, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph device scrape-health-metrics <device-id>
+
+To scrape a single daemon's devices, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph device scrape-daemon-health-metrics <who>
+
+To retrieve the stored health metrics for a device (optionally for a specific
+timestamp), run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph device get-health-metrics <devid> [sample-timestamp]
+
+Failure prediction
+------------------
+
+Ceph can predict drive life expectancy and device failures by analyzing the
+health metrics that it collects. The prediction modes are as follows:
+
+* *none*: disable device failure prediction.
+* *local*: use a pre-trained prediction model from the ``ceph-mgr`` daemon.
+
+To configure the prediction mode, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph config set global device_failure_prediction_mode <mode>
+
+Under normal conditions, failure prediction runs periodically in the
+background. For this reason, life expectancy values might be populated only
+after a significant amount of time has passed. The life expectancy of all
+devices is displayed in the output of the following command:
+
+.. prompt:: bash $
+
+ ceph device ls
+
+To see the metadata of a specific device, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph device info <devid>
+
+To explicitly force prediction of a specific device's life expectancy, run a
+command of the following form:
+
+.. prompt:: bash $
+
+ ceph device predict-life-expectancy <devid>
+
+In addition to Ceph's internal device failure prediction, you might have an
+external source of information about device failures. To inform Ceph of a
+specific device's life expectancy, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph device set-life-expectancy <devid> <from> [<to>]
+
+Life expectancies are expressed as a time interval, so that the uncertainty
+of the prediction can be conveyed as a range of time, possibly a wide one.
+The interval's end can be left unspecified.
+
+Health alerts
+-------------
+
+The ``mgr/devicehealth/warn_threshold`` configuration option controls the
+health check for an expected device failure. If the device is expected to fail
+within the specified time interval, an alert is raised.
+
+To check the stored life expectancy of all devices and generate any appropriate
+health alert, run the following command:
+
+.. prompt:: bash $
+
+ ceph device check-health
+
+Automatic Migration
+-------------------
+
+The ``mgr/devicehealth/self_heal`` option (enabled by default) automatically
+migrates data away from devices that are expected to fail soon. If this option
+is enabled, the module marks such devices ``out`` so that automatic migration
+will occur.
+
+.. note:: The ``mon_osd_min_up_ratio`` configuration option can help prevent
+ this process from cascading to total failure. If the "self heal" module
+ marks ``out`` so many OSDs that the ratio value of ``mon_osd_min_up_ratio``
+ is exceeded, then the cluster raises the ``DEVICE_HEALTH_TOOMANY`` health
+ check. For instructions on what to do in this situation, see
+ :ref:`DEVICE_HEALTH_TOOMANY<rados_health_checks_device_health_toomany>`.
+
+The ``mgr/devicehealth/mark_out_threshold`` configuration option specifies the
+time interval for automatic migration. If a device is expected to fail within
+the specified time interval, it will be automatically marked ``out``.
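+
+Both thresholds can be adjusted in the same way as the scrape interval shown
+above. The following sketch assumes that, like ``scrape_frequency``, these
+options are set on the mgr and are expressed in seconds:
+
+.. prompt:: bash $
+
+   ceph config set mgr mgr/devicehealth/warn_threshold <seconds>
+   ceph config set mgr mgr/devicehealth/mark_out_threshold <seconds>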
diff --git a/doc/rados/operations/erasure-code-clay.rst b/doc/rados/operations/erasure-code-clay.rst
new file mode 100644
index 000000000..1cffa32f5
--- /dev/null
+++ b/doc/rados/operations/erasure-code-clay.rst
@@ -0,0 +1,240 @@
+================
+CLAY code plugin
+================
+
+CLAY (short for coupled-layer) codes are erasure codes designed to bring about significant savings
+in terms of network bandwidth and disk IO when a failed node/OSD/rack is being repaired. Let:
+
+ d = number of OSDs contacted during repair
+
+If *jerasure* is configured with *k=8* and *m=4*, losing one OSD requires
+reading from the *d=8* others to repair it. Recovering, say, 1GiB therefore
+requires downloading 8 X 1GiB = 8GiB of information.
+
+However, in the case of the *clay* plugin *d* is configurable within the limits:
+
+ k+1 <= d <= k+m-1
+
+By default, the clay code plugin picks *d=k+m-1*, as this provides the greatest savings in terms
+of network bandwidth and disk IO. In the case of the *clay* plugin configured with
+*k=8*, *m=4* and *d=11*, when a single OSD fails, *d=11* OSDs are contacted and
+250MiB is downloaded from each of them, for a total download of 11 X 250MiB = 2.75GiB.
+More general parameters are provided below. The benefits are substantial
+when the repair is carried out for a rack that stores information on the order of
+terabytes.
+
+ +-------------+---------------------------------------------------------+
+ | plugin | total amount of disk IO |
+ +=============+=========================================================+
+ |jerasure,isa | :math:`k S` |
+ +-------------+---------------------------------------------------------+
+ | clay | :math:`\frac{d S}{d - k + 1} = \frac{(k + m - 1) S}{m}` |
+ +-------------+---------------------------------------------------------+
+
+where *S* is the amount of data stored on a single OSD undergoing repair. In the table above, we have
+used the largest possible value of *d* as this will result in the smallest amount of data download needed
+to achieve recovery from an OSD failure.
+
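+As a worked check of the table: with *k=8*, *m=4*, *d=11* and *S* = 1GiB,
+
+   :math:`\frac{d S}{d - k + 1} = \frac{11 \times 1\,\mathrm{GiB}}{11 - 8 + 1} = 2.75\,\mathrm{GiB}`
+
+which matches the 2.75GiB total download described above, compared with
+:math:`k S` = 8GiB for *jerasure* or *isa*.
+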
+Erasure-code profile examples
+=============================
+
+An example configuration that can be used to observe reduced bandwidth usage:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set CLAYprofile \
+ plugin=clay \
+ k=4 m=2 d=5 \
+ crush-failure-domain=host
+ ceph osd pool create claypool erasure CLAYprofile
+
+
+Creating a clay profile
+=======================
+
+To create a new clay code profile:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set {name} \
+ plugin=clay \
+ k={data-chunks} \
+ m={coding-chunks} \
+ [d={helper-chunks}] \
+ [scalar_mds={plugin-name}] \
+ [technique={technique-name}] \
+ [crush-failure-domain={bucket-type}] \
+ [crush-device-class={device-class}] \
+ [directory={directory}] \
+ [--force]
+
+Where:
+
+``k={data chunks}``
+
+:Description: Each object is split into **data-chunks** parts,
+ each of which is stored on a different OSD.
+
+:Type: Integer
+:Required: Yes.
+:Example: 4
+
+``m={coding-chunks}``
+
+:Description: Compute **coding chunks** for each object and store them
+ on different OSDs. The number of coding chunks is also
+ the number of OSDs that can be down without losing data.
+
+:Type: Integer
+:Required: Yes.
+:Example: 2
+
+``d={helper-chunks}``
+
+:Description: Number of OSDs requested to send data during recovery of
+ a single chunk. *d* needs to be chosen such that
+ k+1 <= d <= k+m-1. The larger the *d*, the better the savings.
+
+:Type: Integer
+:Required: No.
+:Default: k+m-1
+
+``scalar_mds={jerasure|isa|shec}``
+
+:Description: **scalar_mds** specifies the plugin that is used as a
+ building block in the layered construction. It can be
+ one of *jerasure*, *isa*, *shec*
+
+:Type: String
+:Required: No.
+:Default: jerasure
+
+``technique={technique}``
+
+:Description: **technique** specifies the technique that will be picked
+ within the 'scalar_mds' plugin specified. Supported techniques
+ are 'reed_sol_van', 'reed_sol_r6_op', 'cauchy_orig',
+ 'cauchy_good', 'liber8tion' for jerasure, 'reed_sol_van',
+ 'cauchy' for isa and 'single', 'multiple' for shec.
+
+:Type: String
+:Required: No.
+:Default: reed_sol_van (for jerasure, isa), single (for shec)
+
+
+``crush-root={root}``
+
+:Description: The name of the crush bucket used for the first step of
+ the CRUSH rule. For instance **step take default**.
+
+:Type: String
+:Required: No.
+:Default: default
+
+
+``crush-failure-domain={bucket-type}``
+
+:Description: Ensure that no two chunks are in a bucket with the same
+ failure domain. For instance, if the failure domain is
+ **host** no two chunks will be stored on the same
+ host. It is used to create a CRUSH rule step such as **step
+ chooseleaf host**.
+
+:Type: String
+:Required: No.
+:Default: host
+
+``crush-device-class={device-class}``
+
+:Description: Restrict placement to devices of a specific class (e.g.,
+ ``ssd`` or ``hdd``), using the crush device class names
+ in the CRUSH map.
+
+:Type: String
+:Required: No.
+:Default:
+
+``directory={directory}``
+
+:Description: Set the **directory** name from which the erasure code
+ plugin is loaded.
+
+:Type: String
+:Required: No.
+:Default: /usr/lib/ceph/erasure-code
+
+``--force``
+
+:Description: Override an existing profile by the same name.
+
+:Type: String
+:Required: No.
+
+
+Notion of sub-chunks
+====================
+
+Because the CLAY code is a vector code, it can view and manipulate data
+within a chunk at a finer granularity, termed a sub-chunk, and it is this
+property that saves disk IO and network bandwidth. The number of sub-chunks
+within a chunk for a CLAY code is given by:
+
+ sub-chunk count = :math:`q^{\frac{k+m}{q}}`, where :math:`q = d - k + 1`
+
+
+During repair of an OSD, the helper information requested
+from an available OSD is only a fraction of a chunk. In fact, the number
+of sub-chunks within a chunk that are accessed during repair is given by:
+
+   repair sub-chunk count = :math:`\frac{\text{sub-chunk count}}{q}`
+
+Examples
+--------
+
+#. For a configuration with *k=4*, *m=2*, *d=5*, the sub-chunk count is
+ 8 and the repair sub-chunk count is 4. Therefore, only half of a chunk is read
+ during repair.
+#. When *k=8*, *m=4*, *d=11* the sub-chunk count is 64 and repair sub-chunk count
+ is 16. A quarter of a chunk is read from an available OSD for repair of a failed
+ chunk.
+
+
+
+How to choose a configuration given a workload
+==============================================
+
+During repair, only a few of the sub-chunks within a chunk are read. These sub-chunks
+are not necessarily stored consecutively within a chunk. For best disk IO
+performance, it is helpful to read contiguous data. For this reason, it is suggested that
+you choose the stripe size such that the sub-chunk size is sufficiently large.
+
+For a given stripe-size (that's fixed based on a workload), choose ``k``, ``m``, ``d`` such that:
+
+   sub-chunk size = :math:`\frac{\text{stripe size}}{k \times \text{sub-chunk count}}` = 4KB, 8KB, 12KB ...
+
+#. For large workloads, for which the stripe size is large, it is easy to choose k, m, d.
+   For example, consider a stripe size of 64MB: choosing *k=16*, *m=4* and *d=19* will
+   result in a sub-chunk count of 1024 and a sub-chunk size of 4KB (the arithmetic is
+   verified after this list).
+#. For small size workloads, *k=4*, *m=2* is a good configuration that provides both network
+ and disk IO benefits.
+
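+To verify the first example: with *k=16* and *d=19*, :math:`q = d - k + 1 = 4`,
+so the sub-chunk count is :math:`q^{\frac{k+m}{q}} = 4^{5} = 1024` and the
+sub-chunk size is :math:`\frac{64\,\mathrm{MB}}{16 \times 1024}` = 4KB.
+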
+Comparisons with LRC
+====================
+
+Locally Recoverable Codes (LRC) are also designed to save network bandwidth and
+disk IO during single-OSD recovery. However, the focus of LRC is to keep the
+number of OSDs contacted during repair (d) minimal, and this comes at the cost of storage overhead.
+The *clay* code has a storage overhead of m/k. An *lrc* stores (k+m)/d parities in
+addition to the ``m`` parities, resulting in a storage overhead of (m+(k+m)/d)/k. Both *clay* and *lrc*
+can recover from the failure of any ``m`` OSDs.
+
+ +-----------------+----------------------------------+----------------------------------+
+ | Parameters      | disk IO, storage overhead (LRC)  | disk IO, storage overhead (CLAY) |
+ +=================+==================================+==================================+
+ | (k=10, m=4)     | 7 * S, 0.6 (d=7)                 | 3.25 * S, 0.4 (d=13)             |
+ +-----------------+----------------------------------+----------------------------------+
+ | (k=16, m=4)     | 4 * S, 0.5625 (d=4)              | 4.75 * S, 0.25 (d=19)            |
+ +-----------------+----------------------------------+----------------------------------+
+
+
+where ``S`` is the amount of data stored on the single OSD being recovered.
diff --git a/doc/rados/operations/erasure-code-isa.rst b/doc/rados/operations/erasure-code-isa.rst
new file mode 100644
index 000000000..9a43f89a2
--- /dev/null
+++ b/doc/rados/operations/erasure-code-isa.rst
@@ -0,0 +1,107 @@
+=======================
+ISA erasure code plugin
+=======================
+
+The *isa* plugin encapsulates the `ISA
+<https://01.org/intel%C2%AE-storage-acceleration-library-open-source-version/>`_
+library.
+
+Create an isa profile
+=====================
+
+To create a new *isa* erasure code profile:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set {name} \
+ plugin=isa \
+ technique={reed_sol_van|cauchy} \
+ [k={data-chunks}] \
+ [m={coding-chunks}] \
+ [crush-root={root}] \
+ [crush-failure-domain={bucket-type}] \
+ [crush-device-class={device-class}] \
+ [directory={directory}] \
+ [--force]
+
+Where:
+
+``k={data chunks}``
+
+:Description: Each object is split into **data-chunks** parts,
+              each stored on a different OSD.
+
+:Type: Integer
+:Required: No.
+:Default: 7
+
+``m={coding-chunks}``
+
+:Description: Compute **coding chunks** for each object and store them
+ on different OSDs. The number of coding chunks is also
+ the number of OSDs that can be down without losing data.
+
+:Type: Integer
+:Required: No.
+:Default: 3
+
+``technique={reed_sol_van|cauchy}``
+
+:Description: The ISA plugin comes in two `Reed Solomon
+ <https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction>`_
+ forms. If *reed_sol_van* is set, it is `Vandermonde
+ <https://en.wikipedia.org/wiki/Vandermonde_matrix>`_, if
+ *cauchy* is set, it is `Cauchy
+ <https://en.wikipedia.org/wiki/Cauchy_matrix>`_.
+
+:Type: String
+:Required: No.
+:Default: reed_sol_van
+
+``crush-root={root}``
+
+:Description: The name of the crush bucket used for the first step of
+ the CRUSH rule. For instance **step take default**.
+
+:Type: String
+:Required: No.
+:Default: default
+
+``crush-failure-domain={bucket-type}``
+
+:Description: Ensure that no two chunks are in a bucket with the same
+ failure domain. For instance, if the failure domain is
+ **host** no two chunks will be stored on the same
+ host. It is used to create a CRUSH rule step such as **step
+ chooseleaf host**.
+
+:Type: String
+:Required: No.
+:Default: host
+
+``crush-device-class={device-class}``
+
+:Description: Restrict placement to devices of a specific class (e.g.,
+ ``ssd`` or ``hdd``), using the crush device class names
+ in the CRUSH map.
+
+:Type: String
+:Required: No.
+:Default:
+
+``directory={directory}``
+
+:Description: Set the **directory** name from which the erasure code
+ plugin is loaded.
+
+:Type: String
+:Required: No.
+:Default: /usr/lib/ceph/erasure-code
+
+``--force``
+
+:Description: Override an existing profile by the same name.
+
+:Type: String
+:Required: No.
+
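+Erasure code profile example
+============================
+
+A minimal sketch of creating an *isa* profile and a pool that uses it (the
+profile and pool names here are illustrative):
+
+.. prompt:: bash $
+
+   ceph osd erasure-code-profile set ISAprofile \
+      plugin=isa \
+      k=4 m=2 \
+      crush-failure-domain=host
+   ceph osd pool create isapool erasure ISAprofile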
diff --git a/doc/rados/operations/erasure-code-jerasure.rst b/doc/rados/operations/erasure-code-jerasure.rst
new file mode 100644
index 000000000..8a0207748
--- /dev/null
+++ b/doc/rados/operations/erasure-code-jerasure.rst
@@ -0,0 +1,123 @@
+============================
+Jerasure erasure code plugin
+============================
+
+The *jerasure* plugin is the most generic and flexible plugin; it is
+also the default for Ceph erasure-coded pools.
+
+The *jerasure* plugin encapsulates the `Jerasure
+<https://github.com/ceph/jerasure>`_ library. It is
+recommended to read the ``jerasure`` documentation to
+understand the parameters. Note that the ``jerasure.org``
+web site as of 2023 may no longer be connected to the original
+project or legitimate.
+
+Create a jerasure profile
+=========================
+
+To create a new *jerasure* erasure code profile:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set {name} \
+ plugin=jerasure \
+ k={data-chunks} \
+ m={coding-chunks} \
+ technique={reed_sol_van|reed_sol_r6_op|cauchy_orig|cauchy_good|liberation|blaum_roth|liber8tion} \
+ [crush-root={root}] \
+ [crush-failure-domain={bucket-type}] \
+ [crush-device-class={device-class}] \
+ [directory={directory}] \
+ [--force]
+
+Where:
+
+``k={data chunks}``
+
+:Description: Each object is split into **data-chunks** parts,
+              each stored on a different OSD.
+
+:Type: Integer
+:Required: Yes.
+:Example: 4
+
+``m={coding-chunks}``
+
+:Description: Compute **coding chunks** for each object and store them
+ on different OSDs. The number of coding chunks is also
+ the number of OSDs that can be down without losing data.
+
+:Type: Integer
+:Required: Yes.
+:Example: 2
+
+``technique={reed_sol_van|reed_sol_r6_op|cauchy_orig|cauchy_good|liberation|blaum_roth|liber8tion}``
+
+:Description: The most flexible technique is *reed_sol_van*: it is
+              enough to set *k* and *m*. The *cauchy_good* technique
+              can be faster, but you need to choose the *packetsize*
+              carefully. All of *reed_sol_r6_op*, *liberation*,
+              *blaum_roth*, and *liber8tion* are *RAID6* equivalents in
+              the sense that they can only be configured with *m=2*.
+
+:Type: String
+:Required: No.
+:Default: reed_sol_van
+
+``packetsize={bytes}``
+
+:Description: The encoding will be done on packets of *bytes* size at
+ a time. Choosing the right packet size is difficult. The
+ *jerasure* documentation contains extensive information
+ on this topic.
+
+:Type: Integer
+:Required: No.
+:Default: 2048
+
+``crush-root={root}``
+
+:Description: The name of the crush bucket used for the first step of
+ the CRUSH rule. For instance **step take default**.
+
+:Type: String
+:Required: No.
+:Default: default
+
+``crush-failure-domain={bucket-type}``
+
+:Description: Ensure that no two chunks are in a bucket with the same
+ failure domain. For instance, if the failure domain is
+ **host** no two chunks will be stored on the same
+ host. It is used to create a CRUSH rule step such as **step
+ chooseleaf host**.
+
+:Type: String
+:Required: No.
+:Default: host
+
+``crush-device-class={device-class}``
+
+:Description: Restrict placement to devices of a specific class (e.g.,
+ ``ssd`` or ``hdd``), using the crush device class names
+ in the CRUSH map.
+
+:Type: String
+:Required: No.
+
+``directory={directory}``
+
+:Description: Set the **directory** name from which the erasure code
+ plugin is loaded.
+
+:Type: String
+:Required: No.
+:Default: /usr/lib/ceph/erasure-code
+
+``--force``
+
+:Description: Override an existing profile by the same name.
+
+:Type: String
+:Required: No.
+
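+Erasure code profile example
+============================
+
+A minimal sketch of creating a *jerasure* profile and an associated pool
+(the profile and pool names here are illustrative):
+
+.. prompt:: bash $
+
+   ceph osd erasure-code-profile set jprofile \
+      plugin=jerasure \
+      k=4 m=2 \
+      technique=reed_sol_van \
+      crush-failure-domain=host
+   ceph osd pool create jpool erasure jprofile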
diff --git a/doc/rados/operations/erasure-code-lrc.rst b/doc/rados/operations/erasure-code-lrc.rst
new file mode 100644
index 000000000..5329603b9
--- /dev/null
+++ b/doc/rados/operations/erasure-code-lrc.rst
@@ -0,0 +1,388 @@
+======================================
+Locally repairable erasure code plugin
+======================================
+
+With the *jerasure* plugin, when an erasure coded object is stored on
+multiple OSDs, recovering from the loss of one OSD requires reading
+from *k* others. For instance if *jerasure* is configured with
+*k=8* and *m=4*, recovering from the loss of one OSD requires reading
+from eight others.
+
+The *lrc* erasure code plugin creates local parity chunks to enable
+recovery using fewer surviving OSDs. For instance if *lrc* is configured with
+*k=8*, *m=4* and *l=4*, it will create an additional parity chunk for
+every four OSDs. When a single OSD is lost, it can be recovered with
+only four OSDs instead of eight.
+
+Erasure code profile examples
+=============================
+
+Reduce recovery bandwidth between hosts
+---------------------------------------
+
+Although it is probably not an interesting use case when all hosts are
+connected to the same switch, reduced bandwidth usage can actually be
+observed:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ k=4 m=2 l=3 \
+ crush-failure-domain=host
+ ceph osd pool create lrcpool erasure LRCprofile
+
+
+Reduce recovery bandwidth between racks
+---------------------------------------
+
+In Firefly the bandwidth reduction will only be observed if the primary
+OSD is in the same rack as the lost chunk:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ k=4 m=2 l=3 \
+ crush-locality=rack \
+ crush-failure-domain=host
+ ceph osd pool create lrcpool erasure LRCprofile
+
+
+Create an lrc profile
+=====================
+
+To create a new lrc erasure code profile:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set {name} \
+ plugin=lrc \
+ k={data-chunks} \
+ m={coding-chunks} \
+ l={locality} \
+ [crush-root={root}] \
+ [crush-locality={bucket-type}] \
+ [crush-failure-domain={bucket-type}] \
+ [crush-device-class={device-class}] \
+ [directory={directory}] \
+ [--force]
+
+Where:
+
+``k={data chunks}``
+
+:Description: Each object is split into **data-chunks** parts,
+              each stored on a different OSD.
+
+:Type: Integer
+:Required: Yes.
+:Example: 4
+
+``m={coding-chunks}``
+
+:Description: Compute **coding chunks** for each object and store them
+ on different OSDs. The number of coding chunks is also
+ the number of OSDs that can be down without losing data.
+
+:Type: Integer
+:Required: Yes.
+:Example: 2
+
+``l={locality}``
+
+:Description: Group the coding and data chunks into sets of size
+ **locality**. For instance, for **k=4** and **m=2**,
+ when **locality=3** two groups of three are created.
+ Each set can be recovered without reading chunks
+ from another set.
+
+:Type: Integer
+:Required: Yes.
+:Example: 3
+
+``crush-root={root}``
+
+:Description: The name of the crush bucket used for the first step of
+ the CRUSH rule. For instance **step take default**.
+
+:Type: String
+:Required: No.
+:Default: default
+
+``crush-locality={bucket-type}``
+
+:Description: The type of the CRUSH bucket in which each set of chunks
+ defined by **l** will be stored. For instance, if it is
+ set to **rack**, each group of **l** chunks will be
+ placed in a different rack. It is used to create a
+ CRUSH rule step such as **step choose rack**. If it is not
+ set, no such grouping is done.
+
+:Type: String
+:Required: No.
+
+``crush-failure-domain={bucket-type}``
+
+:Description: Ensure that no two chunks are in a bucket with the same
+ failure domain. For instance, if the failure domain is
+ **host** no two chunks will be stored on the same
+ host. It is used to create a CRUSH rule step such as **step
+ chooseleaf host**.
+
+:Type: String
+:Required: No.
+:Default: host
+
+``crush-device-class={device-class}``
+
+:Description: Restrict placement to devices of a specific class (e.g.,
+ ``ssd`` or ``hdd``), using the crush device class names
+ in the CRUSH map.
+
+:Type: String
+:Required: No.
+:Default:
+
+``directory={directory}``
+
+:Description: Set the **directory** name from which the erasure code
+ plugin is loaded.
+
+:Type: String
+:Required: No.
+:Default: /usr/lib/ceph/erasure-code
+
+``--force``
+
+:Description: Override an existing profile by the same name.
+
+:Type: String
+:Required: No.
+
+Low level plugin configuration
+==============================
+
+The sum of **k** and **m** must be a multiple of the **l** parameter.
+The low-level configuration parameters, however, do not enforce this
+restriction, and it may be advantageous to use them for specific
+purposes. It is, for instance, possible to define two groups, one with 4
+chunks and another with 3 chunks. It is also possible to define locality
+sets recursively, for instance racks within datacenters. The **k/m/l**
+parameters are implemented by generating a low-level configuration.
+
+The *lrc* erasure code plugin recursively applies erasure code
+techniques so that recovering from the loss of some chunks only
+requires a subset of the available chunks, most of the time.
+
+For instance, when three coding steps are described as::
+
+ chunk nr 01234567
+ step 1 _cDD_cDD
+ step 2 cDDD____
+ step 3 ____cDDD
+
+where *c* are coding chunks calculated from the data chunks *D*, the
+loss of chunk *7* can be recovered with the last four chunks, and the
+loss of chunk *2* can be recovered with the first four
+chunks.
+
+Erasure code profile examples using low level configuration
+===========================================================
+
+Minimal testing
+---------------
+
+The following configuration is strictly equivalent to using a *K=2* *M=1*
+erasure code profile. The *DD* implies *K=2*, the *c* implies *M=1*, and the
+*jerasure* plugin is used by default:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ mapping=DD_ \
+ layers='[ [ "DDc", "" ] ]'
+ ceph osd pool create lrcpool erasure LRCprofile
+
+Reduce recovery bandwidth between hosts
+---------------------------------------
+
+Although it is probably not an interesting use case when all hosts are
+connected to the same switch, reduced bandwidth usage can actually be
+observed. It is equivalent to **k=4**, **m=2** and **l=3**, although
+the layout of the chunks is different.
+
+::
+
+ $ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ mapping=__DD__DD \
+ layers='[
+ [ "_cDD_cDD", "" ],
+ [ "cDDD____", "" ],
+ [ "____cDDD", "" ],
+ ]'
+ $ ceph osd pool create lrcpool erasure LRCprofile
+
+
+Reduce recovery bandwidth between racks
+---------------------------------------
+
+In Firefly the reduced bandwidth will only be observed if the primary OSD is in
+the same rack as the lost chunk.
+
+::
+
+ $ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ mapping=__DD__DD \
+ layers='[
+ [ "_cDD_cDD", "" ],
+ [ "cDDD____", "" ],
+ [ "____cDDD", "" ],
+ ]' \
+ crush-steps='[
+ [ "choose", "rack", 2 ],
+ [ "chooseleaf", "host", 4 ],
+ ]'
+
+ $ ceph osd pool create lrcpool erasure LRCprofile
+
+Testing with different Erasure Code backends
+--------------------------------------------
+
+LRC now uses jerasure as the default EC backend. It is possible to
+specify the EC backend/algorithm on a per-layer basis using the
+low-level configuration. The second argument in layers='[ [ "DDc", "" ] ]'
+is actually an erasure code profile to be used for this level. The
+example below specifies the ISA backend with the cauchy technique to
+be used in the lrcpool:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ mapping=DD_ \
+ layers='[ [ "DDc", "plugin=isa technique=cauchy" ] ]'
+ ceph osd pool create lrcpool erasure LRCprofile
+
+You could also use a different erasure code profile for each
+layer.
+
+::
+
+ $ ceph osd erasure-code-profile set LRCprofile \
+ plugin=lrc \
+ mapping=__DD__DD \
+ layers='[
+ [ "_cDD_cDD", "plugin=isa technique=cauchy" ],
+ [ "cDDD____", "plugin=isa" ],
+ [ "____cDDD", "plugin=jerasure" ],
+ ]'
+ $ ceph osd pool create lrcpool erasure LRCprofile
+
+
+
+Erasure coding and decoding algorithm
+=====================================
+
+The steps found in the layers description::
+
+ chunk nr 01234567
+
+ step 1 _cDD_cDD
+ step 2 cDDD____
+ step 3 ____cDDD
+
+are applied in order. For instance, if a 4K object is encoded, it will
+first go through *step 1* and be divided into four 1K chunks (the four
+uppercase D). They are stored in chunks 2, 3, 6 and 7, in
+order. From these, two coding chunks are calculated (the two lowercase
+c). The coding chunks are stored in chunks 1 and 5, respectively.
+
+The *step 2* re-uses the content created by *step 1* in a similar
+fashion and stores a single coding chunk *c* at position 0. The last four
+chunks, marked with an underscore (*_*) for readability, are ignored.
+
+The *step 3* stores a single coding chunk *c* at position 4. The three
+chunks created by *step 1* are used to compute this coding chunk,
+i.e. the coding chunk from *step 1* becomes a data chunk in *step 3*.
+
+If chunk *2* is lost::
+
+ chunk nr 01234567
+
+ step 1 _c D_cDD
+ step 2 cD D____
+ step 3 __ _cDDD
+
+decoding will attempt to recover it by walking the steps in reverse
+order: *step 3* then *step 2* and finally *step 1*.
+
+The *step 3* knows nothing about chunk *2* (i.e. it is an underscore)
+and is skipped.
+
+The coding chunk from *step 2*, stored in chunk *0*, allows it to
+recover the content of chunk *2*. There are no more chunks to recover
+and the process stops, without considering *step 1*.
+
+Recovering chunk *2* requires reading chunks *0, 1, 3* and writing
+back chunk *2*.
+
+If chunk *2, 3, 6* are lost::
+
+ chunk nr 01234567
+
+ step 1 _c _c D
+ step 2 cD __ _
+ step 3 __ cD D
+
+The *step 3* can recover the content of chunk *6*::
+
+ chunk nr 01234567
+
+ step 1 _c _cDD
+ step 2 cD ____
+ step 3 __ cDDD
+
+The *step 2* fails to recover and is skipped because there are two
+chunks missing (*2, 3*) and it can only recover from one missing
+chunk.
+
+The coding chunks from *step 1*, stored in chunks *1* and *5*, allow
+recovery of the content of chunks *2* and *3*::
+
+ chunk nr 01234567
+
+ step 1 _cDD_cDD
+ step 2 cDDD____
+ step 3 ____cDDD
+
+Controlling CRUSH placement
+===========================
+
+The default CRUSH rule provides OSDs that are on different hosts. For instance::
+
+ chunk nr 01234567
+
+ step 1 _cDD_cDD
+ step 2 cDDD____
+ step 3 ____cDDD
+
+needs exactly *8* OSDs, one for each chunk. If the hosts are in two
+adjacent racks, the first four chunks can be placed in the first rack
+and the last four in the second rack, so that recovering from the loss
+of a single OSD does not require using bandwidth between the two
+racks.
+
+For instance::
+
+ crush-steps='[ [ "choose", "rack", 2 ], [ "chooseleaf", "host", 4 ] ]'
+
+will create a rule that selects two CRUSH buckets of type
+*rack* and, for each of them, chooses four OSDs, each located in a
+different bucket of type *host*.
+
+The CRUSH rule can also be manually crafted for finer control.
diff --git a/doc/rados/operations/erasure-code-profile.rst b/doc/rados/operations/erasure-code-profile.rst
new file mode 100644
index 000000000..947b34c1f
--- /dev/null
+++ b/doc/rados/operations/erasure-code-profile.rst
@@ -0,0 +1,128 @@
+.. _erasure-code-profiles:
+
+=====================
+Erasure code profiles
+=====================
+
+Erasure code is defined by a **profile** and is used when creating an
+erasure coded pool and the associated CRUSH rule.
+
+The **default** erasure code profile (which is created when the Ceph
+cluster is initialized) splits the data into 2 equal-sized chunks and
+adds 2 parity chunks of the same size. It takes as much space
+in the cluster as a 2-replica pool but can sustain the loss of 2
+chunks out of 4. It is described as a profile with **k=2** and **m=2**,
+meaning that the information is spread over four OSDs (k+m == 4) and
+that two of them can be lost.
+
+To improve redundancy without increasing raw storage requirements, a
+new profile can be created. For instance, a profile with **k=10** and
+**m=4** can sustain the loss of four (**m=4**) OSDs by distributing an
+object across fourteen (k+m=14) OSDs. The object is first divided into
+**10** chunks (if the object is 10MB, each chunk is 1MB) and **4**
+coding chunks are computed, for recovery (each coding chunk has the
+same size as the data chunk, i.e. 1MB). The raw space overhead is only
+40% and the object will not be lost even if four OSDs break at the
+same time.
+
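+As a sketch, such a profile might be created as follows (the profile name is
+illustrative; the plugin defaults to *jerasure*)::
+
+    ceph osd erasure-code-profile set myk10m4 \
+        k=10 m=4 \
+        crush-failure-domain=host
+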
+.. _list of available plugins:
+
+.. toctree::
+ :maxdepth: 1
+
+ erasure-code-jerasure
+ erasure-code-isa
+ erasure-code-lrc
+ erasure-code-shec
+ erasure-code-clay
+
+osd erasure-code-profile set
+============================
+
+To create a new erasure code profile::
+
+ ceph osd erasure-code-profile set {name} \
+ [{directory=directory}] \
+ [{plugin=plugin}] \
+ [{stripe_unit=stripe_unit}] \
+ [{key=value} ...] \
+ [--force]
+
+Where:
+
+``{directory=directory}``
+
+:Description: Set the **directory** name from which the erasure code
+ plugin is loaded.
+
+:Type: String
+:Required: No.
+:Default: /usr/lib/ceph/erasure-code
+
+``{plugin=plugin}``
+
+:Description: Use the erasure code **plugin** to compute coding chunks
+ and recover missing chunks. See the `list of available
+ plugins`_ for more information.
+
+:Type: String
+:Required: No.
+:Default: jerasure
+
+``{stripe_unit=stripe_unit}``
+
+:Description: The amount of data in a data chunk, per stripe. For
+ example, a profile with 2 data chunks and stripe_unit=4K
+ would put the range 0-4K in chunk 0, 4K-8K in chunk 1,
+ then 8K-12K in chunk 0 again. This should be a multiple
+ of 4K for best performance. The default value is taken
+ from the monitor config option
+ ``osd_pool_erasure_code_stripe_unit`` when a pool is
+ created. The stripe_width of a pool using this profile
+ will be the number of data chunks multiplied by this
+ stripe_unit.
+
+:Type: String
+:Required: No.
+
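+For example, a sketch of a profile that sets an explicit 4K stripe unit (the
+profile name is illustrative)::
+
+    ceph osd erasure-code-profile set myprofile \
+        k=2 m=2 stripe_unit=4K
+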
+``{key=value}``
+
+:Description: The semantics of the remaining key/value pairs are defined
+              by the erasure code plugin.
+
+:Type: String
+:Required: No.
+
+``--force``
+
+:Description: Override an existing profile by the same name, and allow
+ setting a non-4K-aligned stripe_unit.
+
+:Type: String
+:Required: No.
+
+osd erasure-code-profile rm
+============================
+
+To remove an erasure code profile::
+
+ ceph osd erasure-code-profile rm {name}
+
+If the profile is referenced by a pool, the deletion will fail.
+
+.. warning:: Removing an erasure code profile using ``osd erasure-code-profile rm`` does not automatically delete the CRUSH rule associated with the profile. It is recommended to remove the associated CRUSH rule manually using ``ceph osd crush rule remove {rule-name}`` to avoid unexpected behavior.
+
+osd erasure-code-profile get
+============================
+
+To display an erasure code profile::
+
+ ceph osd erasure-code-profile get {name}
+
+osd erasure-code-profile ls
+===========================
+
+To list the names of all erasure code profiles::
+
+ ceph osd erasure-code-profile ls
+
diff --git a/doc/rados/operations/erasure-code-shec.rst b/doc/rados/operations/erasure-code-shec.rst
new file mode 100644
index 000000000..4e8f59b0b
--- /dev/null
+++ b/doc/rados/operations/erasure-code-shec.rst
@@ -0,0 +1,145 @@
+========================
+SHEC erasure code plugin
+========================
+
+The *shec* plugin encapsulates the `multiple SHEC
+<http://tracker.ceph.com/projects/ceph/wiki/Shingled_Erasure_Code_(SHEC)>`_
+library. It allows Ceph to recover data more efficiently than Reed Solomon codes.
+
+Create an SHEC profile
+======================
+
+To create a new *shec* erasure code profile:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set {name} \
+ plugin=shec \
+ [k={data-chunks}] \
+ [m={coding-chunks}] \
+ [c={durability-estimator}] \
+ [crush-root={root}] \
+ [crush-failure-domain={bucket-type}] \
+ [crush-device-class={device-class}] \
+ [directory={directory}] \
+ [--force]
+
+Where:
+
+``k={data-chunks}``
+
+:Description: Each object is split into **data-chunks** parts,
+              each stored on a different OSD.
+
+:Type: Integer
+:Required: No.
+:Default: 4
+
+``m={coding-chunks}``
+
+:Description: Compute **coding-chunks** for each object and store them on
+ different OSDs. The number of **coding-chunks** does not necessarily
+ equal the number of OSDs that can be down without losing data.
+
+:Type: Integer
+:Required: No.
+:Default: 3
+
+``c={durability-estimator}``
+
+:Description: The number of parity chunks each of which includes each data chunk in its
+ calculation range. The number is used as a **durability estimator**.
+ For instance, if c=2, 2 OSDs can be down without losing data.
+
+:Type: Integer
+:Required: No.
+:Default: 2
+
+``crush-root={root}``
+
+:Description: The name of the crush bucket used for the first step of
+ the CRUSH rule. For instance **step take default**.
+
+:Type: String
+:Required: No.
+:Default: default
+
+``crush-failure-domain={bucket-type}``
+
+:Description: Ensure that no two chunks are in a bucket with the same
+ failure domain. For instance, if the failure domain is
+ **host** no two chunks will be stored on the same
+ host. It is used to create a CRUSH rule step such as **step
+ chooseleaf host**.
+
+:Type: String
+:Required: No.
+:Default: host
+
+``crush-device-class={device-class}``
+
+:Description: Restrict placement to devices of a specific class (e.g.,
+ ``ssd`` or ``hdd``), using the crush device class names
+ in the CRUSH map.
+
+:Type: String
+:Required: No.
+:Default:
+
+``directory={directory}``
+
+:Description: Set the **directory** name from which the erasure code
+ plugin is loaded.
+
+:Type: String
+:Required: No.
+:Default: /usr/lib/ceph/erasure-code
+
+``--force``
+
+:Description: Override an existing profile by the same name.
+
+:Type: String
+:Required: No.
+
+Brief description of SHEC's layouts
+===================================
+
+Space Efficiency
+----------------
+
+Space efficiency is the ratio of data chunks to all chunks in an object,
+represented as k/(k+m).
+To improve space efficiency, increase k or decrease m:
+
+ space efficiency of SHEC(4,3,2) = :math:`\frac{4}{4+3}` = 0.57
+ SHEC(5,3,2) or SHEC(4,2,2) improves SHEC(4,3,2)'s space efficiency
+
+Durability
+----------
+
+The third parameter of SHEC (=c) is a durability estimator, which approximates
+the number of OSDs that can be down without losing data.
+
+``durability estimator of SHEC(4,3,2) = 2``
+
+Recovery Efficiency
+-------------------
+
+A full description of how recovery efficiency is calculated is beyond the scope
+of this document, but, at a minimum, increasing m without increasing c improves
+recovery efficiency (at the cost of space efficiency).
+
+``SHEC(4,2,2) -> SHEC(4,3,2) : achieves improvement of recovery efficiency``
+
+Erasure code profile examples
+=============================
+
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set SHECprofile \
+ plugin=shec \
+ k=8 m=4 c=3 \
+ crush-failure-domain=host
+ ceph osd pool create shecpool erasure SHECprofile
diff --git a/doc/rados/operations/erasure-code.rst b/doc/rados/operations/erasure-code.rst
new file mode 100644
index 000000000..e2bd3c296
--- /dev/null
+++ b/doc/rados/operations/erasure-code.rst
@@ -0,0 +1,272 @@
+.. _ecpool:
+
+==============
+ Erasure code
+==============
+
+By default, Ceph `pools <../pools>`_ are created with the type "replicated". In
+replicated-type pools, every object is copied to multiple disks. This
+multiple copying is the method of data protection known as "replication".
+
+By contrast, `erasure-coded <https://en.wikipedia.org/wiki/Erasure_code>`_
+pools use a method of data protection that is different from replication. In
+erasure coding, data is broken into fragments of two kinds: data blocks and
+parity blocks. If a drive fails or becomes corrupted, the parity blocks are
+used to rebuild the data. At scale, erasure coding saves space relative to
+replication.
+
+In this documentation, data blocks are referred to as "data chunks"
+and parity blocks are referred to as "coding chunks".
+
+Erasure codes are also called "forward error correction codes". The
+first forward error correction code was developed in 1950 by Richard
+Hamming at Bell Laboratories.
+
+
+Creating a sample erasure-coded pool
+------------------------------------
+
+The simplest erasure-coded pool is similar to `RAID5
+<https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5>`_ and
+requires at least three hosts:
+
+.. prompt:: bash $
+
+ ceph osd pool create ecpool erasure
+
+::
+
+ pool 'ecpool' created
+
+.. prompt:: bash $
+
+ echo ABCDEFGHI | rados --pool ecpool put NYAN -
+ rados --pool ecpool get NYAN -
+
+::
+
+ ABCDEFGHI
+
+Erasure-code profiles
+---------------------
+
+The default erasure-code profile can sustain the overlapping loss of two OSDs
+without losing data. This erasure-code profile is equivalent to a replicated
+pool of size three, but with different storage requirements: instead of
+requiring 3TB to store 1TB, it requires only 2TB to store 1TB. The default
+profile can be displayed with this command:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile get default
+
+::
+
+ k=2
+ m=2
+ plugin=jerasure
+ crush-failure-domain=host
+ technique=reed_sol_van
+
+.. note::
+ The profile just displayed is for the *default* erasure-coded pool, not the
+ *simplest* erasure-coded pool. These two pools are not the same:
+
+ The default erasure-coded pool has two data chunks (K) and two coding chunks
+ (M). The profile of the default erasure-coded pool is "k=2 m=2".
+
+ The simplest erasure-coded pool has two data chunks (K) and one coding chunk
+ (M). The profile of the simplest erasure-coded pool is "k=2 m=1".
+
+Choosing the right profile is important because the profile cannot be modified
+after the pool is created. If you find that you need an erasure-coded pool with
+a profile different than the one you have created, you must create a new pool
+with a different (and presumably more carefully considered) profile. When the
+new pool is created, all objects from the wrongly configured pool must be moved
+to the newly created pool. There is no way to alter the profile of a pool after
+the pool has been created.
+
+The most important parameters of the profile are *K*, *M*, and
+*crush-failure-domain* because they define the storage overhead and
+the data durability. For example, if the desired architecture must
+sustain the loss of two racks with a storage overhead of 67%,
+the following profile can be defined:
+
+.. prompt:: bash $
+
+ ceph osd erasure-code-profile set myprofile \
+ k=3 \
+ m=2 \
+ crush-failure-domain=rack
+ ceph osd pool create ecpool erasure myprofile
+ echo ABCDEFGHI | rados --pool ecpool put NYAN -
+ rados --pool ecpool get NYAN -
+
+::
+
+ ABCDEFGHI
+
+The *NYAN* object will be divided into three (*K=3*) and two additional
+*chunks* will be created (*M=2*). The value of *M* defines how many
+OSDs can be lost simultaneously without losing any data. The
+*crush-failure-domain=rack* will create a CRUSH rule that ensures
+no two *chunks* are stored in the same rack.
+
+.. ditaa::
+ +-------------------+
+ name | NYAN |
+ +-------------------+
+ content | ABCDEFGHI |
+ +--------+----------+
+ |
+ |
+ v
+ +------+------+
+ +---------------+ encode(3,2) +-----------+
+ | +--+--+---+---+ |
+ | | | | |
+ | +-------+ | +-----+ |
+ | | | | |
+ +--v---+ +--v---+ +--v---+ +--v---+ +--v---+
+ name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+ +------+ +------+ +------+ +------+ +------+
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+ +------+ +------+ +------+ +------+ +------+
+ content | ABC | | DEF | | GHI | | YXY | | QGC |
+ +--+---+ +--+---+ +--+---+ +--+---+ +--+---+
+ | | | | |
+ | | v | |
+ | | +--+---+ | |
+ | | | OSD1 | | |
+ | | +------+ | |
+ | | | |
+ | | +------+ | |
+ | +------>| OSD2 | | |
+ | +------+ | |
+ | | |
+ | +------+ | |
+ | | OSD3 |<----+ |
+ | +------+ |
+ | |
+ | +------+ |
+ | | OSD4 |<--------------+
+ | +------+
+ |
+ | +------+
+ +----------------->| OSD5 |
+ +------+
+
+
+More information can be found in the `erasure-code profiles
+<../erasure-code-profile>`_ documentation.
+
+
+Erasure Coding with Overwrites
+------------------------------
+
+By default, erasure-coded pools work only with operations that
+perform full object writes and appends (for example, RGW).
+
+Since Luminous, partial writes for an erasure-coded pool may be
+enabled with a per-pool setting. This lets RBD and CephFS store their
+data in an erasure-coded pool:
+
+.. prompt:: bash $
+
+ ceph osd pool set ec_pool allow_ec_overwrites true
+
+This can be enabled only on a pool residing on BlueStore OSDs, since
+BlueStore's checksumming is used during deep scrubs to detect bitrot
+or other corruption. Using Filestore with EC overwrites is not only
+unsafe, but it also results in lower performance compared to BlueStore.
+
+Erasure-coded pools do not support omap, so to use them with RBD and
+CephFS you must instruct them to store their data in an EC pool and
+their metadata in a replicated pool. For RBD, this means using the
+erasure-coded pool as the ``--data-pool`` during image creation:
+
+.. prompt:: bash $
+
+ rbd create --size 1G --data-pool ec_pool replicated_pool/image_name
+
+For CephFS, an erasure-coded pool can be set as the default data pool during
+file system creation or via `file layouts <../../../cephfs/file-layouts>`_.
+
+
+Erasure-coded pools and cache tiering
+-------------------------------------
+
+.. note:: Cache tiering is deprecated in Reef.
+
+Erasure-coded pools require more resources than replicated pools and
+lack some of the functionality supported by replicated pools (for example, omap).
+To overcome these limitations, one can set up a `cache tier <../cache-tiering>`_
+before setting up the erasure-coded pool.
+
+For example, if the pool *hot-storage* is made of fast storage, the following commands
+will place the *hot-storage* pool as a tier of *ecpool* in *writeback*
+mode:
+
+.. prompt:: bash $
+
+ ceph osd tier add ecpool hot-storage
+ ceph osd tier cache-mode hot-storage writeback
+ ceph osd tier set-overlay ecpool hot-storage
+
+The result is that every write and read to the *ecpool* actually uses
+the *hot-storage* pool and benefits from its flexibility and speed.
+
+More information can be found in the `cache tiering
+<../cache-tiering>`_ documentation. Note, however, that cache tiering
+is deprecated and may be removed completely in a future release.
+
+Erasure-coded pool recovery
+---------------------------
+If an erasure-coded pool loses any data shards, it must recover them from others.
+This recovery involves reading from the remaining shards, reconstructing the data, and
+writing new shards.
+
+In Octopus and later releases, erasure-coded pools can recover as long as there are at least *K* shards
+available. (With fewer than *K* shards, you have actually lost data!)
+
+Prior to Octopus, erasure-coded pools required that at least ``min_size`` shards be
+available, even if ``min_size`` was greater than ``K``. This was a conservative
+decision made out of an abundance of caution when designing the new pool
+mode. As a result, however, pools with lost OSDs but without complete data loss were
+unable to recover and go active without manual intervention to temporarily change
+the ``min_size`` setting.
+
+We recommend that ``min_size`` be ``K+1`` or greater to prevent loss of writes and
+loss of data.
+
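+For example, for a pool created with the default *k=2*, *m=2* profile (such
+as the ``ecpool`` above), this recommendation corresponds to a ``min_size``
+of 3 (K+1):
+
+.. prompt:: bash $
+
+   ceph osd pool set ecpool min_size 3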
+
+
+Glossary
+--------
+
+*chunk*
+   When the encoding function is called, it returns chunks that are all the same size. There are two
+   kinds of chunks: (1) *data chunks*, which can be concatenated to reconstruct the original object, and
+   (2) *coding chunks*, which can be used to rebuild a lost chunk.
+
+*K*
+   The number of data chunks into which an object is divided. For example, if *K* = 2, then a 10KB object
+   is divided into two chunks of 5KB each.
+
+*M*
+ The number of coding chunks computed by the encoding function. *M* is equal to the number of OSDs that can
+ be missing from the cluster without the cluster suffering data loss. For example, if there are two coding
+ chunks, then two OSDs can be missing without data loss.
+
+Table of contents
+-----------------
+
+.. toctree::
+ :maxdepth: 1
+
+ erasure-code-profile
+ erasure-code-jerasure
+ erasure-code-isa
+ erasure-code-lrc
+ erasure-code-shec
+ erasure-code-clay
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
new file mode 100644
index 000000000..d52465602
--- /dev/null
+++ b/doc/rados/operations/health-checks.rst
@@ -0,0 +1,1619 @@
+.. _health-checks:
+
+===============
+ Health checks
+===============
+
+Overview
+========
+
+There is a finite set of health messages that a Ceph cluster can raise. These
+messages are known as *health checks*. Each health check has a unique
+identifier.
+
+The identifier is a terse human-readable string -- that is, the identifier is
+readable in much the same way as a typical variable name. It is intended to
+enable tools (for example, UIs) to make sense of health checks and present them
+in a way that reflects their meaning.
+
+This page lists the health checks that are raised by the monitor and manager
+daemons. In addition to these, you might see health checks that originate
+from MDS daemons (see :ref:`cephfs-health-messages`), and health checks
+that are defined by ``ceph-mgr`` python modules.
+
+Definitions
+===========
+
+Monitor
+-------
+
+DAEMON_OLD_VERSION
+__________________
+
+Warn if one or more old versions of Ceph are running on any daemons. A health
+check is raised if multiple versions are detected. This condition must exist
+for a period of time greater than ``mon_warn_older_version_delay`` (set to one
+week by default) in order for the health check to be raised. This allows most
+upgrades to proceed without the occurrence of a false warning. If the upgrade
+is paused for an extended time period, ``health mute`` can be used by running
+``ceph health mute DAEMON_OLD_VERSION --sticky``. Be sure, however, to run
+``ceph health unmute DAEMON_OLD_VERSION`` after the upgrade has finished.
+
+MON_DOWN
+________
+
+One or more monitor daemons are currently down. The cluster requires a majority
+(more than one-half) of the monitors to be available. When one or more monitors
+are down, clients might have a harder time forming their initial connection to
+the cluster, as they might need to try more addresses before they reach an
+operating monitor.
+
+The down monitor daemon should be restarted as soon as possible to reduce the
+risk of a subsequent monitor failure leading to a service outage.
+
+MON_CLOCK_SKEW
+______________
+
+The clocks on the hosts running the ceph-mon monitor daemons are not
+well-synchronized. This health check is raised if the cluster detects a clock
+skew greater than ``mon_clock_drift_allowed``.
+
+This issue is best resolved by synchronizing the clocks by using a tool like
+``ntpd`` or ``chrony``.
+
+If it is impractical to keep the clocks closely synchronized, the
+``mon_clock_drift_allowed`` threshold can also be increased. However, this
+value must stay significantly below the ``mon_lease`` interval in order for the
+monitor cluster to function properly.
+
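+As an illustrative sketch (the value here is arbitrary and must be kept well
+below ``mon_lease``), the threshold can be raised as follows:
+
+.. prompt:: bash $
+
+   ceph config set mon mon_clock_drift_allowed 0.1
+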
+MON_MSGR2_NOT_ENABLED
+_____________________
+
+The :confval:`ms_bind_msgr2` option is enabled but one or more monitors are
+not configured to bind to a v2 port in the cluster's monmap. This
+means that features specific to the msgr2 protocol (for example, encryption)
+are unavailable on some or all connections.
+
+In most cases this can be corrected by running the following command:
+
+.. prompt:: bash $
+
+ ceph mon enable-msgr2
+
+After this command is run, any monitor configured to listen on the old default
+port (6789) will continue to listen for v1 connections on 6789 and begin to
+listen for v2 connections on the new default port 3300.
+
+If a monitor is configured to listen for v1 connections on a non-standard port
+(that is, a port other than 6789), then the monmap will need to be modified
+manually.
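+
+To see which addresses and protocol versions each monitor is currently bound
+to in the monmap, you can run:
+
+.. prompt:: bash $
+
+   ceph mon dump
+
+Monitors that list only a ``v1:`` address have not yet enabled msgr2.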
+
+
+MON_DISK_LOW
+____________
+
+One or more monitors are low on disk space. This health check is raised if the
+percentage of available space on the file system used by the monitor database
+(normally ``/var/lib/ceph/mon``) drops below the percentage value
+``mon_data_avail_warn`` (default: 30%).
+
+This alert might indicate that some other process or user on the system is
+filling up the file system used by the monitor. It might also
+indicate that the monitor database is too large (see ``MON_DISK_BIG``
+below).
+
+If space cannot be freed, the monitor's data directory might need to be
+moved to another storage device or file system (this relocation process must be carried out while the monitor
+daemon is not running).
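+
+For a quick check of free space from the monitor host, something like the
+following can be used (the data path shown is the usual default):
+
+.. prompt:: bash $
+
+   df -h /var/lib/ceph/mon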
+
+
+MON_DISK_CRIT
+_____________
+
+One or more monitors are critically low on disk space. This health check is raised if the
+percentage of available space on the file system used by the monitor database
+(normally ``/var/lib/ceph/mon``) drops below the percentage value
+``mon_data_avail_crit`` (default: 5%). See ``MON_DISK_LOW``, above.
+
+MON_DISK_BIG
+____________
+
+The database size for one or more monitors is very large. This health check is
+raised if the size of the monitor database is larger than
+``mon_data_size_warn`` (default: 15 GiB).
+
+A large database is unusual, but does not necessarily indicate a problem.
+Monitor databases might grow in size when there are placement groups that have
+not reached an ``active+clean`` state in a long time.
+
+This alert might also indicate that the monitor's database is not properly
+compacting, an issue that has been observed with some older versions of leveldb
+and rocksdb. Forcing a compaction with ``ceph daemon mon.<id> compact`` might
+shrink the database's on-disk size.
+
+This alert might also indicate that the monitor has a bug that prevents it from
+pruning the cluster metadata that it stores. If the problem persists, please
+report a bug.
+
+To adjust the warning threshold, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set global mon_data_size_warn <size>
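+
+To check the current on-disk size of a monitor's database, for example before
+and after a compaction, a sketch like the following can be used (the path
+shown assumes a default installation, a cluster named ``ceph``, and a monitor
+id of ``a``):
+
+.. prompt:: bash $
+
+   du -sh /var/lib/ceph/mon/ceph-a/store.db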
+
+
+AUTH_INSECURE_GLOBAL_ID_RECLAIM
+_______________________________
+
+One or more clients or daemons that are connected to the cluster are not
+securely reclaiming their ``global_id`` (a unique number that identifies each
+entity in the cluster) when reconnecting to a monitor. The client is being
+permitted to connect anyway because the
+``auth_allow_insecure_global_id_reclaim`` option is set to ``true`` (which may
+be necessary until all Ceph clients have been upgraded) and because the
+``auth_expose_insecure_global_id_reclaim`` option is set to ``true`` (which
+allows monitors to detect clients with "insecure reclaim" sooner by forcing
+those clients to reconnect immediately after their initial authentication).
+
+To identify which client(s) are using unpatched Ceph client code, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph health detail
+
+If you collect a dump of the clients that are connected to an individual
+monitor and examine the ``global_id_status`` field in the output of the dump,
+you can see the ``global_id`` reclaim behavior of those clients. Here
+``reclaim_insecure`` means that a client is unpatched and is contributing to
+this health check. To effect a client dump, run the following command:
+
+.. prompt:: bash $
+
+ ceph tell mon.\* sessions
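+
+For example, a minimal sketch of such a filter, assuming ``jq`` is available,
+a monitor named ``mon.a``, and the field names used by recent releases:
+
+.. prompt:: bash $
+
+   ceph tell mon.a sessions | jq -r '.[] | select(.global_id_status == "reclaim_insecure") | .entity_name'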
+
+We strongly recommend that all clients in the system be upgraded to a newer
+version of Ceph that correctly reclaims ``global_id`` values. After all clients
+have been updated, run the following command to stop allowing insecure
+reconnections:
+
+.. prompt:: bash $
+
+ ceph config set mon auth_allow_insecure_global_id_reclaim false
+
+If it is impractical to upgrade all clients immediately, you can temporarily
+silence this alert by running the following command:
+
+.. prompt:: bash $
+
+ ceph health mute AUTH_INSECURE_GLOBAL_ID_RECLAIM 1w # 1 week
+
+Although we do NOT recommend doing so, you can also disable this alert
+indefinitely by running the following command:
+
+.. prompt:: bash $
+
+ ceph config set mon mon_warn_on_insecure_global_id_reclaim false
+
+AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED
+_______________________________________
+
+Ceph is currently configured to allow clients that reconnect to monitors using
+an insecure process to reclaim their previous ``global_id``. Such reclaiming is
+allowed because, by default, ``auth_allow_insecure_global_id_reclaim`` is set
+to ``true``. It might be necessary to leave this setting enabled while existing
+Ceph clients are upgraded to newer versions of Ceph that correctly and securely
+reclaim their ``global_id``.
+
+If the ``AUTH_INSECURE_GLOBAL_ID_RECLAIM`` health check has not also been
+raised and if the ``auth_expose_insecure_global_id_reclaim`` setting has not
+been disabled (it is enabled by default), then there are currently no clients
+connected that need to be upgraded. In that case, it is safe to disable
+``insecure global_id reclaim`` by running the following command:
+
+.. prompt:: bash $
+
+ ceph config set mon auth_allow_insecure_global_id_reclaim false
+
+On the other hand, if there are still clients that need to be upgraded, then
+this alert can be temporarily silenced by running the following command:
+
+.. prompt:: bash $
+
+ ceph health mute AUTH_INSECURE_GLOBAL_ID_RECLAIM_ALLOWED 1w # 1 week
+
+Although we do NOT recommend doing so, you can also disable this alert indefinitely
+by running the following command:
+
+.. prompt:: bash $
+
+ ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false
+
+
+Manager
+-------
+
+MGR_DOWN
+________
+
+All manager daemons are currently down. The cluster should normally have at
+least one running manager (``ceph-mgr``) daemon. If no manager daemon is
+running, the cluster's ability to monitor itself will be compromised, and parts
+of the management API will become unavailable (for example, the dashboard will
+not work, and most CLI commands that report metrics or runtime state will
+block). However, the cluster will still be able to perform all I/O operations
+and to recover from failures.
+
+The "down" manager daemon should be restarted as soon as possible to ensure
+that the cluster can be monitored (for example, so that the ``ceph -s``
+information is up to date, or so that metrics can be scraped by Prometheus).
+
+
+MGR_MODULE_DEPENDENCY
+_____________________
+
+An enabled manager module is failing its dependency check. This health check
+typically comes with an explanatory message from the module about the problem.
+
+For example, a module might report that a required package is not installed: in
+this case, you should install the required package and restart your manager
+daemons.
+
+This health check is applied only to enabled modules. If a module is not
+enabled, you can see whether it is reporting dependency issues in the output of
+``ceph mgr module ls``.
+
+
+MGR_MODULE_ERROR
+________________
+
+A manager module has experienced an unexpected error. Typically, this means
+that an unhandled exception was raised from the module's ``serve`` function. The
+human-readable description of the error might be obscurely worded if the
+exception did not provide a useful description of itself.
+
+This health check might indicate a bug: please open a Ceph bug report if you
+think you have encountered a bug.
+
+However, if you believe the error is transient, you may restart your manager
+daemon(s) or use ``ceph mgr fail`` on the active daemon in order to force
+failover to another daemon.
+
+OSDs
+----
+
+OSD_DOWN
+________
+
+One or more OSDs are marked "down". The ceph-osd daemon might have been
+stopped, or peer OSDs might be unable to reach the OSD over the network.
+Common causes include a stopped or crashed daemon, a "down" host, or a network
+outage.
+
+Verify that the host is healthy, the daemon is started, and the network is
+functioning. If the daemon has crashed, the daemon log file
+(``/var/log/ceph/ceph-osd.*``) might contain debugging information.
+
+OSD_<crush type>_DOWN
+_____________________
+
+(for example, OSD_HOST_DOWN, OSD_ROOT_DOWN)
+
+All of the OSDs within a particular CRUSH subtree are marked "down" (for
+example, all OSDs on a host).
+
+OSD_ORPHAN
+__________
+
+An OSD is referenced in the CRUSH map hierarchy, but does not exist.
+
+To remove the OSD from the CRUSH map hierarchy, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd crush rm osd.<id>
+
+OSD_OUT_OF_ORDER_FULL
+_____________________
+
+The utilization thresholds for ``nearfull``, ``backfillfull``, ``full``, and/or
+``failsafe_full`` are not ascending. In particular, the following pattern is
+expected: ``nearfull < backfillfull``, ``backfillfull < full``, and ``full <
+failsafe_full``.
+
+To adjust these utilization thresholds, run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd set-nearfull-ratio <ratio>
+ ceph osd set-backfillfull-ratio <ratio>
+ ceph osd set-full-ratio <ratio>
+
+
+OSD_FULL
+________
+
+One or more OSDs have exceeded the `full` threshold and are preventing the
+cluster from servicing writes.
+
+To check utilization by pool, run the following command:
+
+.. prompt:: bash $
+
+ ceph df
+
+To see the currently defined `full` ratio, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd dump | grep full_ratio
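+
+The output typically resembles the following (the exact values reflect your
+configuration; those shown here are the usual defaults):
+
+::
+
+   full_ratio 0.95
+   backfillfull_ratio 0.9
+   nearfull_ratio 0.85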
+
+A short-term workaround to restore write availability is to raise the full
+threshold by a small amount. To do so, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd set-full-ratio <ratio>
+
+Additional OSDs should be deployed in order to add new storage to the cluster,
+or existing data should be deleted in order to free up space in the cluster.
+
+OSD_BACKFILLFULL
+________________
+
+One or more OSDs have exceeded the `backfillfull` threshold or *would* exceed
+it if the currently-mapped backfills were to finish, which will prevent data
+from rebalancing to this OSD. This alert is an early warning that
+rebalancing might be unable to complete and that the cluster is approaching
+full.
+
+To check utilization by pool, run the following command:
+
+.. prompt:: bash $
+
+ ceph df
+
+OSD_NEARFULL
+____________
+
+One or more OSDs have exceeded the `nearfull` threshold. This alert is an early
+warning that the cluster is approaching full.
+
+To check utilization by pool, run the following command:
+
+.. prompt:: bash $
+
+ ceph df
+
+OSDMAP_FLAGS
+____________
+
+One or more cluster flags of interest have been set. These flags include:
+
+* *full* - the cluster is flagged as full and cannot serve writes
+* *pauserd*, *pausewr* - there are paused reads or writes
+* *noup* - OSDs are not allowed to start
+* *nodown* - OSD failure reports are being ignored, which means that the
+  monitors will not mark OSDs "down"
+* *noin* - OSDs that were previously marked ``out`` are not being marked
+ back ``in`` when they start
+* *noout* - "down" OSDs are not automatically being marked ``out`` after the
+ configured interval
+* *nobackfill*, *norecover*, *norebalance* - recovery or data
+ rebalancing is suspended
+* *noscrub*, *nodeep_scrub* - scrubbing is disabled
+* *notieragent* - cache-tiering activity is suspended
+
+With the exception of *full*, these flags can be set or cleared by running the
+following commands:
+
+.. prompt:: bash $
+
+ ceph osd set <flag>
+ ceph osd unset <flag>
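+
+For example, a common maintenance pattern is to prevent "down" OSDs from being
+marked ``out`` while a host is rebooted:
+
+.. prompt:: bash $
+
+   ceph osd set noout
+   # ... perform the maintenance, then ...
+   ceph osd unset noout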
+
+OSD_FLAGS
+_________
+
+One or more OSDs, CRUSH nodes, or CRUSH device classes have a flag of interest
+set. These flags include:
+
+* *noup*: these OSDs are not allowed to start
+* *nodown*: failure reports for these OSDs will be ignored
+* *noin*: if these OSDs were previously marked ``out`` automatically
+ after a failure, they will not be marked ``in`` when they start
+* *noout*: if these OSDs are "down" they will not automatically be marked
+ ``out`` after the configured interval
+
+To set and clear these flags in batch, run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd set-group <flags> <who>
+ ceph osd unset-group <flags> <who>
+
+For example:
+
+.. prompt:: bash $
+
+ ceph osd set-group noup,noout osd.0 osd.1
+ ceph osd unset-group noup,noout osd.0 osd.1
+ ceph osd set-group noup,noout host-foo
+ ceph osd unset-group noup,noout host-foo
+ ceph osd set-group noup,noout class-hdd
+ ceph osd unset-group noup,noout class-hdd
+
+OLD_CRUSH_TUNABLES
+__________________
+
+The CRUSH map is using very old settings and should be updated. The oldest set
+of tunables that can be used (that is, the oldest client version that can
+connect to the cluster) without raising this health check is determined by the
+``mon_crush_min_required_version`` config option. For more information, see
+:ref:`crush-map-tunables`.
+
+OLD_CRUSH_STRAW_CALC_VERSION
+____________________________
+
+The CRUSH map is using an older, non-optimal method of calculating intermediate
+weight values for ``straw`` buckets.
+
+The CRUSH map should be updated to use the newer method (that is:
+``straw_calc_version=1``). For more information, see :ref:`crush-map-tunables`.
+
+CACHE_POOL_NO_HIT_SET
+_____________________
+
+One or more cache pools are not configured with a *hit set* to track
+utilization. This issue prevents the tiering agent from identifying cold
+objects that are to be flushed and evicted from the cache.
+
+To configure hit sets on the cache pool, run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd pool set <poolname> hit_set_type <type>
+ ceph osd pool set <poolname> hit_set_period <period-in-seconds>
+ ceph osd pool set <poolname> hit_set_count <number-of-hitsets>
+ ceph osd pool set <poolname> hit_set_fpp <target-false-positive-rate>
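+
+For example, a plausible starting configuration using a Bloom filter, with a
+hypothetical pool name (``hot-storage``) and illustrative values:
+
+.. prompt:: bash $
+
+   ceph osd pool set hot-storage hit_set_type bloom
+   ceph osd pool set hot-storage hit_set_period 14400
+   ceph osd pool set hot-storage hit_set_count 12
+   ceph osd pool set hot-storage hit_set_fpp 0.01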
+
+OSD_NO_SORTBITWISE
+__________________
+
+No pre-Luminous v12.y.z OSDs are running, but the ``sortbitwise`` flag has not
+been set.
+
+The ``sortbitwise`` flag must be set in order for OSDs running Luminous v12.y.z
+or newer to start. To safely set the flag, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd set sortbitwise
+
+OSD_FILESTORE
+__________________
+
+Warn if OSDs are running Filestore. The Filestore OSD back end has been
+deprecated; the BlueStore back end has been the default object store since the
+Ceph Luminous release.
+
+The ``mclock_scheduler`` is not supported for Filestore OSDs. For this reason,
+the default ``osd_op_queue`` is set to ``wpq`` for Filestore OSDs and is
+enforced even if the user attempts to change it.
+
+To list the OSDs that are running Filestore, run the following command:
+
+.. prompt:: bash $
+
+ ceph report | jq -c '."osd_metadata" | .[] | select(.osd_objectstore | contains("filestore")) | {id, osd_objectstore}'
+
+**In order to upgrade to Reef or a later release, you must first migrate any
+Filestore OSDs to BlueStore.**
+
+If you are upgrading a pre-Reef release to Reef or later, but it is not
+feasible to migrate Filestore OSDs to BlueStore immediately, you can
+temporarily silence this alert by running the following command:
+
+.. prompt:: bash $
+
+ ceph health mute OSD_FILESTORE
+
+Since this migration can take a considerable amount of time to complete, we
+recommend that you begin the process well in advance of any update to Reef or
+to later releases.
+
+POOL_FULL
+_________
+
+One or more pools have reached their quota and are no longer allowing writes.
+
+To see pool quotas and utilization, run the following command:
+
+.. prompt:: bash $
+
+ ceph df detail
+
+If you opt to raise the pool quota, run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd pool set-quota <poolname> max_objects <num-objects>
+ ceph osd pool set-quota <poolname> max_bytes <num-bytes>
+
+If not, delete some existing data to reduce utilization.
+
+BLUEFS_SPILLOVER
+________________
+
+One or more OSDs that use the BlueStore back end have been allocated `db`
+partitions (that is, storage space for metadata, normally on a faster device),
+but because that space has been filled, metadata has "spilled over" onto the
+slow device. This is not necessarily an error condition or even unexpected
+behavior, but may result in degraded performance. If the administrator had
+expected that all metadata would fit on the faster device, this alert indicates
+that not enough space was provided.
+
+To disable this alert on all OSDs, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set osd bluestore_warn_on_bluefs_spillover false
+
+Alternatively, to disable the alert on a specific OSD, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph config set osd.123 bluestore_warn_on_bluefs_spillover false
+
+To secure more metadata space, you can destroy and reprovision the OSD in
+question. This process involves data migration and recovery.
+
+It might also be possible to expand the LVM logical volume that backs the `db`
+storage. If the underlying LV has been expanded, you must stop the OSD daemon
+and inform BlueFS of the device-size change by running the following command:
+
+.. prompt:: bash $
+
+ ceph-bluestore-tool bluefs-bdev-expand --path /var/lib/ceph/osd/ceph-$ID
+
+BLUEFS_AVAILABLE_SPACE
+______________________
+
+To see how much space is free for BlueFS, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.123 bluestore bluefs available
+
+This will output up to three values: ``BDEV_DB free``, ``BDEV_SLOW free``, and
+``available_from_bluestore``. ``BDEV_DB`` and ``BDEV_SLOW`` report the amount
+of space that has been acquired by BlueFS and is now considered free. The value
+``available_from_bluestore`` indicates the ability of BlueStore to relinquish
+more space to BlueFS. It is normal for this value to differ from the amount of
+BlueStore free space, because the BlueFS allocation unit is typically larger
+than the BlueStore allocation unit. This means that only part of the BlueStore
+free space will be available for BlueFS.
+
+BLUEFS_LOW_SPACE
+_________________
+
+If BlueFS is running low on available free space and there is not much free
+space available from BlueStore (in other words, `available_from_bluestore` has
+a low value), consider reducing the BlueFS allocation unit size. To simulate
+available space when the allocation unit is different, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.123 bluestore bluefs available <alloc-unit-size>
+
+BLUESTORE_FRAGMENTATION
+_______________________
+
+As BlueStore operates, the free space on the underlying storage will become
+fragmented. This is normal and unavoidable, but excessive fragmentation causes
+slowdown. To inspect BlueStore fragmentation, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.123 bluestore allocator score block
+
+The fragmentation score is given in a [0-1] range:
+
+* [0.0 .. 0.4] tiny fragmentation
+* [0.4 .. 0.7] small, acceptable fragmentation
+* [0.7 .. 0.9] considerable, but safe fragmentation
+* [0.9 .. 1.0] severe fragmentation, might impact BlueFS's ability to get
+  space from BlueStore
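+
+For example, a small sketch to spot-check the score on several running OSDs
+from one host (the OSD ids shown are hypothetical):
+
+.. prompt:: bash $
+
+   for id in 0 1 2; do ceph daemon osd.$id bluestore allocator score block; done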
+
+To see a detailed report of free fragments, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.123 bluestore allocator dump block
+
+For OSD processes that are not currently running, fragmentation can be
+inspected with `ceph-bluestore-tool`. To see the fragmentation score, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-123 --allocator block free-score
+
+To dump detailed free chunks, run the following command:
+
+.. prompt:: bash $
+
+ ceph-bluestore-tool --path /var/lib/ceph/osd/ceph-123 --allocator block free-dump
+
+BLUESTORE_LEGACY_STATFS
+_______________________
+
+One or more OSDs have BlueStore volumes that were created prior to the
+Nautilus release. (In Nautilus, BlueStore tracks its internal usage
+statistics on a granular, per-pool basis.)
+
+If *all* OSDs are older than Nautilus, this means that the per-pool metrics
+are simply unavailable. But if there is a mixture of pre-Nautilus and
+post-Nautilus OSDs, the cluster usage statistics reported by ``ceph df`` will
+be inaccurate.
+
+The old OSDs can be updated to use the new usage-tracking scheme by stopping
+each OSD, running a repair operation, and then restarting the OSD. For example,
+to update ``osd.123``, run the following commands:
+
+.. prompt:: bash $
+
+ systemctl stop ceph-osd@123
+ ceph-bluestore-tool repair --path /var/lib/ceph/osd/ceph-123
+ systemctl start ceph-osd@123
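+
+If several OSDs on one host need the repair, the same procedure can be run in
+a loop; a sketch, assuming hypothetical OSD ids 10 through 12:
+
+.. prompt:: bash $
+
+   for id in 10 11 12; do
+       systemctl stop ceph-osd@$id
+       ceph-bluestore-tool repair --path /var/lib/ceph/osd/ceph-$id
+       systemctl start ceph-osd@$id
+   done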
+
+To disable this alert, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set global bluestore_warn_on_legacy_statfs false
+
+BLUESTORE_NO_PER_POOL_OMAP
+__________________________
+
+One or more OSDs have volumes that were created prior to the Octopus release.
+(In Octopus and later releases, BlueStore tracks omap space utilization by
+pool.)
+
+If there are any BlueStore OSDs that do not have the new tracking enabled, the
+cluster will report an approximate value for per-pool omap usage based on the
+most recent deep scrub.
+
+The OSDs can be updated to track by pool by stopping each OSD, running a repair
+operation, and then restarting the OSD. For example, to update ``osd.123``, run
+the following commands:
+
+.. prompt:: bash $
+
+ systemctl stop ceph-osd@123
+ ceph-bluestore-tool repair --path /var/lib/ceph/osd/ceph-123
+ systemctl start ceph-osd@123
+
+To disable this alert, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set global bluestore_warn_on_no_per_pool_omap false
+
+BLUESTORE_NO_PER_PG_OMAP
+__________________________
+
+One or more OSDs have volumes that were created prior to the Pacific release.
+(In Pacific and later releases, BlueStore tracks omap space utilization by
+Placement Group (PG).)
+
+Per-PG omap allows faster PG removal when PGs migrate.
+
+The older OSDs can be updated to track by PG by stopping each OSD, running a
+repair operation, and then restarting the OSD. For example, to update
+``osd.123``, run the following commands:
+
+.. prompt:: bash $
+
+ systemctl stop ceph-osd@123
+ ceph-bluestore-tool repair --path /var/lib/ceph/osd/ceph-123
+ systemctl start ceph-osd@123
+
+To disable this alert, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set global bluestore_warn_on_no_per_pg_omap false
+
+
+BLUESTORE_DISK_SIZE_MISMATCH
+____________________________
+
+One or more BlueStore OSDs have an internal inconsistency between the size of
+the physical device and the metadata that tracks its size. This inconsistency
+can lead to the OSD(s) crashing in the future.
+
+The OSDs that have this inconsistency should be destroyed and reprovisioned. Be
+very careful to execute this procedure on only one OSD at a time, so as to
+minimize the risk of losing any data. To execute this procedure, where ``$N``
+is the OSD that has the inconsistency, run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd out osd.$N
+ while ! ceph osd safe-to-destroy osd.$N ; do sleep 1m ; done
+ ceph osd destroy osd.$N
+ ceph-volume lvm zap /path/to/device
+ ceph-volume lvm create --osd-id $N --data /path/to/device
+
+.. note::
+
+   Wait for this recovery procedure to complete on one OSD before running it
+   on the next.
+
+BLUESTORE_NO_COMPRESSION
+________________________
+
+One or more OSDs are unable to load a BlueStore compression plugin. This issue
+might be caused by a broken installation, in which the ``ceph-osd`` binary does
+not match the compression plugins. Or it might be caused by a recent upgrade in
+which the ``ceph-osd`` daemon was not restarted.
+
+To resolve this issue, verify that all of the packages on the host that is
+running the affected OSD(s) are correctly installed and that the OSD daemon(s)
+have been restarted. If the problem persists, check the OSD log for information
+about the source of the problem.
+
+BLUESTORE_SPURIOUS_READ_ERRORS
+______________________________
+
+One or more BlueStore OSDs have detected spurious read errors on the main device.
+BlueStore has recovered from these errors by retrying disk reads. This alert
+might indicate issues with underlying hardware, issues with the I/O subsystem,
+or something similar. In theory, such issues can cause permanent data
+corruption. Some observations on the root cause of spurious read errors can be
+found here: https://tracker.ceph.com/issues/22464
+
+This alert does not require an immediate response, but the affected host might
+need additional attention: for example, upgrading the host to the latest
+OS/kernel versions and implementing hardware-resource-utilization monitoring.
+
+To disable this alert on all OSDs, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set osd bluestore_warn_on_spurious_read_errors false
+
+Or, to disable this alert on a specific OSD, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set osd.123 bluestore_warn_on_spurious_read_errors false
+
+Device health
+-------------
+
+DEVICE_HEALTH
+_____________
+
+One or more OSD devices are expected to fail soon, where the warning threshold
+is determined by the ``mgr/devicehealth/warn_threshold`` config option.
+
+Because this alert applies only to OSDs that are currently marked ``in``, the
+appropriate response to this expected failure is (1) to mark the OSD ``out`` so
+that data is migrated off of the OSD, and then (2) to remove the hardware from
+the system. Note that this marking ``out`` is normally done automatically if
+``mgr/devicehealth/self_heal`` is enabled (as determined by
+``mgr/devicehealth/mark_out_threshold``).
+
+To check device health, run the following command:
+
+.. prompt:: bash $
+
+ ceph device info <device-id>
+
+Device life expectancy is set either by a prediction model that the mgr runs or
+by an external tool that is activated by running the following command:
+
+.. prompt:: bash $
+
+ ceph device set-life-expectancy <device-id> <from> <to>
+
+You can change the stored life expectancy manually, but such a change usually
+doesn't accomplish anything. The reason for this is that whichever tool
+originally set the stored life expectancy will probably undo your change by
+setting it again, and a change to the stored value does not affect the actual
+health of the hardware device.
+
+DEVICE_HEALTH_IN_USE
+____________________
+
+One or more devices (that is, OSDs) are expected to fail soon and have been
+marked ``out`` of the cluster (as controlled by
+``mgr/devicehealth/mark_out_threshold``), but they are still participating in
+one or more Placement Groups. This might be because the OSD(s) were marked
+``out`` only recently and data is still migrating, or because data cannot be
+migrated off of the OSD(s) for some reason (for example, the cluster is nearly
+full, or the CRUSH hierarchy is structured so that there isn't another suitable
+OSD to migrate the data to).
+
+This message can be silenced by disabling self-heal behavior (that is, setting
+``mgr/devicehealth/self_heal`` to ``false``), by adjusting
+``mgr/devicehealth/mark_out_threshold``, or by addressing whichever condition
+is preventing data from being migrated off of the ailing OSD(s).
+
+.. _rados_health_checks_device_health_toomany:
+
+DEVICE_HEALTH_TOOMANY
+_____________________
+
+Too many devices (that is, OSDs) are expected to fail soon, and because
+``mgr/devicehealth/self_heal`` behavior is enabled, marking ``out`` all of the
+ailing OSDs would exceed the cluster's ``mon_osd_min_in_ratio`` ratio. This
+ratio prevents a cascade of too many OSDs from being automatically marked
+``out``.
+
+You should promptly add new OSDs to the cluster to prevent data loss, or
+incrementally replace the failing OSDs.
+
+Alternatively, you can silence this health check by adjusting options including
+``mon_osd_min_in_ratio`` or ``mgr/devicehealth/mark_out_threshold``. Be
+warned, however, that this will increase the likelihood of unrecoverable data
+loss.
+
+
+Data health (pools & placement groups)
+--------------------------------------
+
+PG_AVAILABILITY
+_______________
+
+Data availability is reduced. In other words, the cluster is unable to service
+potential read or write requests for at least some data in the cluster. More
+precisely, one or more Placement Groups (PGs) are in a state that does not
+allow I/O requests to be serviced. Any of the following PG states are
+problematic if they do not clear quickly: *peering*, *stale*, *incomplete*, and
+the lack of *active*.
+
+For detailed information about which PGs are affected, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph health detail
+
+In most cases, the root cause of this issue is that one or more OSDs are
+currently ``down``: see ``OSD_DOWN`` above.
+
+To see the state of a specific problematic PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph tell <pgid> query
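+
+It can also be useful to list only the PGs that are stuck in an inactive
+state, for example:
+
+.. prompt:: bash $
+
+   ceph pg dump_stuck inactive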
+
+PG_DEGRADED
+___________
+
+Data redundancy is reduced for some data: in other words, the cluster does not
+have the desired number of replicas for all data (in the case of replicated
+pools) or erasure code fragments (in the case of erasure-coded pools). More
+precisely, one or more Placement Groups (PGs):
+
+* have the *degraded* or *undersized* flag set, which means that there are not
+ enough instances of that PG in the cluster; or
+* have not had the *clean* state set for a long time.
+
+For detailed information about which PGs are affected, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph health detail
+
+In most cases, the root cause of this issue is that one or more OSDs are
+currently "down": see ``OSD_DOWN`` above.
+
+To see the state of a specific problematic PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph tell <pgid> query
+
+
+PG_RECOVERY_FULL
+________________
+
+Data redundancy might be reduced or even put at risk for some data due to a
+lack of free space in the cluster. More precisely, one or more Placement Groups
+have the *recovery_toofull* flag set, which means that the cluster is unable to
+migrate or recover data because one or more OSDs are above the ``full``
+threshold.
+
+For steps to resolve this condition, see *OSD_FULL* above.
+
+PG_BACKFILL_FULL
+________________
+
+Data redundancy might be reduced or even put at risk for some data due to a
+lack of free space in the cluster. More precisely, one or more Placement Groups
+have the *backfill_toofull* flag set, which means that the cluster is unable to
+migrate or recover data because one or more OSDs are above the ``backfillfull``
+threshold.
+
+For steps to resolve this condition, see *OSD_BACKFILLFULL* above.
+
+PG_DAMAGED
+__________
+
+Data scrubbing has discovered problems with data consistency in the cluster.
+More precisely, one or more Placement Groups either (1) have the *inconsistent*
+or ``snaptrim_error`` flag set, which indicates that an earlier data scrub
+operation found a problem, or (2) have the *repair* flag set, which means that
+a repair for such an inconsistency is currently in progress.
+
+For more information, see :doc:`pg-repair`.
+
+OSD_SCRUB_ERRORS
+________________
+
+Recent OSD scrubs have discovered inconsistencies. This alert is generally
+paired with *PG_DAMAGED* (see above).
+
+For more information, see :doc:`pg-repair`.
+
+OSD_TOO_MANY_REPAIRS
+____________________
+
+The count of read repairs has exceeded the config value threshold
+``mon_osd_warn_num_repaired`` (default: ``10``). Because scrub handles errors
+only for data at rest, and because any read error that occurs when another
+replica is available will be repaired immediately so that the client can get
+the object data, there might exist failing disks that are not registering any
+scrub errors. This repair count is maintained as a way of identifying any such
+failing disks.
+
+
+LARGE_OMAP_OBJECTS
+__________________
+
+One or more pools contain large omap objects, as determined by
+``osd_deep_scrub_large_omap_object_key_threshold`` (threshold for the number of
+keys to determine what is considered a large omap object) or
+``osd_deep_scrub_large_omap_object_value_sum_threshold`` (the threshold for the
+summed size in bytes of all key values to determine what is considered a large
+omap object) or both. To find more information on object name, key count, and
+size in bytes, search the cluster log for 'Large omap object found'. This issue
+can be caused by RGW-bucket index objects that do not have automatic resharding
+enabled. For more information on resharding, see :ref:`RGW Dynamic Bucket Index
+Resharding <rgw_dynamic_bucket_index_resharding>`.
+
+To adjust the thresholds mentioned above, run the following commands:
+
+.. prompt:: bash $
+
+ ceph config set osd osd_deep_scrub_large_omap_object_key_threshold <keys>
+ ceph config set osd osd_deep_scrub_large_omap_object_value_sum_threshold <bytes>
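+
+For example, to find the offending objects in the cluster log on a monitor
+host (the log path shown assumes a default installation):
+
+.. prompt:: bash $
+
+   grep 'Large omap object found' /var/log/ceph/ceph.log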
+
+CACHE_POOL_NEAR_FULL
+____________________
+
+A cache-tier pool is nearly full, as determined by the ``target_max_bytes`` and
+``target_max_objects`` properties of the cache pool. Once the pool reaches the
+target threshold, write requests to the pool might block while data is flushed
+and evicted from the cache. This state normally leads to very high latencies
+and poor performance.
+
+To adjust the cache pool's target size, run the following commands:
+
+.. prompt:: bash $
+
+ ceph osd pool set <cache-pool-name> target_max_bytes <bytes>
+ ceph osd pool set <cache-pool-name> target_max_objects <objects>
+
+There might be other reasons that normal cache flush and evict activity are
+throttled: for example, reduced availability of the base tier, reduced
+performance of the base tier, or overall cluster load.
+
+TOO_FEW_PGS
+___________
+
+The number of Placement Groups (PGs) that are in use in the cluster is below
+the configurable threshold of ``mon_pg_warn_min_per_osd`` PGs per OSD. This can
+lead to suboptimal distribution and suboptimal balance of data across the OSDs
+in the cluster, and a reduction of overall performance.
+
+If data pools have not yet been created, this condition is expected.
+
+To address this issue, you can increase the PG count for existing pools or
+create new pools. For more information, see
+:ref:`choosing-number-of-placement-groups`.
+
+POOL_PG_NUM_NOT_POWER_OF_TWO
+____________________________
+
+One or more pools have a ``pg_num`` value that is not a power of two. Although
+this is not strictly incorrect, it does lead to a less balanced distribution of
+data because some Placement Groups will have roughly twice as much data as
+others have.
+
+This is easily corrected by setting the ``pg_num`` value for the affected
+pool(s) to a nearby power of two. To do so, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_num <value>
+
+To disable this health check, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set global mon_warn_on_pool_pg_num_not_power_of_two false
+
+POOL_TOO_FEW_PGS
+________________
+
+One or more pools should probably have more Placement Groups (PGs), given the
+amount of data that is currently stored in the pool. This issue can lead to
+suboptimal distribution and suboptimal balance of data across the OSDs in the
+cluster, and a reduction of overall performance. This alert is raised only if
+the ``pg_autoscale_mode`` property on the pool is set to ``warn``.
+
+To disable the alert, entirely disable auto-scaling of PGs for the pool by
+running the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_autoscale_mode off
+
+To allow the cluster to automatically adjust the number of PGs for the pool,
+run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_autoscale_mode on
+
+Alternatively, to manually set the number of PGs for the pool to the
+recommended amount, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_num <new-pg-num>
+
+For more information, see :ref:`choosing-number-of-placement-groups` and
+:ref:`pg-autoscaler`.
+
+TOO_MANY_PGS
+____________
+
+The number of Placement Groups (PGs) in use in the cluster is above the
+configurable threshold of ``mon_max_pg_per_osd`` PGs per OSD. If this threshold
+is exceeded, the cluster will not allow new pools to be created, pool ``pg_num``
+to be increased, or pool replication to be increased (any of which, if allowed,
+would lead to more PGs in the cluster). A large number of PGs can lead to
+higher memory utilization for OSD daemons, slower peering after cluster state
+changes (for example, OSD restarts, additions, or removals), and higher load on
+the Manager and Monitor daemons.
+
+The simplest way to mitigate the problem is to increase the number of OSDs in
+the cluster by adding more hardware. Note that, because the OSD count that is
+used for the purposes of this health check is the number of ``in`` OSDs,
+marking ``out`` OSDs ``in`` (if there are any ``out`` OSDs available) can also
+help. To do so, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd in <osd id(s)>
+
+For more information, see :ref:`choosing-number-of-placement-groups`.
+
+POOL_TOO_MANY_PGS
+_________________
+
+One or more pools should probably have fewer Placement Groups (PGs), given the
+amount of data that is currently stored in the pool. This issue can lead to
+higher memory utilization for OSD daemons, slower peering after cluster state
+changes (for example, OSD restarts, additions, or removals), and higher load on
+the Manager and Monitor daemons. This alert is raised only if the
+``pg_autoscale_mode`` property on the pool is set to ``warn``.
+
+To disable the alert, entirely disable auto-scaling of PGs for the pool by
+running the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_autoscale_mode off
+
+To allow the cluster to automatically adjust the number of PGs for the pool,
+run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_autoscale_mode on
+
+Alternatively, to manually set the number of PGs for the pool to the
+recommended amount, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> pg_num <new-pg-num>
+
+For more information, see :ref:`choosing-number-of-placement-groups` and
+:ref:`pg-autoscaler`.
+
+
+POOL_TARGET_SIZE_BYTES_OVERCOMMITTED
+____________________________________
+
+One or more pools have a ``target_size_bytes`` property that is set in order to
+estimate the expected size of the pool, but the value(s) of this property are
+greater than the total available storage (either by themselves or in
+combination with other pools).
+
+This alert is usually an indication that the ``target_size_bytes`` value for
+the pool is too large and should be reduced or set to zero. To reduce the
+``target_size_bytes`` value or set it to zero, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> target_size_bytes 0
+
+The above command sets the value of ``target_size_bytes`` to zero. To set the
+value of ``target_size_bytes`` to a non-zero value, replace the ``0`` with that
+non-zero value.
+
+For more information, see :ref:`specifying_pool_target_size`.
+
+POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO
+____________________________________
+
+One or more pools have both ``target_size_bytes`` and ``target_size_ratio`` set
+in order to estimate the expected size of the pool. Only one of these
+properties should be non-zero. If both are set to a non-zero value, then
+``target_size_ratio`` takes precedence and ``target_size_bytes`` is ignored.
+
+To reset ``target_size_bytes`` to zero, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool-name> target_size_bytes 0
+
+For more information, see :ref:`specifying_pool_target_size`.
+
+TOO_FEW_OSDS
+____________
+
+The number of OSDs in the cluster is below the configurable threshold of
+``osd_pool_default_size``. This means that the cluster might be unable to
+satisfy the data protection policy specified in CRUSH rules and pool settings
+for some or all data.
+
+SMALLER_PGP_NUM
+_______________
+
+One or more pools have a ``pgp_num`` value less than ``pg_num``. This alert is
+normally an indication that the Placement Group (PG) count was increased
+without a corresponding increase in ``pgp_num``, which governs data placement.
+
+This disparity is sometimes brought about deliberately, in order to separate
+out the `split` step when the PG count is adjusted from the data migration that
+is needed when ``pgp_num`` is changed.
+
+This issue is normally resolved by setting ``pgp_num`` to match ``pg_num``, so
+as to trigger the data migration, by running the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set <pool> pgp_num <pg-num-value>
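+
+To compare the two values for a pool before making the change, you can run,
+for example:
+
+.. prompt:: bash $
+
+   ceph osd pool get <pool> pg_num
+   ceph osd pool get <pool> pgp_num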
+
+MANY_OBJECTS_PER_PG
+___________________
+
+One or more pools have an average number of objects per Placement Group (PG)
+that is significantly higher than the overall cluster average. The specific
+threshold is determined by the ``mon_pg_warn_max_object_skew`` configuration
+value.
+
+This alert is usually an indication that the pool(s) that contain most of the
+data in the cluster have too few PGs, or that other pools that contain less
+data have too many PGs. See *TOO_MANY_PGS* above.
+
+To silence the health check, raise the threshold by adjusting the
+``mon_pg_warn_max_object_skew`` config option on the managers.
+
+The health check will be silenced for a specific pool only if
+``pg_autoscale_mode`` is set to ``on``.
+
+POOL_APP_NOT_ENABLED
+____________________
+
+A pool exists but the pool has not been tagged for use by a particular
+application.
+
+To resolve this issue, tag the pool for use by an application. For
+example, if the pool is used by RBD, run the following command:
+
+.. prompt:: bash $
+
+ rbd pool init <poolname>
+
+Alternatively, if the pool is being used by a custom application (here 'foo'),
+you can label the pool by running the following low-level command:
+
+.. prompt:: bash $
+
+   ceph osd pool application enable <poolname> foo
+
+For more information, see :ref:`associate-pool-to-application`.
+
+POOL_FULL
+_________
+
+One or more pools have reached (or are very close to reaching) their quota. The
+threshold to raise this health check is determined by the
+``mon_pool_quota_crit_threshold`` configuration option.
+
+Pool quotas can be adjusted up or down (or removed) by running the following
+commands:
+
+.. prompt:: bash $
+
+ ceph osd pool set-quota <pool> max_bytes <bytes>
+ ceph osd pool set-quota <pool> max_objects <objects>
+
+To disable a quota, set the quota value to 0.
+
+POOL_NEAR_FULL
+______________
+
+One or more pools are approaching a configured fullness threshold.
+
+One of the several thresholds that can raise this health check is determined by
+the ``mon_pool_quota_warn_threshold`` configuration option.
+
+Pool quotas can be adjusted up or down (or removed) by running the following
+commands:
+
+.. prompt:: bash $
+
+ ceph osd pool set-quota <pool> max_bytes <bytes>
+ ceph osd pool set-quota <pool> max_objects <objects>
+
+To disable a quota, set the quota value to 0.
+
+Other thresholds that can raise the two health checks above are
+``mon_osd_nearfull_ratio`` and ``mon_osd_full_ratio``. For details and
+resolution, see :ref:`storage-capacity` and :ref:`no-free-drive-space`.
+
+OBJECT_MISPLACED
+________________
+
+One or more objects in the cluster are not stored on the node that CRUSH would
+prefer that they be stored on. This alert is an indication that data migration
+due to a recent cluster change has not yet completed.
+
+Misplaced data is not a dangerous condition in and of itself; data consistency
+is never at risk, and old copies of objects will not be removed until the
+desired number of new copies (in the desired locations) has been created.
+
+OBJECT_UNFOUND
+______________
+
+One or more objects in the cluster cannot be found. More precisely, the OSDs
+know that a new or updated copy of an object should exist, but no such copy has
+been found on OSDs that are currently online.
+
+Read or write requests to unfound objects will block.
+
+Ideally, a "down" OSD that has a more recent copy of the unfound object can be
+brought back online. To identify candidate OSDs, check the peering state of the
+PG(s) responsible for the unfound object. To see the peering state, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph tell <pgid> query
+
+On the other hand, if the latest copy of the object is not available, the
+cluster can be told to roll back to a previous version of the object. For more
+information, see :ref:`failures-osd-unfound`.
+
+SLOW_OPS
+________
+
+One or more OSD requests or monitor requests are taking a long time to process.
+This alert might be an indication of extreme load, a slow storage device, or a
+software bug.
+
+To query the request queue for the daemon that is causing the slowdown, run the
+following command from the daemon's host:
+
+.. prompt:: bash $
+
+ ceph daemon osd.<id> ops
+
+To see a summary of the slowest recent requests, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon osd.<id> dump_historic_ops
+
+To see the location of a specific OSD, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd find osd.<id>
+
+PG_NOT_SCRUBBED
+_______________
+
+One or more Placement Groups (PGs) have not been scrubbed recently. PGs are
+normally scrubbed within an interval determined by
+:confval:`osd_scrub_max_interval` globally. This interval can be overridden on
+per-pool basis by changing the value of the variable
+:confval:`scrub_max_interval`. This health check is raised if a certain
+percentage (determined by ``mon_warn_pg_not_scrubbed_ratio``) of the interval
+has elapsed after the time the scrub was scheduled and no scrub has been
+performed.
+
+PGs will be scrubbed only if they are flagged as ``clean`` (which means that
+they are to be cleaned, and not that they have been examined and found to be
+clean). Misplaced or degraded PGs will not be flagged as ``clean`` (see
+*PG_AVAILABILITY* and *PG_DEGRADED* above).
+
+To manually initiate a scrub of a clean PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg scrub <pgid>
+
+PG_NOT_DEEP_SCRUBBED
+____________________
+
+One or more Placement Groups (PGs) have not been deep scrubbed recently. PGs
+are normally deep scrubbed at intervals no longer than
+:confval:`osd_deep_scrub_interval` seconds.
+This health check is raised if a certain percentage (determined by
+``mon_warn_pg_not_deep_scrubbed_ratio``) of the interval has elapsed after the
+time the scrub was scheduled and no scrub has been performed.
+
+PGs will receive a deep scrub only if they are flagged as *clean* (which means
+that they are to be cleaned, and not that they have been examined and found to
+be clean). Misplaced or degraded PGs might not be flagged as ``clean`` (see
+*PG_AVAILABILITY* and *PG_DEGRADED* above).
+
+To manually initiate a deep scrub of a clean PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg deep-scrub <pgid>
+
+
+PG_SLOW_SNAP_TRIMMING
+_____________________
+
+The snapshot trim queue for one or more PGs has exceeded the configured warning
+threshold. This alert indicates either that an extremely large number of
+snapshots was recently deleted, or that OSDs are unable to trim snapshots
+quickly enough to keep up with the rate of new snapshot deletions.
+
+The warning threshold is determined by the ``mon_osd_snap_trim_queue_warn_on``
+option (default: 32768).
+
+This alert might be raised if OSDs are under excessive load and unable to keep
+up with their background work, or if the OSDs' internal metadata database is
+heavily fragmented and unable to perform. The alert might also indicate some
+other performance issue with the OSDs.
+
+The exact size of the snapshot trim queue is reported by the ``snaptrimq_len``
+field of ``ceph pg ls -f json-detail``.
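+
+For example, a minimal sketch that sorts PGs by the length of their snapshot
+trim queues, assuming ``jq`` is available and that per-PG entries appear under
+``pg_stats`` (as in recent releases):
+
+.. prompt:: bash $
+
+   ceph pg ls -f json-detail | jq -r '.pg_stats[] | "\(.pgid) \(.snaptrimq_len)"' | sort -k2 -rn | head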
+
+Stretch Mode
+------------
+
+INCORRECT_NUM_BUCKETS_STRETCH_MODE
+__________________________________
+
+Stretch mode currently supports only 2 dividing buckets that contain OSDs.
+This warning is raised when the number of dividing buckets is not equal to 2
+after stretch mode is enabled. You can expect unpredictable failures and MON
+assertions until the condition is fixed.
+
+We encourage you to fix this by removing additional dividing buckets or by
+bumping the number of dividing buckets to 2.
+
+UNEVEN_WEIGHTS_STRETCH_MODE
+___________________________
+
+The 2 dividing buckets must have equal weights when stretch mode is enabled.
+This warning suggests that the 2 dividing buckets have uneven weights after
+stretch mode is enabled. This is not immediately fatal; however, you can expect
+Ceph to be confused when trying to process transitions between dividing buckets.
+
+We encourage you to fix this by making the weights even on both dividing buckets.
+This can be done by making sure that the combined weight of the OSDs on each
+dividing bucket is the same.
+
+Miscellaneous
+-------------
+
+RECENT_CRASH
+____________
+
+One or more Ceph daemons have crashed recently, and the crash(es) have not yet
+been acknowledged and archived by the administrator. This alert might indicate
+a software bug, a hardware problem (for example, a failing disk), or some other
+problem.
+
+To list recent crashes, run the following command:
+
+.. prompt:: bash $
+
+ ceph crash ls-new
+
+To examine information about a specific crash, run the following command:
+
+.. prompt:: bash $
+
+ ceph crash info <crash-id>
+
+To silence this alert, you can archive the crash (perhaps after the crash
+has been examined by an administrator) by running the following command:
+
+.. prompt:: bash $
+
+ ceph crash archive <crash-id>
+
+Similarly, to archive all recent crashes, run the following command:
+
+.. prompt:: bash $
+
+ ceph crash archive-all
+
+Archived crashes will still be visible by running the command ``ceph crash
+ls``, but not by running the command ``ceph crash ls-new``.
+
+The time period that is considered recent is determined by the option
+``mgr/crash/warn_recent_interval`` (default: two weeks).
+
+To entirely disable this alert, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set mgr/crash/warn_recent_interval 0
+
+RECENT_MGR_MODULE_CRASH
+_______________________
+
+One or more ``ceph-mgr`` modules have crashed recently, and the crash(es) have
+not yet been acknowledged and archived by the administrator. This alert
+usually indicates a software bug in one of the software modules that are
+running inside the ``ceph-mgr`` daemon. The module that experienced the problem
+might be disabled as a result, but other modules are unaffected and continue to
+function as expected.
+
+As with the *RECENT_CRASH* health check, a specific crash can be inspected by
+running the following command:
+
+.. prompt:: bash $
+
+ ceph crash info <crash-id>
+
+To silence this alert, you can archive the crash (perhaps after the crash has
+been examined by an administrator) by running the following command:
+
+.. prompt:: bash $
+
+ ceph crash archive <crash-id>
+
+Similarly, to archive all recent crashes, run the following command:
+
+.. prompt:: bash $
+
+ ceph crash archive-all
+
+Archived crashes will still be visible by running the command ``ceph crash ls``
+but not by running the command ``ceph crash ls-new``.
+
+The time period that is considered recent is determined by the option
+``mgr/crash/warn_recent_interval`` (default: two weeks).
+
+To entirely disable this alert, run the following command:
+
+.. prompt:: bash $
+
+ ceph config set mgr/crash/warn_recent_interval 0
+
+TELEMETRY_CHANGED
+_________________
+
+Telemetry has been enabled, but because the contents of the telemetry report
+have changed in the meantime, telemetry reports will not be sent.
+
+Ceph developers occasionally revise the telemetry feature to include new and
+useful information, or to remove information found to be useless or sensitive.
+If any new information is included in the report, Ceph requires the
+administrator to re-enable telemetry. This requirement ensures that the
+administrator has an opportunity to (re)review the information that will be
+shared.
+
+To review the contents of the telemetry report, run the following command:
+
+.. prompt:: bash $
+
+ ceph telemetry show
+
+Note that the telemetry report consists of several channels that may be
+independently enabled or disabled. For more information, see :ref:`telemetry`.
+
+To re-enable telemetry (and silence the alert), run the following command:
+
+.. prompt:: bash $
+
+ ceph telemetry on
+
+To disable telemetry (and silence the alert), run the following command:
+
+.. prompt:: bash $
+
+ ceph telemetry off
+
+AUTH_BAD_CAPS
+_____________
+
+One or more auth users have capabilities that cannot be parsed by the monitors.
+As a general rule, this alert indicates that there are one or more daemon types
+for which the user is not authorized to perform any action.
+
+This alert is most likely to be raised after an upgrade if (1) the capabilities
+were set with an older version of Ceph that did not properly validate the
+syntax of those capabilities, or if (2) the syntax of the capabilities has
+changed.
+
+To remove the user(s) in question, run the following command:
+
+.. prompt:: bash $
+
+ ceph auth rm <entity-name>
+
+(This resolves the health check, but it prevents clients from being able to
+authenticate as the removed user.)
+
+Alternatively, to update the capabilities for the user(s), run the following
+command:
+
+.. prompt:: bash $
+
+   ceph auth caps <entity-name> <daemon-type> <caps> [<daemon-type> <caps> ...]
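+
+For example, to reset a hypothetical ``client.foo`` to a simple, valid set of
+capabilities (the pool name is also hypothetical):
+
+.. prompt:: bash $
+
+   ceph auth caps client.foo mon 'allow r' osd 'allow rw pool=foo_pool'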
+
+For more information about auth capabilities, see :ref:`user-management`.
+
+OSD_NO_DOWN_OUT_INTERVAL
+________________________
+
+The ``mon_osd_down_out_interval`` option is set to zero, which means that the
+system does not automatically perform any repair or healing operations when an
+OSD fails. Instead, an administrator or an external orchestrator must manually
+mark "down" OSDs as ``out`` (by running ``ceph osd out <osd-id>``) in order to
+trigger recovery.
+
+This option is normally set to five or ten minutes, which should be enough time
+for a host to power-cycle or reboot.
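+
+For example, to restore the common default of ten minutes (600 seconds):
+
+.. prompt:: bash $
+
+   ceph config set mon mon_osd_down_out_interval 600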
+
+To silence this alert, set ``mon_warn_on_osd_down_out_interval_zero`` to
+``false`` by running the following command:
+
+.. prompt:: bash $
+
+   ceph config set global mon_warn_on_osd_down_out_interval_zero false
+
+DASHBOARD_DEBUG
+_______________
+
+The Dashboard debug mode is enabled. This means that if there is an error while
+processing a REST API request, the HTTP error response will contain a Python
+traceback. This mode should be disabled in production environments because such
+a traceback might contain and expose sensitive information.
+
+To disable the debug mode, run the following command:
+
+.. prompt:: bash $
+
+ ceph dashboard debug disable
diff --git a/doc/rados/operations/index.rst b/doc/rados/operations/index.rst
new file mode 100644
index 000000000..15525c1d3
--- /dev/null
+++ b/doc/rados/operations/index.rst
@@ -0,0 +1,99 @@
+.. _rados-operations:
+
+====================
+ Cluster Operations
+====================
+
+.. raw:: html
+
+ <table><colgroup><col width="50%"><col width="50%"></colgroup><tbody valign="top"><tr><td><h3>High-level Operations</h3>
+
+High-level cluster operations consist primarily of starting, stopping, and
+restarting a cluster with the ``ceph`` service; checking the cluster's health;
+and monitoring an operating cluster.
+
+.. toctree::
+ :maxdepth: 1
+
+ operating
+ health-checks
+ monitoring
+ monitoring-osd-pg
+ user-management
+ pg-repair
+
+.. raw:: html
+
+ </td><td><h3>Data Placement</h3>
+
+Once you have your cluster up and running, you may begin working with data
+placement. Ceph supports petabyte-scale data storage clusters, with storage
+pools and placement groups that distribute data across the cluster using Ceph's
+CRUSH algorithm.
+
+.. toctree::
+ :maxdepth: 1
+
+ data-placement
+ pools
+ erasure-code
+ cache-tiering
+ placement-groups
+ upmap
+ read-balancer
+ balancer
+ crush-map
+ crush-map-edits
+ stretch-mode
+ change-mon-elections
+
+
+
+.. raw:: html
+
+ </td></tr><tr><td><h3>Low-level Operations</h3>
+
+Low-level cluster operations consist of starting, stopping, and restarting a
+particular daemon within a cluster; changing the settings of a particular
+daemon or subsystem; and adding a daemon to the cluster or removing a daemon
+from the cluster. The most common use cases for low-level operations include
+growing or shrinking the Ceph cluster and replacing legacy or failed hardware
+with new hardware.
+
+.. toctree::
+ :maxdepth: 1
+
+ add-or-rm-osds
+ add-or-rm-mons
+ devices
+ bluestore-migration
+ Command Reference <control>
+
+
+
+.. raw:: html
+
+ </td><td><h3>Troubleshooting</h3>
+
+Ceph is still on the leading edge, so you may encounter situations that require
+you to evaluate your Ceph configuration and modify your logging and debugging
+settings to identify and remedy issues you are encountering with your cluster.
+
+.. toctree::
+ :maxdepth: 1
+
+ ../troubleshooting/community
+ ../troubleshooting/troubleshooting-mon
+ ../troubleshooting/troubleshooting-osd
+ ../troubleshooting/troubleshooting-pg
+ ../troubleshooting/log-and-debug
+ ../troubleshooting/cpu-profiling
+ ../troubleshooting/memory-profiling
+
+
+
+
+.. raw:: html
+
+ </td></tr></tbody></table>
+
diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst
new file mode 100644
index 000000000..b0a6767a1
--- /dev/null
+++ b/doc/rados/operations/monitoring-osd-pg.rst
@@ -0,0 +1,556 @@
+=========================
+ Monitoring OSDs and PGs
+=========================
+
+High availability and high reliability require a fault-tolerant approach to
+managing hardware and software issues. Ceph has no single point of failure and
+it can service requests for data even when in a "degraded" mode. Ceph's `data
+placement`_ introduces a layer of indirection to ensure that data doesn't bind
+directly to specific OSDs. For this reason, tracking system faults
+requires finding the `placement group`_ (PG) and the underlying OSDs at the
+root of the problem.
+
+.. tip:: A fault in one part of the cluster might prevent you from accessing a
+ particular object, but that doesn't mean that you are prevented from
+ accessing other objects. When you run into a fault, don't panic. Just
+ follow the steps for monitoring your OSDs and placement groups, and then
+ begin troubleshooting.
+
+Ceph is self-repairing. However, when problems persist, monitoring OSDs and
+placement groups will help you identify the problem.
+
+
+Monitoring OSDs
+===============
+
+An OSD is either *in* service (``in``) or *out* of service (``out``). An OSD is
+either running and reachable (``up``), or it is not running and not reachable
+(``down``).
+
+If an OSD is ``up``, it may be either ``in`` service (clients can read and
+write data) or ``out`` of service. If the OSD was ``in`` but then due to
+a failure or a manual action was set to the ``out`` state, Ceph will migrate
+placement groups to other OSDs to maintain the configured redundancy.
+
+If an OSD is ``out`` of service, CRUSH will not assign placement groups to it.
+If an OSD is ``down``, it will normally also be marked ``out`` after the
+interval set by ``mon_osd_down_out_interval`` has elapsed.
+
+.. note:: If an OSD is ``down`` and ``in``, there is a problem: the cluster
+   is not in a healthy state.
+
+.. ditaa::
+
+ +----------------+ +----------------+
+ | | | |
+ | OSD #n In | | OSD #n Up |
+ | | | |
+ +----------------+ +----------------+
+ ^ ^
+ | |
+ | |
+ v v
+ +----------------+ +----------------+
+ | | | |
+ | OSD #n Out | | OSD #n Down |
+ | | | |
+ +----------------+ +----------------+
+
+If you run the commands ``ceph health``, ``ceph -s``, or ``ceph -w``,
+you might notice that the cluster does not always show ``HEALTH_OK``. Don't
+panic. There are certain circumstances in which it is expected and normal that
+the cluster will **NOT** show ``HEALTH_OK``:
+
+#. You haven't started the cluster yet.
+#. You have just started or restarted the cluster and it's not ready to show
+ health statuses yet, because the PGs are in the process of being created and
+ the OSDs are in the process of peering.
+#. You have just added or removed an OSD.
+#. You have just modified your cluster map.
+
+Checking to see if OSDs are ``up`` and running is an important aspect of monitoring them:
+whenever the cluster is up and running, every OSD that is ``in`` the cluster should also
+be ``up`` and running. To see if all of the cluster's OSDs are running, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph osd stat
+
+The output provides the following information: the total number of OSDs (x),
+how many OSDs are ``up`` (y), how many OSDs are ``in`` (z), and the map epoch (eNNNN). ::
+
+ x osds: y up, z in; epoch: eNNNN
+
+If the number of OSDs that are ``in`` the cluster is greater than the number of
+OSDs that are ``up``, run the following command to identify the ``ceph-osd``
+daemons that are not running:
+
+.. prompt:: bash $
+
+ ceph osd tree
+
+::
+
+ #ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -1 2.00000 pool openstack
+ -3 2.00000 rack dell-2950-rack-A
+ -2 2.00000 host dell-2950-A1
+ 0 ssd 1.00000 osd.0 up 1.00000 1.00000
+ 1 ssd 1.00000 osd.1 down 1.00000 1.00000
+
+.. tip:: Searching through a well-designed CRUSH hierarchy to identify the physical
+ locations of particular OSDs might help you troubleshoot your cluster.
+
+If an OSD is ``down``, start it by running the following command:
+
+.. prompt:: bash $
+
+ sudo systemctl start ceph-osd@1
+
+For problems associated with OSDs that have stopped or won't restart, see `OSD Not Running`_.
+
+
+PG Sets
+=======
+
+When CRUSH assigns a PG to OSDs, it takes note of how many replicas of the PG
+are required by the pool and then assigns each replica to a different OSD.
+For example, if the pool requires three replicas of a PG, CRUSH might assign
+them individually to ``osd.1``, ``osd.2`` and ``osd.3``. CRUSH seeks a
+pseudo-random placement that takes into account the failure domains that you
+have set in your `CRUSH map`_; for this reason, PGs are rarely assigned to
+immediately adjacent OSDs in a large cluster.
+
+Ceph processes client requests with the **Acting Set** of OSDs: this is the set
+of OSDs that currently have a full and working version of a PG shard and that
+are therefore responsible for handling requests. By contrast, the **Up Set** is
+the set of OSDs that contain a shard of a specific PG. Data is moved or
+copied (or is planned to be moved or copied) to the **Up Set**. See
+:ref:`Placement Group Concepts <rados_operations_pg_concepts>`.
+
+Sometimes an OSD in the Acting Set is ``down`` or otherwise unable to
+service requests for objects in the PG. When this kind of situation
+arises, don't panic. Common examples of such a situation include:
+
+- You added or removed an OSD, CRUSH reassigned the PG to
+ other OSDs, and this reassignment changed the composition of the Acting Set and triggered
+ the migration of data by means of a "backfill" process.
+- An OSD was ``down``, was restarted, and is now ``recovering``.
+- An OSD in the Acting Set is ``down`` or unable to service requests,
+ and another OSD has temporarily assumed its duties.
+
+Typically, the Up Set and the Acting Set are identical. When they are not, it
+might indicate that Ceph is migrating the PG (in other words, that the PG has
+been remapped), that an OSD is recovering, or that there is a problem with the
+cluster (in such scenarios, Ceph usually shows a ``HEALTH_WARN`` state with a
+"stuck stale" message).
+
+To retrieve a list of PGs, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg dump
+
+To see which OSDs are within the Acting Set and the Up Set for a specific PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg map {pg-num}
+
+The output provides the following information: the osdmap epoch (eNNN), the PG number
+({pg-num}), the OSDs in the Up Set (up[]), and the OSDs in the Acting Set
+(acting[])::
+
+ osdmap eNNN pg {raw-pg-num} ({pg-num}) -> up [0,1,2] acting [0,1,2]
+
+.. note:: If the Up Set and the Acting Set do not match, this might indicate
+ that the cluster is rebalancing itself or that there is a problem with
+ the cluster.
+
+
+Peering
+=======
+
+Before you can write data to a PG, it must be in an ``active`` state and it
+will preferably be in a ``clean`` state. For Ceph to determine the current
+state of a PG, peering must take place. That is, the primary OSD of the PG
+(that is, the first OSD in the Acting Set) must peer with the secondary and
+tertiary OSDs so that consensus on the current state of the PG can be
+established. In
+the following diagram, we assume a pool with three replicas of the PG:
+
+.. ditaa::
+
+ +---------+ +---------+ +-------+
+ | OSD 1 | | OSD 2 | | OSD 3 |
+ +---------+ +---------+ +-------+
+ | | |
+ | Request To | |
+ | Peer | |
+ |-------------->| |
+ |<--------------| |
+ | Peering |
+ | |
+ | Request To |
+ | Peer |
+ |----------------------------->|
+ |<-----------------------------|
+ | Peering |
+
+The OSDs also report their status to the monitor. For details, see `Configuring Monitor/OSD
+Interaction`_. To troubleshoot peering issues, see `Peering
+Failure`_.
+
+
+Monitoring PG States
+====================
+
+If you run the commands ``ceph health``, ``ceph -s``, or ``ceph -w``,
+you might notice that the cluster does not always show ``HEALTH_OK``. After
+first checking to see if the OSDs are running, you should also check PG
+states. There are certain PG-peering-related circumstances in which it is
+expected and normal that the cluster will **NOT** show ``HEALTH_OK``:
+
+#. You have just created a pool and the PGs haven't peered yet.
+#. The PGs are recovering.
+#. You have just added an OSD to or removed an OSD from the cluster.
+#. You have just modified your CRUSH map and your PGs are migrating.
+#. There is inconsistent data in different replicas of a PG.
+#. Ceph is scrubbing a PG's replicas.
+#. Ceph doesn't have enough storage capacity to complete backfilling operations.
+
+If one of these circumstances causes Ceph to show ``HEALTH_WARN``, don't
+panic. In many cases, the cluster will recover on its own. In some cases, however, you
+might need to take action. An important aspect of monitoring PGs is to check their
+status as ``active`` and ``clean``: that is, it is important to ensure that, when the
+cluster is up and running, all PGs are ``active`` and (preferably) ``clean``.
+To see the status of every PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg stat
+
+The output provides the following information: the total number of PGs (x), how many
+PGs are in a particular state such as ``active+clean`` (y), and the
+amount of data stored (z). ::
+
+ x pgs: y active+clean; z bytes data, aa MB used, bb GB / cc GB avail
+
+.. note:: It is common for Ceph to report multiple states for PGs (for example,
+   ``active+clean``, ``active+clean+remapped``, and ``active+clean+scrubbing``).
+
+Here Ceph shows not only the PG states, but also the amount of storage
+capacity used (aa), the amount of storage capacity remaining (bb), and the
+total storage capacity of the cluster (cc). These values can be important in
+a few cases:
+
+- The cluster is reaching its ``near full ratio`` or ``full ratio``.
+- Data is not being distributed across the cluster due to an error in the
+ CRUSH configuration.
+
+
+.. topic:: Placement Group IDs
+
+ PG IDs consist of the pool number (not the pool name) followed by a period
+   (.) and a hexadecimal number. You can view pool numbers and their names in
+   the output of ``ceph osd lspools``. For example, the first pool that was
+ created corresponds to pool number ``1``. A fully qualified PG ID has the
+ following form::
+
+ {pool-num}.{pg-id}
+
+ It typically resembles the following::
+
+ 1.1701b
+
+
+To retrieve a list of PGs, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg dump
+
+To format the output in JSON format and save it to a file, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg dump -o {filename} --format=json
+
+To query a specific PG, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg {poolnum}.{pg-id} query
+
+Ceph will output the query in JSON format.
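+
+For example, to query the hypothetical PG shown above, run:
+
+.. prompt:: bash $
+
+   ceph pg 1.1701b query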
+
+The following subsections describe the most common PG states in detail.
+
+
+Creating
+--------
+
+PGs are created when you create a pool: the command that creates a pool
+specifies the total number of PGs for that pool, and when the pool is created
+all of those PGs are created as well. Ceph will echo ``creating`` while it is
+creating PGs. After the PG(s) are created, the OSDs that are part of a PG's
+Acting Set will peer. Once peering is complete, the PG status should be
+``active+clean``. This status means that Ceph clients can begin writing to
+the PG.
+
+.. ditaa::
+
+ /-----------\ /-----------\ /-----------\
+ | Creating |------>| Peering |------>| Active |
+ \-----------/ \-----------/ \-----------/
+
+Peering
+-------
+
+When a PG peers, the OSDs that store the replicas of its data converge on an
+agreed state of the data and metadata within that PG. When peering is complete,
+those OSDs agree about the state of that PG. However, completion of the peering
+process does **NOT** mean that each replica has the latest contents.
+
+.. topic:: Authoritative History
+
+ Ceph will **NOT** acknowledge a write operation to a client until that write
+ operation is persisted by every OSD in the Acting Set. This practice ensures
+ that at least one member of the Acting Set will have a record of every
+ acknowledged write operation since the last successful peering operation.
+
+ Given an accurate record of each acknowledged write operation, Ceph can
+ construct a new authoritative history of the PG--that is, a complete and
+ fully ordered set of operations that, if performed, would bring an OSD’s
+ copy of the PG up to date.
+
+
+Active
+------
+
+After Ceph has completed the peering process, a PG should become ``active``.
+The ``active`` state means that the data in the PG is generally available for
+read and write operations in the primary and replica OSDs.
+
+
+Clean
+-----
+
+When a PG is in the ``clean`` state, all OSDs holding its data and metadata
+have successfully peered and there are no stray replicas. Ceph has replicated
+all objects in the PG the correct number of times.
+
+
+Degraded
+--------
+
+When a client writes an object to the primary OSD, the primary OSD is
+responsible for writing the replicas to the replica OSDs. After the primary OSD
+writes the object to storage, the PG will remain in a ``degraded``
+state until the primary OSD has received an acknowledgement from the replica
+OSDs that Ceph created the replica objects successfully.
+
+The reason that a PG can be ``active+degraded`` is that an OSD can be
+``active`` even if it doesn't yet hold all of the PG's objects. If an OSD goes
+``down``, Ceph marks each PG assigned to the OSD as ``degraded``. The PGs must
+peer again when the OSD comes back online. However, a client can still write a
+new object to a ``degraded`` PG if it is ``active``.
+
+If an OSD is ``down`` and the ``degraded`` condition persists, Ceph might mark the
+``down`` OSD as ``out`` of the cluster and remap the data from the ``down`` OSD
+to another OSD. The time between being marked ``down`` and being marked ``out``
+is determined by ``mon_osd_down_out_interval``, which is set to ``600`` seconds
+by default.
+
+A PG can also be in the ``degraded`` state because there are one or more
+objects that Ceph expects to find in the PG but that Ceph cannot find. Although
+you cannot read or write to unfound objects, you can still access all of the other
+objects in the ``degraded`` PG.
+
+
+Recovering
+----------
+
+Ceph was designed for fault-tolerance, because hardware and other server
+problems are expected or even routine. When an OSD goes ``down``, its contents
+might fall behind the current state of other replicas in the PGs. When the OSD
+has returned to the ``up`` state, the contents of the PGs must be updated to
+reflect that current state. During that time period, the OSD might be in a
+``recovering`` state.
+
+Recovery is not always trivial, because a hardware failure might cause a
+cascading failure of multiple OSDs. For example, a network switch for a rack or
+cabinet might fail, which can cause the OSDs of a number of host machines to
+fall behind the current state of the cluster. In such a scenario, general
+recovery is possible only if each of the OSDs recovers after the fault has been
+resolved.
+
+Ceph provides a number of settings that determine how the cluster balances the
+resource contention between the need to process new service requests and the
+need to recover data objects and restore the PGs to the current state. The
+``osd_recovery_delay_start`` setting allows an OSD to restart, re-peer, and
+even process some replay requests before starting the recovery process. The
+``osd_recovery_thread_timeout`` setting determines the duration of a thread
+timeout, because multiple OSDs might fail, restart, and re-peer at staggered
+rates. The ``osd_recovery_max_active`` setting limits the number of recovery
+requests an OSD can entertain simultaneously, in order to prevent the OSD
+from failing to serve requests. The ``osd_recovery_max_chunk`` setting limits
+the size of
+the recovered data chunks, in order to prevent network congestion.
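+
+As a sketch of how these settings can be adjusted at runtime (the values
+shown are illustrative, not recommendations):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_recovery_max_active 3
+   ceph config set osd osd_recovery_max_chunk 8388608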
+
+
+Back Filling
+------------
+
+When a new OSD joins the cluster, CRUSH will reassign PGs from OSDs that are
+already in the cluster to the newly added OSD. It can put excessive load on the
+new OSD to force it to immediately accept the reassigned PGs. Back filling the
+OSD with the PGs allows this process to begin in the background. After the
+backfill operations have completed, the new OSD will begin serving requests as
+soon as it is ready.
+
+During the backfill operations, you might see one of several states:
+``backfill_wait`` indicates that a backfill operation is pending, but is not
+yet underway; ``backfilling`` indicates that a backfill operation is currently
+underway; and ``backfill_toofull`` indicates that a backfill operation was
+requested but couldn't be completed due to insufficient storage capacity. When
+a PG cannot be backfilled, it might be considered ``incomplete``.
+
+The ``backfill_toofull`` state might be transient. It might happen that, as PGs
+are moved around, space becomes available. The ``backfill_toofull`` state is
+similar to ``backfill_wait`` in that backfill operations can proceed as soon as
+conditions change.
+
+Ceph provides a number of settings to manage the load spike associated with the
+reassignment of PGs to an OSD (especially a new OSD). The ``osd_max_backfills``
+setting specifies the maximum number of concurrent backfills to and from an OSD
+(default: 1). The ``backfill_full_ratio`` setting allows an OSD to refuse a
+backfill request if the OSD is approaching its full ratio (default: 90%). This
+setting can be changed with the ``ceph osd set-backfillfull-ratio`` command. If
+an OSD refuses a backfill request, the ``osd_backfill_retry_interval`` setting
+allows an OSD to retry the request after a certain interval (default: 30
+seconds). OSDs can also set ``osd_backfill_scan_min`` and
+``osd_backfill_scan_max`` in order to manage scan intervals (default: 64 and
+512, respectively).
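+
+For example, to allow two concurrent backfills per OSD and to lower the
+backfill-full threshold to 85% (illustrative values only):
+
+.. prompt:: bash $
+
+   ceph config set osd osd_max_backfills 2
+   ceph osd set-backfillfull-ratio 0.85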
+
+
+Remapped
+--------
+
+When the Acting Set that services a PG changes, the data migrates from the old
+Acting Set to the new Acting Set. Because it might take time for the new
+primary OSD to begin servicing requests, the old primary OSD might be required
+to continue servicing requests until the PG data migration is complete. After
+data migration has completed, the mapping uses the primary OSD of the new
+Acting Set.
+
+
+Stale
+-----
+
+Although Ceph uses heartbeats in order to ensure that hosts and daemons are
+running, the ``ceph-osd`` daemons might enter a ``stuck`` state where they are
+not reporting statistics in a timely manner (for example, there might be a
+temporary network fault). By default, OSD daemons report their PG, ``up_thru``,
+boot, and failure statistics every half second (that is, in accordance with a
+value of ``0.5``), which is more frequent than the reports defined by the
+heartbeat thresholds. If the primary OSD of a PG's Acting Set fails to report
+to the monitor or if other OSDs have reported the primary OSD ``down``, the
+monitors will mark the PG ``stale``.
+
+When you start your cluster, it is common to see the ``stale`` state until the
+peering process completes. After your cluster has been running for a while,
+however, seeing PGs in the ``stale`` state indicates that the primary OSD for
+those PGs is ``down`` or not reporting PG statistics to the monitor.
+
+
+Identifying Troubled PGs
+========================
+
+As previously noted, a PG is not necessarily having problems just because its
+state is not ``active+clean``. When PGs are stuck, this might indicate that
+Ceph cannot perform self-repairs. The stuck states include:
+
+- **Unclean**: PGs contain objects that have not been replicated the desired
+ number of times. Under normal conditions, it can be assumed that these PGs
+ are recovering.
+- **Inactive**: PGs cannot process reads or writes because they are waiting for
+ an OSD that has the most up-to-date data to come back ``up``.
+- **Stale**: PGs are in an unknown state, because the OSDs that host them have
+ not reported to the monitor cluster for a certain period of time (determined
+ by ``mon_osd_report_timeout``).
+
+To identify stuck PGs, run the following command:
+
+.. prompt:: bash $
+
+ ceph pg dump_stuck [unclean|inactive|stale|undersized|degraded]
+
+For more detail, see `Placement Group Subsystem`_. To troubleshoot stuck PGs,
+see `Troubleshooting PG Errors`_.
+
+
+Finding an Object Location
+==========================
+
+To store object data in the Ceph Object Store, a Ceph client must:
+
+#. Set an object name
+#. Specify a `pool`_
+
+The Ceph client retrieves the latest cluster map, the CRUSH algorithm
+calculates how to map the object to a PG, and then the algorithm calculates how
+to dynamically assign the PG to an OSD. To find the object location given only
+the object name and the pool name, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd map {poolname} {object-name} [namespace]
+
+.. topic:: Exercise: Locate an Object
+
+ As an exercise, let's create an object. We can specify an object name, a path
+ to a test file that contains some object data, and a pool name by using the
+ ``rados put`` command on the command line. For example:
+
+ .. prompt:: bash $
+
+ rados put {object-name} {file-path} --pool=data
+ rados put test-object-1 testfile.txt --pool=data
+
+ To verify that the Ceph Object Store stored the object, run the
+ following command:
+
+ .. prompt:: bash $
+
+ rados -p data ls
+
+ To identify the object location, run the following commands:
+
+ .. prompt:: bash $
+
+ ceph osd map {pool-name} {object-name}
+ ceph osd map data test-object-1
+
+ Ceph should output the object's location. For example::
+
+ osdmap e537 pool 'data' (1) object 'test-object-1' -> pg 1.d1743484 (1.4) -> up ([0,1], p0) acting ([0,1], p0)
+
+ To remove the test object, simply delete it by running the ``rados rm``
+ command. For example:
+
+ .. prompt:: bash $
+
+ rados rm test-object-1 --pool=data
+
+As the cluster evolves, the object location may change dynamically. One benefit
+of Ceph's dynamic rebalancing is that Ceph spares you the burden of manually
+performing the migration. For details, see the `Architecture`_ section.
+
+.. _data placement: ../data-placement
+.. _pool: ../pools
+.. _placement group: ../placement-groups
+.. _Architecture: ../../../architecture
+.. _OSD Not Running: ../../troubleshooting/troubleshooting-osd#osd-not-running
+.. _Troubleshooting PG Errors: ../../troubleshooting/troubleshooting-pg#troubleshooting-pg-errors
+.. _Peering Failure: ../../troubleshooting/troubleshooting-pg#failures-osd-peering
+.. _CRUSH map: ../crush-map
+.. _Configuring Monitor/OSD Interaction: ../../configuration/mon-osd-interaction/
+.. _Placement Group Subsystem: ../control#placement-group-subsystem
diff --git a/doc/rados/operations/monitoring.rst b/doc/rados/operations/monitoring.rst
new file mode 100644
index 000000000..a9171f2d8
--- /dev/null
+++ b/doc/rados/operations/monitoring.rst
@@ -0,0 +1,644 @@
+======================
+ Monitoring a Cluster
+======================
+
+After you have a running cluster, you can use the ``ceph`` tool to monitor your
+cluster. Monitoring a cluster typically involves checking OSD status, monitor
+status, placement group status, and metadata server status.
+
+Using the command line
+======================
+
+Interactive mode
+----------------
+
+To run the ``ceph`` tool in interactive mode, type ``ceph`` at the command line
+with no arguments. For example:
+
+.. prompt:: bash $
+
+ ceph
+
+.. prompt:: ceph>
+ :prompts: ceph>
+
+ health
+ status
+ quorum_status
+ mon stat
+
+Non-default paths
+-----------------
+
+If you specified non-default locations for your configuration or keyring when
+you installed the cluster, you may specify their locations to the ``ceph`` tool
+by running the following command:
+
+.. prompt:: bash $
+
+ ceph -c /path/to/conf -k /path/to/keyring health
+
+Checking a Cluster's Status
+===========================
+
+After you start your cluster, and before you start reading and/or writing data,
+you should check your cluster's status.
+
+To check a cluster's status, run the following command:
+
+.. prompt:: bash $
+
+ ceph status
+
+Alternatively, you can run the following command:
+
+.. prompt:: bash $
+
+ ceph -s
+
+In interactive mode, this operation is performed by typing ``status`` and
+pressing **Enter**:
+
+.. prompt:: ceph>
+ :prompts: ceph>
+
+ status
+
+Ceph will print the cluster status. For example, a tiny Ceph "demonstration
+cluster" that runs monitor, manager, OSD, and metadata server daemons might
+print the following:
+
+::
+
+ cluster:
+ id: 477e46f1-ae41-4e43-9c8f-72c918ab0a20
+ health: HEALTH_OK
+
+ services:
+ mon: 3 daemons, quorum a,b,c
+ mgr: x(active)
+ mds: cephfs_a-1/1/1 up {0=a=up:active}, 2 up:standby
+ osd: 3 osds: 3 up, 3 in
+
+ data:
+ pools: 2 pools, 16 pgs
+ objects: 21 objects, 2.19K
+ usage: 546 GB used, 384 GB / 931 GB avail
+ pgs: 16 active+clean
+
+
+How Ceph Calculates Data Usage
+------------------------------
+
+The ``usage`` value reflects the *actual* amount of raw storage used. The ``xxx
+GB / xxx GB`` value shows the amount available (the lesser number) out of the
+overall storage capacity of the cluster. The notional number reflects the size
+of the stored data before it is replicated, cloned or snapshotted. Therefore,
+the amount of data actually stored typically exceeds the notional amount
+stored, because Ceph creates replicas of the data and may also use storage
+capacity for cloning and snapshotting.
+
+
+Watching a Cluster
+==================
+
+Each daemon in the Ceph cluster maintains a log of events, and the Ceph cluster
+itself maintains a *cluster log* that records high-level events about the
+entire Ceph cluster. These events are logged to disk on monitor servers (in
+the default location ``/var/log/ceph/ceph.log``), and they can be monitored via
+the command line.
+
+To follow the cluster log, run the following command:
+
+.. prompt:: bash $
+
+ ceph -w
+
+Ceph will print the status of the system, followed by each log message as it is
+added. For example:
+
+::
+
+ cluster:
+ id: 477e46f1-ae41-4e43-9c8f-72c918ab0a20
+ health: HEALTH_OK
+
+ services:
+ mon: 3 daemons, quorum a,b,c
+ mgr: x(active)
+ mds: cephfs_a-1/1/1 up {0=a=up:active}, 2 up:standby
+ osd: 3 osds: 3 up, 3 in
+
+ data:
+ pools: 2 pools, 16 pgs
+ objects: 21 objects, 2.19K
+ usage: 546 GB used, 384 GB / 931 GB avail
+ pgs: 16 active+clean
+
+
+ 2017-07-24 08:15:11.329298 mon.a mon.0 172.21.9.34:6789/0 23 : cluster [INF] osd.0 172.21.9.34:6806/20527 boot
+ 2017-07-24 08:15:14.258143 mon.a mon.0 172.21.9.34:6789/0 39 : cluster [INF] Activating manager daemon x
+ 2017-07-24 08:15:15.446025 mon.a mon.0 172.21.9.34:6789/0 47 : cluster [INF] Manager daemon x is now available
+
+Instead of printing log lines as they are added, you might want to print only
+the most recent lines. Run ``ceph log last [n]`` to see the most recent ``n``
+lines from the cluster log.
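+
+For example, to print the ten most recent cluster log lines, run:
+
+.. prompt:: bash $
+
+   ceph log last 10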
+
+Monitoring Health Checks
+========================
+
+Ceph continuously runs various *health checks*. When a health check fails,
+this failure is reflected in the output of ``ceph status`` and
+``ceph health``. The cluster log receives messages that indicate when a
+check has failed and when the cluster has recovered.
+
+For example, when an OSD goes down, the ``health`` section of the status
+output is updated as follows:
+
+::
+
+ health: HEALTH_WARN
+ 1 osds down
+ Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded
+
+At the same time, cluster log messages are emitted to record the failure of the
+health checks:
+
+::
+
+ 2017-07-25 10:08:58.265945 mon.a mon.0 172.21.9.34:6789/0 91 : cluster [WRN] Health check failed: 1 osds down (OSD_DOWN)
+ 2017-07-25 10:09:01.302624 mon.a mon.0 172.21.9.34:6789/0 94 : cluster [WRN] Health check failed: Degraded data redundancy: 21/63 objects degraded (33.333%), 16 pgs unclean, 16 pgs degraded (PG_DEGRADED)
+
+When the OSD comes back online, the cluster log records the cluster's return
+to a healthy state:
+
+::
+
+ 2017-07-25 10:11:11.526841 mon.a mon.0 172.21.9.34:6789/0 109 : cluster [WRN] Health check update: Degraded data redundancy: 2 pgs unclean, 2 pgs degraded, 2 pgs undersized (PG_DEGRADED)
+ 2017-07-25 10:11:13.535493 mon.a mon.0 172.21.9.34:6789/0 110 : cluster [INF] Health check cleared: PG_DEGRADED (was: Degraded data redundancy: 2 pgs unclean, 2 pgs degraded, 2 pgs undersized)
+ 2017-07-25 10:11:13.535577 mon.a mon.0 172.21.9.34:6789/0 111 : cluster [INF] Cluster is now healthy
+
+Network Performance Checks
+--------------------------
+
+Ceph OSDs send heartbeat ping messages to each other in order to monitor daemon
+availability and network performance. If a single delayed response is detected,
+this might indicate nothing more than a busy OSD. But if multiple delays
+between distinct pairs of OSDs are detected, this might indicate a failed
+network switch, a NIC failure, or a layer 1 failure.
+
+By default, a heartbeat time that exceeds 1 second (1000 milliseconds) raises a
+health check (a ``HEALTH_WARN``). For example:
+
+::
+
+ HEALTH_WARN Slow OSD heartbeats on back (longest 1118.001ms)
+
+In the output of the ``ceph health detail`` command, you can see which OSDs are
+experiencing delays and how long the delays are. The output of ``ceph health
+detail`` is limited to ten lines. Here is an example of the output you can
+expect from the ``ceph health detail`` command::
+
+ [WRN] OSD_SLOW_PING_TIME_BACK: Slow OSD heartbeats on back (longest 1118.001ms)
+ Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.1 [dc1,rack1] 1118.001 msec possibly improving
+ Slow OSD heartbeats on back from osd.0 [dc1,rack1] to osd.2 [dc1,rack2] 1030.123 msec
+ Slow OSD heartbeats on back from osd.2 [dc1,rack2] to osd.1 [dc1,rack1] 1015.321 msec
+ Slow OSD heartbeats on back from osd.1 [dc1,rack1] to osd.0 [dc1,rack1] 1010.456 msec
+
+To see more detail and to collect a complete dump of network performance
+information, use the ``dump_osd_network`` command. This command is usually sent
+to a Ceph Manager Daemon, but it can be used to collect information about a
+specific OSD's interactions by sending it to that OSD. The default threshold
+for a slow heartbeat is 1 second (1000 milliseconds), but this can be
+overridden by providing a number of milliseconds as an argument.
+
+To show all network performance data with a specified threshold of 0, send the
+following command to the mgr:
+
+.. prompt:: bash $
+
+ ceph daemon /var/run/ceph/ceph-mgr.x.asok dump_osd_network 0
+
+::
+
+ {
+ "threshold": 0,
+ "entries": [
+ {
+ "last update": "Wed Sep 4 17:04:49 2019",
+ "stale": false,
+ "from osd": 2,
+ "to osd": 0,
+ "interface": "front",
+ "average": {
+ "1min": 1.023,
+ "5min": 0.860,
+ "15min": 0.883
+ },
+ "min": {
+ "1min": 0.818,
+ "5min": 0.607,
+ "15min": 0.607
+ },
+ "max": {
+ "1min": 1.164,
+ "5min": 1.173,
+ "15min": 1.544
+ },
+ "last": 0.924
+ },
+ {
+ "last update": "Wed Sep 4 17:04:49 2019",
+ "stale": false,
+ "from osd": 2,
+ "to osd": 0,
+ "interface": "back",
+ "average": {
+ "1min": 0.968,
+ "5min": 0.897,
+ "15min": 0.830
+ },
+ "min": {
+ "1min": 0.860,
+ "5min": 0.563,
+ "15min": 0.502
+ },
+ "max": {
+ "1min": 1.171,
+ "5min": 1.216,
+ "15min": 1.456
+ },
+ "last": 0.845
+ },
+ {
+ "last update": "Wed Sep 4 17:04:48 2019",
+ "stale": false,
+ "from osd": 0,
+ "to osd": 1,
+ "interface": "front",
+ "average": {
+ "1min": 0.965,
+ "5min": 0.811,
+ "15min": 0.850
+ },
+ "min": {
+ "1min": 0.650,
+ "5min": 0.488,
+ "15min": 0.466
+ },
+ "max": {
+ "1min": 1.252,
+ "5min": 1.252,
+ "15min": 1.362
+ },
+ "last": 0.791
+ },
+ ...
+
+
+
+Muting Health Checks
+--------------------
+
+Health checks can be muted so that they have no effect on the overall
+reported status of the cluster. For example, if the cluster has raised a
+single health check and you then mute that health check, the cluster will
+report a status of ``HEALTH_OK``. To mute a specific health check, use the
+health check code that corresponds to that health check (see
+:ref:`health-checks`) and run the following command:
+
+.. prompt:: bash $
+
+ ceph health mute <code>
+
+For example, to mute an ``OSD_DOWN`` health check, run the following command:
+
+.. prompt:: bash $
+
+ ceph health mute OSD_DOWN
+
+Mutes are reported in both the short and long forms of the ``ceph health``
+command's output. For example, in the above scenario, the cluster would
+report:
+
+.. prompt:: bash $
+
+ ceph health
+
+::
+
+ HEALTH_OK (muted: OSD_DOWN)
+
+.. prompt:: bash $
+
+ ceph health detail
+
+::
+
+ HEALTH_OK (muted: OSD_DOWN)
+ (MUTED) OSD_DOWN 1 osds down
+ osd.1 is down
+
+A mute can be removed by running the following command:
+
+.. prompt:: bash $
+
+ ceph health unmute <code>
+
+For example:
+
+.. prompt:: bash $
+
+ ceph health unmute OSD_DOWN
+
+A "health mute" can have a TTL (**T**\ime **T**\o **L**\ive)
+associated with it: this means that the mute will automatically expire
+after a specified period of time. The TTL is specified as an optional
+duration argument, as seen in the following examples:
+
+.. prompt:: bash $
+
+ ceph health mute OSD_DOWN 4h # mute for 4 hours
+ ceph health mute MON_DOWN 15m # mute for 15 minutes
+
+Normally, if a muted health check is resolved (for example, if the OSD that
+raised the ``OSD_DOWN`` health check in the example above has come back up),
+the mute goes away. If the health check comes back later, it will be reported
+in the usual way.
+
+It is possible to make a health mute "sticky": this means that the mute will
+remain even if the health check clears. For example, to make a health mute
+"sticky", you might run the following command:
+
+.. prompt:: bash $
+
+ ceph health mute OSD_DOWN 1h --sticky # ignore any/all down OSDs for next hour
+
+Most health mutes disappear if the unhealthy condition that triggered the
+health check gets worse. For example, suppose that there is one OSD down and
+the health check is muted. In that case, if one or more additional OSDs go
+down, then the health mute disappears. This behavior occurs in any health
+check with a threshold value.
+
+
+Checking a Cluster's Usage Stats
+================================
+
+To check a cluster's data usage and data distribution among pools, use the
+``df`` command, which is similar to Linux's ``df`` command. Run the
+following command:
+
+.. prompt:: bash $
+
+ ceph df
+
+The output of ``ceph df`` resembles the following::
+
+ CLASS SIZE AVAIL USED RAW USED %RAW USED
+ ssd 202 GiB 200 GiB 2.0 GiB 2.0 GiB 1.00
+ TOTAL 202 GiB 200 GiB 2.0 GiB 2.0 GiB 1.00
+
+ --- POOLS ---
+ POOL ID PGS STORED (DATA) (OMAP) OBJECTS USED (DATA) (OMAP) %USED MAX AVAIL QUOTA OBJECTS QUOTA BYTES DIRTY USED COMPR UNDER COMPR
+ device_health_metrics 1 1 242 KiB 15 KiB 227 KiB 4 251 KiB 24 KiB 227 KiB 0 297 GiB N/A N/A 4 0 B 0 B
+ cephfs.a.meta 2 32 6.8 KiB 6.8 KiB 0 B 22 96 KiB 96 KiB 0 B 0 297 GiB N/A N/A 22 0 B 0 B
+ cephfs.a.data 3 32 0 B 0 B 0 B 0 0 B 0 B 0 B 0 99 GiB N/A N/A 0 0 B 0 B
+ test 4 32 22 MiB 22 MiB 50 KiB 248 19 MiB 19 MiB 50 KiB 0 297 GiB N/A N/A 248 0 B 0 B
+
+- **CLASS:** For example, "ssd" or "hdd".
+- **SIZE:** The amount of storage capacity managed by the cluster.
+- **AVAIL:** The amount of free space available in the cluster.
+- **USED:** The amount of raw storage consumed by user data (excluding
+ BlueStore's database).
+- **RAW USED:** The amount of raw storage consumed by user data, internal
+ overhead, and reserved capacity.
+- **%RAW USED:** The percentage of raw storage used. Watch this number in
+ conjunction with ``full ratio`` and ``near full ratio`` to be forewarned when
+ your cluster approaches the fullness thresholds. See `Storage Capacity`_.
+
+
+**POOLS:**
+
+The POOLS section of the output provides a list of pools and the *notional*
+usage of each pool. This section of the output **DOES NOT** reflect replicas,
+clones, or snapshots. For example, if you store an object with 1MB of data,
+then the notional usage will be 1MB, but the actual usage might be 2MB or more
+depending on the number of replicas, clones, and snapshots.
+
+- **ID:** The pool's unique numeric identifier.
+- **STORED:** The actual amount of data that the user has stored in a pool.
+ This is similar to the USED column in earlier versions of Ceph, but the
+ calculations (for BlueStore!) are more precise (in that gaps are properly
+ handled).
+
+ - **(DATA):** Usage for RBD (RADOS Block Device), CephFS file data, and RGW
+ (RADOS Gateway) object data.
+ - **(OMAP):** Key-value pairs. Used primarily by CephFS and RGW (RADOS
+ Gateway) for metadata storage.
+
+- **OBJECTS:** The notional number of objects stored per pool (that is, the
+ number of objects other than replicas, clones, or snapshots).
+- **USED:** The space allocated for a pool over all OSDs. This includes space
+ for replication, space for allocation granularity, and space for the overhead
+ associated with erasure-coding. Compression savings and object-content gaps
+ are also taken into account. However, BlueStore's database is not included in
+ the amount reported under USED.
+
+ - **(DATA):** Object usage for RBD (RADOS Block Device), CephFS file data,
+ and RGW (RADOS Gateway) object data.
+ - **(OMAP):** Object key-value pairs. Used primarily by CephFS and RGW (RADOS
+ Gateway) for metadata storage.
+
+- **%USED:** The notional percentage of storage used per pool.
+- **MAX AVAIL:** An estimate of the notional amount of data that can be written
+ to this pool.
+- **QUOTA OBJECTS:** The number of quota objects.
+- **QUOTA BYTES:** The number of bytes in the quota objects.
+- **DIRTY:** The number of objects in the cache pool that have been written to
+ the cache pool but have not yet been flushed to the base pool. This field is
+ available only when cache tiering is in use.
+- **USED COMPR:** The amount of space allocated for compressed data. This
+ includes compressed data in addition to all of the space required for
+  replication, allocation granularity, and erasure-coding overhead.
+- **UNDER COMPR:** The amount of data that has passed through compression
+ (summed over all replicas) and that is worth storing in a compressed form.
+
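+Note that some of the columns described above (for example, the quota and
+compression columns) might appear only in the more verbose output of the
+``ceph df detail`` command:
+
+.. prompt:: bash $
+
+   ceph df detail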
+
+.. note:: The numbers in the POOLS section are notional. They do not include
+ the number of replicas, clones, or snapshots. As a result, the sum of the
+ USED and %USED amounts in the POOLS section of the output will not be equal
+ to the sum of the USED and %USED amounts in the RAW section of the output.
+
+.. note:: The MAX AVAIL value is a complicated function of the replication or
+ the kind of erasure coding used, the CRUSH rule that maps storage to
+ devices, the utilization of those devices, and the configured
+ ``mon_osd_full_ratio`` setting.
+
+
+Checking OSD Status
+===================
+
+To check if OSDs are ``up`` and ``in``, run the
+following command:
+
+.. prompt:: bash #
+
+ ceph osd stat
+
+Alternatively, you can run the following command:
+
+.. prompt:: bash #
+
+ ceph osd dump
+
+To view OSDs according to their position in the CRUSH map, run the following
+command:
+
+.. prompt:: bash #
+
+ ceph osd tree
+
+The CRUSH tree shows each host, its OSDs, whether the OSDs are ``up``, and
+the OSDs' weights. For example, the output of ``ceph osd tree`` resembles the
+following:
+
+.. code-block:: bash
+
+ #ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
+ -1 3.00000 pool default
+ -3 3.00000 rack mainrack
+ -2 3.00000 host osd-host
+ 0 ssd 1.00000 osd.0 up 1.00000 1.00000
+ 1 ssd 1.00000 osd.1 up 1.00000 1.00000
+ 2 ssd 1.00000 osd.2 up 1.00000 1.00000
+
+See `Monitoring OSDs and Placement Groups`_.
+
+Checking Monitor Status
+=======================
+
+If your cluster has multiple monitors, then you need to perform certain
+"monitor status" checks. After starting the cluster and before reading or
+writing data, you should check quorum status. A quorum must be present when
+multiple monitors are running to ensure proper functioning of your Ceph
+cluster. Check monitor status regularly in order to ensure that all of the
+monitors are running.
+
+To display the monitor map, run the following command:
+
+.. prompt:: bash $
+
+ ceph mon stat
+
+Alternatively, you can run the following command:
+
+.. prompt:: bash $
+
+ ceph mon dump
+
+To check the quorum status for the monitor cluster, run the following command:
+
+.. prompt:: bash $
+
+ ceph quorum_status
+
+Ceph returns the quorum status. For example, a Ceph cluster that consists of
+three monitors might return the following:
+
+.. code-block:: javascript
+
+ { "election_epoch": 10,
+ "quorum": [
+ 0,
+ 1,
+ 2],
+ "quorum_names": [
+ "a",
+ "b",
+ "c"],
+ "quorum_leader_name": "a",
+ "monmap": { "epoch": 1,
+ "fsid": "444b489c-4f16-4b75-83f0-cb8097468898",
+ "modified": "2011-12-12 13:28:27.505520",
+ "created": "2011-12-12 13:28:27.505520",
+ "features": {"persistent": [
+ "kraken",
+ "luminous",
+ "mimic"],
+ "optional": []
+ },
+ "mons": [
+ { "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:6789/0",
+ "public_addr": "127.0.0.1:6789/0"},
+ { "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:6790/0",
+ "public_addr": "127.0.0.1:6790/0"},
+ { "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:6791/0",
+ "public_addr": "127.0.0.1:6791/0"}
+ ]
+ }
+ }
+
+Checking MDS Status
+===================
+
+Metadata servers provide metadata services for CephFS. Metadata servers have
+two sets of states: ``up | down`` and ``active | inactive``. To check if your
+metadata servers are ``up`` and ``active``, run the following command:
+
+.. prompt:: bash $
+
+ ceph mds stat
+
+To display details of the metadata servers, run the following command:
+
+.. prompt:: bash $
+
+ ceph fs dump
+
+
+Checking Placement Group States
+===============================
+
+Placement groups (PGs) map objects to OSDs. PGs are monitored in order to
+ensure that they are ``active`` and ``clean``. See `Monitoring OSDs and
+Placement Groups`_.
+
+.. _Monitoring OSDs and Placement Groups: ../monitoring-osd-pg
+
+.. _rados-monitoring-using-admin-socket:
+
+Using the Admin Socket
+======================
+
+The Ceph admin socket allows you to query a daemon via a socket interface. By
+default, Ceph sockets reside under ``/var/run/ceph``. To access a daemon via
+the admin socket, log in to the host that is running the daemon and run one of
+the two following commands:
+
+.. prompt:: bash $
+
+ ceph daemon {daemon-name}
+ ceph daemon {path-to-socket-file}
+
+For example, the following commands are equivalent to each other:
+
+.. prompt:: bash $
+
+ ceph daemon osd.0 foo
+ ceph daemon /var/run/ceph/ceph-osd.0.asok foo
+
+To view the available admin-socket commands, run the following command:
+
+.. prompt:: bash $
+
+ ceph daemon {daemon-name} help
+
+Admin-socket commands enable you to view and set your configuration at runtime.
+For more on viewing your configuration, see `Viewing a Configuration at
+Runtime`_. There are two methods of setting a configuration value at runtime: (1)
+using the admin socket, which bypasses the monitor and requires a direct login
+to the host in question, and (2) using the ``ceph tell {daemon-type}.{id}
+config set`` command, which relies on the monitor and does not require a direct
+login.
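+
+For example, both of the following commands set ``debug_osd`` to ``20`` on
+``osd.0`` (an illustrative daemon name). The first must be run on the host
+where the daemon is running; the second can be run from any node that has an
+appropriate keyring:
+
+.. prompt:: bash $
+
+   ceph daemon osd.0 config set debug_osd 20
+   ceph tell osd.0 config set debug_osd 20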
+
+.. _Viewing a Configuration at Runtime: ../../configuration/ceph-conf#viewing-a-configuration-at-runtime
+.. _Storage Capacity: ../../configuration/mon-config-ref#storage-capacity
diff --git a/doc/rados/operations/operating.rst b/doc/rados/operations/operating.rst
new file mode 100644
index 000000000..f4a2fd988
--- /dev/null
+++ b/doc/rados/operations/operating.rst
@@ -0,0 +1,174 @@
+=====================
+ Operating a Cluster
+=====================
+
+.. index:: systemd; operating a cluster
+
+
+Running Ceph with systemd
+=========================
+
+In all distributions that support systemd (CentOS 7, Fedora, Debian 8
+"Jessie" and later, and SUSE), systemd files (and NOT legacy SysVinit scripts)
+are used to manage Ceph daemons. Ceph daemons therefore behave like any other daemons
+that can be controlled by the ``systemctl`` command, as in the following examples:
+
+.. prompt:: bash $
+
+ sudo systemctl start ceph.target # start all daemons
+ sudo systemctl status ceph-osd@12 # check status of osd.12
+
+To list all of the Ceph systemd units on a node, run the following command:
+
+.. prompt:: bash $
+
+ sudo systemctl status ceph\*.service ceph\*.target
+
+
+Starting all daemons
+--------------------
+
+To start all of the daemons on a Ceph node (regardless of their type), run the
+following command:
+
+.. prompt:: bash $
+
+ sudo systemctl start ceph.target
+
+
+Stopping all daemons
+--------------------
+
+To stop all of the daemons on a Ceph node (regardless of their type), run the
+following command:
+
+.. prompt:: bash $
+
+ sudo systemctl stop ceph\*.service ceph\*.target
+
+
+Starting all daemons by type
+----------------------------
+
+To start all of the daemons of a particular type on a Ceph node, run one of the
+following commands:
+
+.. prompt:: bash $
+
+ sudo systemctl start ceph-osd.target
+ sudo systemctl start ceph-mon.target
+ sudo systemctl start ceph-mds.target
+
+
+Stopping all daemons by type
+----------------------------
+
+To stop all of the daemons of a particular type on a Ceph node, run one of the
+following commands:
+
+.. prompt:: bash $
+
+ sudo systemctl stop ceph-osd\*.service ceph-osd.target
+ sudo systemctl stop ceph-mon\*.service ceph-mon.target
+ sudo systemctl stop ceph-mds\*.service ceph-mds.target
+
+
+Starting a daemon
+-----------------
+
+To start a specific daemon instance on a Ceph node, run one of the
+following commands:
+
+.. prompt:: bash $
+
+ sudo systemctl start ceph-osd@{id}
+ sudo systemctl start ceph-mon@{hostname}
+ sudo systemctl start ceph-mds@{hostname}
+
+For example:
+
+.. prompt:: bash $
+
+ sudo systemctl start ceph-osd@1
+ sudo systemctl start ceph-mon@ceph-server
+ sudo systemctl start ceph-mds@ceph-server
+
+
+Stopping a daemon
+-----------------
+
+To stop a specific daemon instance on a Ceph node, run one of the
+following commands:
+
+.. prompt:: bash $
+
+ sudo systemctl stop ceph-osd@{id}
+ sudo systemctl stop ceph-mon@{hostname}
+ sudo systemctl stop ceph-mds@{hostname}
+
+For example:
+
+.. prompt:: bash $
+
+ sudo systemctl stop ceph-osd@1
+ sudo systemctl stop ceph-mon@ceph-server
+ sudo systemctl stop ceph-mds@ceph-server
+
+
+.. index:: sysvinit; operating a cluster
+
+Running Ceph with SysVinit
+==========================
+
+Each time you start, restart, or stop Ceph daemons, you must specify at least one option and one command.
+Likewise, each time you start, restart, or stop your entire cluster, you must specify at least one option and one command.
+In both cases, you can also specify a daemon type or a daemon instance. ::
+
+ {commandline} [options] [commands] [daemons]
+
+The ``ceph`` options include:
+
++-----------------+----------+-------------------------------------------------+
+| Option | Shortcut | Description |
++=================+==========+=================================================+
+| ``--verbose`` | ``-v`` | Use verbose logging. |
++-----------------+----------+-------------------------------------------------+
+| ``--valgrind`` | ``N/A`` | (Dev and QA only) Use `Valgrind`_ debugging. |
++-----------------+----------+-------------------------------------------------+
+| ``--allhosts`` | ``-a`` | Execute on all nodes listed in ``ceph.conf``. |
+| | | Otherwise, it only executes on ``localhost``. |
++-----------------+----------+-------------------------------------------------+
+| ``--restart`` | ``N/A`` | Automatically restart daemon if it core dumps. |
++-----------------+----------+-------------------------------------------------+
+| ``--norestart`` | ``N/A`` | Do not restart a daemon if it core dumps. |
++-----------------+----------+-------------------------------------------------+
+| ``--conf`` | ``-c`` | Use an alternate configuration file. |
++-----------------+----------+-------------------------------------------------+
+
+The ``ceph`` commands include:
+
++------------------+------------------------------------------------------------+
+| Command | Description |
++==================+============================================================+
+| ``start`` | Start the daemon(s). |
++------------------+------------------------------------------------------------+
+| ``stop`` | Stop the daemon(s). |
++------------------+------------------------------------------------------------+
+| ``forcestop`` | Force the daemon(s) to stop. Same as ``kill -9``. |
++------------------+------------------------------------------------------------+
+| ``killall`` | Kill all daemons of a particular type. |
++------------------+------------------------------------------------------------+
+| ``cleanlogs`` | Cleans out the log directory. |
++------------------+------------------------------------------------------------+
+| ``cleanalllogs`` | Cleans out **everything** in the log directory. |
++------------------+------------------------------------------------------------+
+
+The ``[daemons]`` option allows the ``ceph`` service to target specific daemon types
+in order to perform subsystem operations. Daemon types include:
+
+- ``mon``
+- ``osd``
+- ``mds``
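+
+For example, the following sketch of legacy SysVinit usage restarts all OSD
+daemons on every node listed in ``ceph.conf`` (the exact path of the service
+script may vary by distribution):
+
+.. prompt:: bash $
+
+   sudo /etc/init.d/ceph -a restart osd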
+
+.. _Valgrind: http://www.valgrind.org/
+.. _initctl: http://manpages.ubuntu.com/manpages/raring/en/man8/initctl.8.html
diff --git a/doc/rados/operations/pg-concepts.rst b/doc/rados/operations/pg-concepts.rst
new file mode 100644
index 000000000..83062b53a
--- /dev/null
+++ b/doc/rados/operations/pg-concepts.rst
@@ -0,0 +1,104 @@
+.. _rados_operations_pg_concepts:
+
+==========================
+ Placement Group Concepts
+==========================
+
+When you execute commands like ``ceph -w``, ``ceph osd dump``, and other
+commands related to placement groups, Ceph may return values using some
+of the following terms:
+
+*Peering*
+ The process of bringing all of the OSDs that store
+ a Placement Group (PG) into agreement about the state
+ of all of the objects (and their metadata) in that PG.
+ Note that agreeing on the state does not mean that
+ they all have the latest contents.
+
+*Acting Set*
+ The ordered list of OSDs who are (or were as of some epoch)
+ responsible for a particular placement group.
+
+*Up Set*
+ The ordered list of OSDs responsible for a particular placement
+ group for a particular epoch according to CRUSH. Normally this
+ is the same as the *Acting Set*, except when the *Acting Set* has
+ been explicitly overridden via ``pg_temp`` in the OSD Map.
+
+*Current Interval* or *Past Interval*
+ A sequence of OSD map epochs during which the *Acting Set* and *Up
+   Set* for a particular placement group do not change.
+
+*Primary*
+ The member (and by convention first) of the *Acting Set*,
+   that is responsible for coordinating peering, and is
+ the only OSD that will accept client-initiated
+ writes to objects in a placement group.
+
+*Replica*
+ A non-primary OSD in the *Acting Set* for a placement group
+ (and who has been recognized as such and *activated* by the primary).
+
+*Stray*
+ An OSD that is not a member of the current *Acting Set*, but
+ has not yet been told that it can delete its copies of a
+ particular placement group.
+
+*Recovery*
+ Ensuring that copies of all of the objects in a placement group
+ are on all of the OSDs in the *Acting Set*. Once *Peering* has
+ been performed, the *Primary* can start accepting write operations,
+ and *Recovery* can proceed in the background.
+
+*PG Info*
+ Basic metadata about the placement group's creation epoch, the version
+ for the most recent write to the placement group, *last epoch started*,
+ *last epoch clean*, and the beginning of the *current interval*. Any
+ inter-OSD communication about placement groups includes the *PG Info*,
+ such that any OSD that knows a placement group exists (or once existed)
+ also has a lower bound on *last epoch clean* or *last epoch started*.
+
+*PG Log*
+ A list of recent updates made to objects in a placement group.
+ Note that these logs can be truncated after all OSDs
+ in the *Acting Set* have acknowledged up to a certain
+ point.
+
+*Missing Set*
+ Each OSD notes update log entries and if they imply updates to
+ the contents of an object, adds that object to a list of needed
+ updates. This list is called the *Missing Set* for that ``<OSD,PG>``.
+
+*Authoritative History*
+ A complete, and fully ordered set of operations that, if
+ performed, would bring an OSD's copy of a placement group
+ up to date.
+
+*Epoch*
+   A (monotonically increasing) OSD map version number.
+
+*Last Epoch Start*
+ The last epoch at which all nodes in the *Acting Set*
+ for a particular placement group agreed on an
+ *Authoritative History*. At this point, *Peering* is
+ deemed to have been successful.
+
+*up_thru*
+ Before a *Primary* can successfully complete the *Peering* process,
+ it must inform a monitor that is alive through the current
+ OSD map *Epoch* by having the monitor set its *up_thru* in the osd
+ map. This helps *Peering* ignore previous *Acting Sets* for which
+ *Peering* never completed after certain sequences of failures, such as
+ the second interval below:
+
+ - *acting set* = [A,B]
+ - *acting set* = [A]
+ - *acting set* = [] very shortly after (e.g., simultaneous failure, but staggered detection)
+ - *acting set* = [B] (B restarts, A does not)
+
+*Last Epoch Clean*
+ The last *Epoch* at which all nodes in the *Acting set*
+ for a particular placement group were completely
+ up to date (both placement group logs and object contents).
+ At this point, *recovery* is deemed to have been
+ completed.
diff --git a/doc/rados/operations/pg-repair.rst b/doc/rados/operations/pg-repair.rst
new file mode 100644
index 000000000..609318fca
--- /dev/null
+++ b/doc/rados/operations/pg-repair.rst
@@ -0,0 +1,118 @@
+============================
+Repairing PG Inconsistencies
+============================
+Sometimes a Placement Group (PG) might become ``inconsistent``. To return the PG
+to an ``active+clean`` state, you must first determine which of the PGs has become
+inconsistent and then run the ``pg repair`` command on it. This page contains
+commands for diagnosing PGs and the command for repairing PGs that have become
+inconsistent.
+
+.. highlight:: console
+
+Commands for Diagnosing PG Problems
+===================================
+The commands in this section provide various ways of diagnosing broken PGs.
+
+To see an overview of Ceph cluster health, including any failed health
+checks, run the following command:
+
+.. prompt:: bash #
+
+ ceph health detail
+
+To see more detail on the status of the PGs, run the following command:
+
+.. prompt:: bash #
+
+ ceph pg dump --format=json-pretty
+
+To see a list of inconsistent PGs, run the following command:
+
+.. prompt:: bash #
+
+ rados list-inconsistent-pg {pool}
+
+To see a list of inconsistent RADOS objects, run the following command:
+
+.. prompt:: bash #
+
+ rados list-inconsistent-obj {pgid}
+
+To see a list of inconsistent snapsets in a specific PG, run the following
+command:
+
+.. prompt:: bash #
+
+ rados list-inconsistent-snapset {pgid}
+
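+For example, with a hypothetical pool named ``rbd`` that contains an
+inconsistent PG ``1.4``, you might run:
+
+.. prompt:: bash #
+
+   rados list-inconsistent-pg rbd
+   rados list-inconsistent-obj 1.4
+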
+
+Commands for Repairing PGs
+==========================
+The form of the command to repair a broken PG is as follows:
+
+.. prompt:: bash #
+
+ ceph pg repair {pgid}
+
+Here ``{pgid}`` represents the id of the affected PG.
+
+For example:
+
+.. prompt:: bash #
+
+ ceph pg repair 1.4
+
+.. note:: PG IDs have the form ``N.xxxxx``, where ``N`` is the number of the
+   pool that contains the PG. The command ``ceph osd lspools`` and the
+ command ``ceph osd dump | grep pool`` return a list of pool numbers.
+
+More Information on PG Repair
+=============================
+Ceph stores and updates the checksums of objects stored in the cluster. When a
+scrub is performed on a PG, the OSD attempts to choose an authoritative copy
+from among its replicas. Of all the possible cases, only one case is
+consistent: the case in which all copies agree. After performing a deep scrub,
+Ceph calculates the checksum of an object that is read from disk and compares
+it to the checksum that was previously recorded. If the current checksum and
+the previously recorded checksum do not match, that mismatch is considered to
+be an inconsistency. In the case of replicated pools, any mismatch between the
+checksum of any replica of an object and the checksum of the authoritative
+copy means that there is an inconsistency. The discovery of these
+inconsistencies causes a PG's state to be set to ``inconsistent``.
+
+The ``pg repair`` command attempts to fix inconsistencies of various kinds. If
+``pg repair`` finds an inconsistent PG, it attempts to overwrite the digest of
+the inconsistent copy with the digest of the authoritative copy. If ``pg
+repair`` finds an inconsistent copy in a replicated pool, it marks the
+inconsistent copy as missing. In the case of replicated pools, the subsequent
+restoration of the copy is handled by the ordinary recovery process and is
+beyond the scope of ``pg repair``.
+
+In the case of erasure-coded and BlueStore pools, Ceph will automatically
+perform repairs if ``osd_scrub_auto_repair`` (default ``false``) is set to
+``true`` and if no more than ``osd_scrub_auto_repair_num_errors`` (default
+``5``) errors are found.
+
+The ``pg repair`` command will not solve every problem. Ceph does not
+automatically repair PGs when they are found to contain inconsistencies.
+
+The checksum of a RADOS object or an omap is not always available. Checksums
+are calculated incrementally. If a replicated object is updated
+non-sequentially, the write operation involved in the update changes the
+object and invalidates its checksum. Ceph does not read the whole object at
+that time, so the checksum cannot be recalculated. The ``pg repair`` command
+is able to make repairs even when checksums are not available to it, as in the
+case of Filestore. Users working with replicated Filestore pools might prefer
+manual repair to ``ceph pg repair``.
+
+This material is relevant for Filestore, but not for BlueStore, which has its
+own internal checksums. The matched-record checksum and the calculated checksum
+cannot prove that any specific copy is in fact authoritative. If there is no
+checksum available, ``pg repair`` favors the data on the primary, but this
+might not be the uncorrupted replica. Because of this uncertainty, human
+intervention is necessary when an inconsistency is discovered. This
+intervention sometimes involves use of ``ceph-objectstore-tool``.
+
+External Links
+==============
+https://ceph.io/geen-categorie/ceph-manually-repair-object/ - This page
+contains a walkthrough of the repair of a PG. It is recommended reading if you
+want to repair a PG but have never done so.
diff --git a/doc/rados/operations/pg-states.rst b/doc/rados/operations/pg-states.rst
new file mode 100644
index 000000000..495229d92
--- /dev/null
+++ b/doc/rados/operations/pg-states.rst
@@ -0,0 +1,118 @@
+========================
+ Placement Group States
+========================
+
+When checking a cluster's status (e.g., running ``ceph -w`` or ``ceph -s``),
+Ceph will report on the status of the placement groups. A placement group has
+one or more states. The optimum state for placement groups in the placement
+group map is ``active+clean``.
+
+*creating*
+ Ceph is still creating the placement group.
+
+*activating*
+ The placement group is peered but not yet active.
+
+*active*
+ Ceph will process requests to the placement group.
+
+*clean*
+  Ceph has replicated all objects in the placement group the correct number of times.
+
+*down*
+ A replica with necessary data is down, so the placement group is offline.
+
+*laggy*
+ A replica is not acknowledging new leases from the primary in a timely fashion; IO is temporarily paused.
+
+*wait*
+ The set of OSDs for this PG has just changed and IO is temporarily paused until the previous interval's leases expire.
+
+*scrubbing*
+ Ceph is checking the placement group metadata for inconsistencies.
+
+*deep*
+ Ceph is checking the placement group data against stored checksums.
+
+*degraded*
+ Ceph has not replicated some objects in the placement group the correct number of times yet.
+
+*inconsistent*
+  Ceph detects inconsistencies in one or more replicas of an object in the placement group
+ (e.g. objects are the wrong size, objects are missing from one replica *after* recovery finished, etc.).
+
+*peering*
+  The placement group is undergoing the peering process.
+
+*repair*
+ Ceph is checking the placement group and repairing any inconsistencies it finds (if possible).
+
+*recovering*
+ Ceph is migrating/synchronizing objects and their replicas.
+
+*forced_recovery*
+  The user has enforced a high recovery priority for this PG.
+
+*recovery_wait*
+  The placement group is waiting in line to start recovery.
+
+*recovery_toofull*
+ A recovery operation is waiting because the destination OSD is over its
+ full ratio.
+
+*recovery_unfound*
+ Recovery stopped due to unfound objects.
+
+*backfilling*
+ Ceph is scanning and synchronizing the entire contents of a placement group
+ instead of inferring what contents need to be synchronized from the logs of
+ recent operations. Backfill is a special case of recovery.
+
+*forced_backfill*
+  The user has enforced a high backfill priority for this PG.
+
+*backfill_wait*
+ The placement group is waiting in line to start backfill.
+
+*backfill_toofull*
+ A backfill operation is waiting because the destination OSD is over
+ the backfillfull ratio.
+
+*backfill_unfound*
+ Backfill stopped due to unfound objects.
+
+*incomplete*
+ Ceph detects that a placement group is missing information about
+ writes that may have occurred, or does not have any healthy
+ copies. If you see this state, try to start any failed OSDs that may
+  contain the needed information. In the case of an erasure-coded pool,
+  temporarily reducing ``min_size`` may allow recovery.
+
+*stale*
+ The placement group is in an unknown state - the monitors have not received
+ an update for it since the placement group mapping changed.
+
+*remapped*
+ The placement group is temporarily mapped to a different set of OSDs from what
+ CRUSH specified.
+
+*undersized*
+ The placement group has fewer copies than the configured pool replication level.
+
+*peered*
+ The placement group has peered, but cannot serve client IO due to not having
+ enough copies to reach the pool's configured min_size parameter. Recovery
+ may occur in this state, so the pg may heal up to min_size eventually.
+
+*snaptrim*
+ Trimming snaps.
+
+*snaptrim_wait*
+ Queued to trim snaps.
+
+*snaptrim_error*
+  Snap trimming stopped due to an error.
+
+*unknown*
+  The ceph-mgr hasn't yet received any information about the PG's state from an
+  OSD since the mgr started up.
diff --git a/doc/rados/operations/placement-groups.rst b/doc/rados/operations/placement-groups.rst
new file mode 100644
index 000000000..dda4a0177
--- /dev/null
+++ b/doc/rados/operations/placement-groups.rst
@@ -0,0 +1,897 @@
+.. _placement groups:
+
+==================
+ Placement Groups
+==================
+
+.. _pg-autoscaler:
+
+Autoscaling placement groups
+============================
+
+Placement groups (PGs) are an internal implementation detail of how Ceph
+distributes data. Autoscaling provides a way to manage PGs, and especially to
+manage the number of PGs present in different pools. When *pg-autoscaling* is
+enabled, the cluster is allowed to make recommendations or automatic
+adjustments with respect to the number of PGs for each pool (``pg_num``) in
+accordance with expected cluster utilization and expected pool utilization.
+
+Each pool has a ``pg_autoscale_mode`` property that can be set to ``off``,
+``on``, or ``warn``:
+
+* ``off``: Disable autoscaling for this pool. It is up to the administrator to
+  choose an appropriate ``pg_num`` for each pool. For more information, see
+  :ref:`choosing-number-of-placement-groups`.
+* ``on``: Enable automated adjustments of the PG count for the given pool.
+* ``warn``: Raise health checks when the PG count is in need of adjustment.
+
+To set the autoscaling mode for an existing pool, run a command of the
+following form:
+
+.. prompt:: bash #
+
+ ceph osd pool set <pool-name> pg_autoscale_mode <mode>
+
+For example, to enable autoscaling on pool ``foo``, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool set foo pg_autoscale_mode on
+
+There is also a ``pg_autoscale_mode`` setting for any pools that are created
+after the initial setup of the cluster. To change this setting, run a command
+of the following form:
+
+.. prompt:: bash #
+
+ ceph config set global osd_pool_default_pg_autoscale_mode <mode>
+
+You can disable or enable the autoscaler for all pools with the ``noautoscale``
+flag. By default, this flag is set to ``off``, but you can set it to ``on`` by
+running the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool set noautoscale
+
+To set the ``noautoscale`` flag to ``off``, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool unset noautoscale
+
+To get the value of the flag, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool get noautoscale
+
+Viewing PG scaling recommendations
+----------------------------------
+
+To view each pool, its relative utilization, and any recommended changes to the
+PG count, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool autoscale-status
+
+The output will resemble the following::
+
+ POOL SIZE TARGET SIZE RATE RAW CAPACITY RATIO TARGET RATIO EFFECTIVE RATIO BIAS PG_NUM NEW PG_NUM AUTOSCALE BULK
+ a 12900M 3.0 82431M 0.4695 8 128 warn True
+ c 0 3.0 82431M 0.0000 0.2000 0.9884 1.0 1 64 warn True
+ b 0 953.6M 3.0 82431M 0.0347 8 warn False
+
+- **POOL** is the name of the pool.
+
+- **SIZE** is the amount of data stored in the pool.
+
+- **TARGET SIZE** (if present) is the amount of data that is expected to be
+ stored in the pool, as specified by the administrator. The system uses the
+ greater of the two values for its calculation.
+
+- **RATE** is the multiplier for the pool that determines how much raw storage
+ capacity is consumed. For example, a three-replica pool will have a ratio of
+ 3.0, and a ``k=4 m=2`` erasure-coded pool will have a ratio of 1.5.
+
+- **RAW CAPACITY** is the total amount of raw storage capacity on the specific
+ OSDs that are responsible for storing the data of the pool (and perhaps the
+ data of other pools).
+
+- **RATIO** is the ratio of (1) the storage consumed by the pool to (2) the
+  total raw storage capacity. In other words, RATIO is defined as
+ (SIZE * RATE) / RAW CAPACITY.
+
+- **TARGET RATIO** (if present) is the ratio of the expected storage of this
+ pool (that is, the amount of storage that this pool is expected to consume,
+ as specified by the administrator) to the expected storage of all other pools
+ that have target ratios set. If both ``target_size_bytes`` and
+ ``target_size_ratio`` are specified, then ``target_size_ratio`` takes
+ precedence.
+
+- **EFFECTIVE RATIO** is the result of making two adjustments to the target
+ ratio:
+
+ #. Subtracting any capacity expected to be used by pools that have target
+ size set.
+
+ #. Normalizing the target ratios among pools that have target ratio set so
+ that collectively they target cluster capacity. For example, four pools
+ with target_ratio 1.0 would have an effective ratio of 0.25.
+
+ The system's calculations use whichever of these two ratios (that is, the
+ target ratio and the effective ratio) is greater.
+
+- **BIAS** is used as a multiplier to manually adjust a pool's PG count in accordance
+ with prior information about how many PGs a specific pool is expected to
+ have.
+
+- **PG_NUM** is either the current number of PGs associated with the pool or,
+ if a ``pg_num`` change is in progress, the current number of PGs that the
+ pool is working towards.
+
+- **NEW PG_NUM** (if present) is the value that the system is recommending the
+ ``pg_num`` of the pool to be changed to. It is always a power of 2, and it is
+ present only if the recommended value varies from the current value by more
+ than the default factor of ``3``. To adjust this factor (in the following
+ example, it is changed to ``2``), run the following command:
+
+ .. prompt:: bash #
+
+     ceph config set mgr mgr/pg_autoscaler/threshold 2.0
+
+- **AUTOSCALE** is the pool's ``pg_autoscale_mode`` and is set to ``on``,
+ ``off``, or ``warn``.
+
+- **BULK** determines whether the pool is ``bulk``. It has a value of ``True``
+  or ``False``. A ``bulk`` pool is expected to be large and should initially
+  have a large number of PGs so that performance does not suffer. On the other
+ hand, a pool that is not ``bulk`` is expected to be small (for example, a
+ ``.mgr`` pool or a meta pool).
+
+.. note::
+
+ If the ``ceph osd pool autoscale-status`` command returns no output at all,
+ there is probably at least one pool that spans multiple CRUSH roots. This
+ 'spanning pool' issue can happen in scenarios like the following:
+ when a new deployment auto-creates the ``.mgr`` pool on the ``default``
+ CRUSH root, subsequent pools are created with rules that constrain them to a
+ specific shadow CRUSH tree. For example, if you create an RBD metadata pool
+ that is constrained to ``deviceclass = ssd`` and an RBD data pool that is
+ constrained to ``deviceclass = hdd``, you will encounter this issue. To
+ remedy this issue, constrain the spanning pool to only one device class. In
+ the above scenario, there is likely to be a ``replicated-ssd`` CRUSH rule in
+ effect, and the ``.mgr`` pool can be constrained to ``ssd`` devices by
+ running the following commands:
+
+ .. prompt:: bash #
+
+      ceph osd pool set .mgr crush_rule replicated-ssd
+
+ This intervention will result in a small amount of backfill, but
+ typically this traffic completes quickly.
+
+
+Automated scaling
+-----------------
+
+In the simplest approach to automated scaling, the cluster is allowed to
+automatically scale ``pgp_num`` in accordance with usage. Ceph considers the
+total available storage and the target number of PGs for the whole system,
+considers how much data is stored in each pool, and apportions PGs accordingly.
+The system is conservative with its approach, making changes to a pool only
+when the current number of PGs (``pg_num``) varies by more than a factor of 3
+from the recommended number.
+
+The target number of PGs per OSD is determined by the ``mon_target_pg_per_osd``
+parameter (default: 100), which can be adjusted by running the following
+command:
+
+.. prompt:: bash #
+
+ ceph config set global mon_target_pg_per_osd 100
+
+The autoscaler analyzes pools and adjusts on a per-subtree basis. Because each
+pool might map to a different CRUSH rule, and each rule might distribute data
+across different devices, Ceph will consider the utilization of each subtree of
+the hierarchy independently. For example, a pool that maps to OSDs of class
+``ssd`` and a pool that maps to OSDs of class ``hdd`` will each have optimal PG
+counts that are determined by how many of these two different device types
+there are.
+
+If a pool uses OSDs under two or more CRUSH roots (for example, shadow trees
+with both ``ssd`` and ``hdd`` devices), the autoscaler issues a warning to the
+user in the manager log. The warning states the name of the pool and the set of
+roots that overlap each other. The autoscaler does not scale any pools with
+overlapping roots because this condition can cause problems with the scaling
+process. We recommend constraining each pool so that it belongs to only one
+root (that is, one OSD class) to silence the warning and ensure a successful
+scaling process.
+
+.. _managing_bulk_flagged_pools:
+
+Managing pools that are flagged with ``bulk``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a pool is flagged ``bulk``, then the autoscaler starts the pool with a full
+complement of PGs and then scales down the number of PGs only if the usage
+ratio across the pool is uneven. However, if a pool is not flagged ``bulk``,
+then the autoscaler starts the pool with minimal PGs and creates additional PGs
+only if there is more usage in the pool.
+
+To create a pool that will be flagged ``bulk``, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool create <pool-name> --bulk
+
+To set or unset the ``bulk`` flag of an existing pool, run the following
+command:
+
+.. prompt:: bash #
+
+ ceph osd pool set <pool-name> bulk <true/false/1/0>
+
+To get the ``bulk`` flag of an existing pool, run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool get <pool-name> bulk
+
+.. _specifying_pool_target_size:
+
+Specifying expected pool size
+-----------------------------
+
+When a cluster or pool is first created, it consumes only a small fraction of
+the total cluster capacity and appears to the system as if it should need only
+a small number of PGs. However, in some cases, cluster administrators know
+which pools are likely to consume most of the system capacity in the long run.
+When Ceph is provided with this information, a more appropriate number of PGs
+can be used from the beginning, obviating subsequent changes in ``pg_num`` and
+the associated overhead cost of relocating data.
+
+The *target size* of a pool can be specified in two ways: either in relation to
+the absolute size (in bytes) of the pool, or as a weight relative to all other
+pools that have ``target_size_ratio`` set.
+
+For example, to tell the system that ``mypool`` is expected to consume 100 TB,
+run the following command:
+
+.. prompt:: bash #
+
+ ceph osd pool set mypool target_size_bytes 100T
+
+Alternatively, to tell the system that ``mypool`` is expected to consume a
+ratio of 1.0 relative to other pools that have ``target_size_ratio`` set,
+adjust the ``target_size_ratio`` setting of ``mypool`` by running the
+following command:
+
+.. prompt:: bash #
+
+ ceph osd pool set mypool target_size_ratio 1.0
+
+If ``mypool`` is the only pool in the cluster, then it is expected to use 100% of
+the total cluster capacity. However, if the cluster contains a second pool that
+has ``target_size_ratio`` set to 1.0, then both pools are expected to use 50%
+of the total cluster capacity.
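+
+As a sketch of this weighting (the pool names are hypothetical), the following
+commands tell the system to expect ``poolA`` to consume four times as much
+capacity as ``poolB``:
+
+.. prompt:: bash #
+
+   ceph osd pool set poolA target_size_ratio 0.8
+   ceph osd pool set poolB target_size_ratio 0.2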
+
+The ``ceph osd pool create`` command has two command-line options that can be
+used to set the target size of a pool at creation time: ``--target-size-bytes
+<bytes>`` and ``--target-size-ratio <ratio>``.
+
+Note that if the target-size values that have been specified are impossible
+(for example, a capacity larger than the total cluster), then a health check
+(``POOL_TARGET_SIZE_BYTES_OVERCOMMITTED``) will be raised.
+
+If both ``target_size_ratio`` and ``target_size_bytes`` are specified for a
+pool, then the latter will be ignored, the former will be used in system
+calculations, and a health check (``POOL_HAS_TARGET_SIZE_BYTES_AND_RATIO``)
+will be raised.
+
+Specifying bounds on a pool's PGs
+---------------------------------
+
+It is possible to specify both the minimum number and the maximum number of PGs
+for a pool.
+
+Setting a Minimum Number of PGs and a Maximum Number of PGs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a minimum is set, then Ceph will not itself reduce (nor recommend that you
+reduce) the number of PGs to a value below the configured value. Setting a
+minimum serves to establish a lower bound on the amount of parallelism enjoyed
+by a client during I/O, even if a pool is mostly empty.
+
+If a maximum is set, then Ceph will not itself increase (or recommend that you
+increase) the number of PGs to a value above the configured value.
+
+To set the minimum number of PGs for a pool, run a command of the following
+form:
+
+.. prompt:: bash #
+
+ ceph osd pool set <pool-name> pg_num_min <num>
+
+To set the maximum number of PGs for a pool, run a command of the following
+form:
+
+.. prompt:: bash #
+
+ ceph osd pool set <pool-name> pg_num_max <num>
+
+In addition, the ``ceph osd pool create`` command has two command-line options
+that can be used to specify the minimum or maximum PG count of a pool at
+creation time: ``--pg-num-min <num>`` and ``--pg-num-max <num>``.
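+
+For example, the following hypothetical command creates a pool whose PG count
+the autoscaler may adjust only within the stated bounds (the pool name and the
+bounds themselves are illustrative):
+
+.. prompt:: bash #
+
+   ceph osd pool create mypool --pg-num-min 32 --pg-num-max 256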
+
+.. _preselection:
+
+Preselecting pg_num
+===================
+
+When creating a pool with the following command, you have the option to
+preselect the value of the ``pg_num`` parameter:
+
+.. prompt:: bash #
+
+ ceph osd pool create {pool-name} [pg_num]
+
+If you opt not to specify ``pg_num`` in this command, the cluster uses the PG
+autoscaler to automatically configure the parameter in accordance with the
+amount of data that is stored in the pool (see :ref:`pg-autoscaler` above).
+
+However, your decision of whether or not to specify ``pg_num`` at creation time
+has no effect on whether the parameter will be automatically tuned by the
+cluster afterwards. As seen above, autoscaling of PGs is enabled or disabled by
+running a command of the following form:
+
+.. prompt:: bash #
+
+ ceph osd pool set {pool-name} pg_autoscale_mode (on|off|warn)
+
+Without the balancer, the suggested target is approximately 100 PG replicas on
+each OSD. With the balancer, an initial target of 50 PG replicas on each OSD is
+reasonable.
+
+The autoscaler attempts to satisfy the following conditions:
+
+- the number of PGs per OSD should be proportional to the amount of data in the
+ pool
+- there should be 50-100 PGs per pool, taking into account the replication
+ overhead or erasure-coding fan-out of each PG's replicas across OSDs
+
+Use of Placement Groups
+=======================
+
+A placement group aggregates objects within a pool. The tracking of RADOS
+object placement and object metadata on a per-object basis is computationally
+expensive. It would be infeasible for a system with millions of RADOS
+objects to efficiently track placement on a per-object basis.
+
+.. ditaa::
+ /-----\ /-----\ /-----\ /-----\ /-----\
+ | obj | | obj | | obj | | obj | | obj |
+ \-----/ \-----/ \-----/ \-----/ \-----/
+ | | | | |
+ +--------+--------+ +---+----+
+ | |
+ v v
+ +-----------------------+ +-----------------------+
+ | Placement Group #1 | | Placement Group #2 |
+ | | | |
+ +-----------------------+ +-----------------------+
+ | |
+ +------------------------------+
+ |
+ v
+ +-----------------------+
+ | Pool |
+ | |
+ +-----------------------+
+
+The Ceph client calculates which PG a RADOS object should be in. As part of
+this calculation, the client hashes the object ID and performs an operation
+involving both the number of PGs in the specified pool and the pool ID. For
+details, see `Mapping PGs to OSDs`_.
+
+The contents of a RADOS object belonging to a PG are stored in a set of OSDs.
+For example, in a replicated pool of size two, each PG will store objects on
+two OSDs, as shown below:
+
+.. ditaa::
+ +-----------------------+ +-----------------------+
+ | Placement Group #1 | | Placement Group #2 |
+ | | | |
+ +-----------------------+ +-----------------------+
+ | | | |
+ v v v v
+ /----------\ /----------\ /----------\ /----------\
+ | | | | | | | |
+ | OSD #1 | | OSD #2 | | OSD #2 | | OSD #3 |
+ | | | | | | | |
+ \----------/ \----------/ \----------/ \----------/
+
+
+If OSD #2 fails, another OSD will be assigned to Placement Group #1 and then
+filled with copies of all objects in OSD #1. If the pool size is changed from
+two to three, an additional OSD will be assigned to the PG and will receive
+copies of all objects in the PG.
+
+An OSD assigned to a PG is not owned exclusively by that PG; rather, the OSD is
+shared with other PGs either from the same pool or from other pools. In our
+example, OSD #2 is shared by Placement Group #1 and Placement Group #2. If OSD
+#2 fails, then Placement Group #2 must restore copies of objects (by making use
+of OSD #3).
+
+When the number of PGs increases, several consequences ensue. The new PGs are
+assigned OSDs. The result of the CRUSH function changes, which means that some
+objects from the already-existing PGs are copied to the new PGs and removed
+from the old ones.
+
+Factors Relevant To Specifying pg_num
+=====================================
+
+On the one hand, the criteria of data durability and even distribution across
+OSDs weigh in favor of a high number of PGs. On the other hand, the criteria of
+saving CPU resources and minimizing memory usage weigh in favor of a low number
+of PGs.
+
+.. _data durability:
+
+Data durability
+---------------
+
+When an OSD fails, the risk of data loss is increased until replication of the
+data it hosted is restored to the configured level. To illustrate this point,
+let's imagine a scenario that results in permanent data loss in a single PG:
+
+#. The OSD fails and all copies of the object that it contains are lost. For
+ each object within the PG, the number of its replicas suddenly drops from
+ three to two.
+
+#. Ceph starts recovery for this PG by choosing a new OSD on which to re-create
+ the third copy of each object.
+
+#. Another OSD within the same PG fails before the new OSD is fully populated
+ with the third copy. Some objects will then only have one surviving copy.
+
+#. Ceph selects yet another OSD and continues copying objects in order to
+ restore the desired number of copies.
+
+#. A third OSD within the same PG fails before recovery is complete. If this
+ OSD happened to contain the only remaining copy of an object, the object is
+ permanently lost.
+
+In a cluster containing 10 OSDs with 512 PGs in a three-replica pool, CRUSH
+will give each PG three OSDs. Ultimately, each OSD hosts
+:math:`\frac{512 \times 3}{10} \approx 150` PGs. So when the first OSD fails
+in the above scenario, recovery will begin for all 150 PGs at the same time.
+
+The 150 PGs that are being recovered are likely to be homogeneously distributed
+across the 9 remaining OSDs. Each remaining OSD is therefore likely to send
+copies of objects to all other OSDs and also likely to receive some new objects
+to be stored because it has become part of a new PG.
+
+The amount of time it takes for this recovery to complete depends on the
+architecture of the Ceph cluster. Compare two setups: (1) Each OSD is hosted by
+a 1 TB SSD on a single machine, all of the OSDs are connected to a 10 Gb/s
+switch, and the recovery of a single OSD completes within a certain number of
+minutes. (2) There are two OSDs per machine using HDDs with no SSD WAL+DB and
+a 1 Gb/s switch. In the second setup, recovery will be at least one order of
+magnitude slower.
+
+In such a cluster, the number of PGs has almost no effect on data durability.
+Whether there are 128 PGs per OSD or 8192 PGs per OSD, the recovery will be no
+slower or faster.
+
+However, an increase in the number of OSDs can increase the speed of recovery.
+Suppose our Ceph cluster is expanded from 10 OSDs to 20 OSDs. Each OSD now
+participates in only ~75 PGs rather than ~150 PGs. All 19 remaining OSDs will
+still be required to replicate the same number of objects in order to recover.
+But instead of there being only 10 OSDs that have to copy ~100 GB each, there
+are now 20 OSDs that have to copy only 50 GB each. If the network had
+previously been a bottleneck, recovery now happens twice as fast.
+
+Similarly, suppose that our cluster grows to 40 OSDs. Each OSD will host only
+~38 PGs. And if an OSD dies, recovery will take place faster than before unless
+it is blocked by another bottleneck. Now, however, suppose that our cluster
+grows to 200 OSDs. Each OSD will host only ~7 PGs. And if an OSD dies, recovery
+will happen across at most :math:`\approx 21` (that is, :math:`7 \times 3`) OSDs
+associated with these PGs. This means that recovery will take longer than when
+there were only 40 OSDs. For this reason, the number of PGs should be
+increased.
+
+No matter how brief the recovery time is, there is always a chance that an
+additional OSD will fail while recovery is in progress. Consider the cluster
+with 10 OSDs described above: if a second OSD fails while recovery is in
+progress, then :math:`\approx 17` (approximately 150 divided by 9) PGs will
+have only one remaining copy. And if one of the 8 remaining OSDs then fails,
+then 2 (approximately 17 divided by 8) PGs are likely to lose their remaining
+objects. This is one reason why setting ``size=2`` is risky.
+
+When the number of OSDs in the cluster increases to 20, the number of PGs that
+would be damaged by the loss of three OSDs significantly decreases. The loss of
+a second OSD degrades only approximately :math:`4` (that is, :math:`\frac{75}{19}`)
+PGs rather than :math:`\approx 17` PGs, and the loss of a third OSD results in
+data loss only if it is one of the 4 OSDs that contains the remaining copy.
+This means -- assuming that the probability of losing one OSD during recovery
+is 0.0001% -- that the probability of data loss when three OSDs are lost is
+:math:`\approx 17 \times 10 \times 0.0001\%` in the cluster with 10 OSDs, and
+only :math:`\approx 4 \times 20 \times 0.0001\%` in the cluster with 20 OSDs.
+
+In summary, the greater the number of OSDs, the faster the recovery and the
+lower the risk of permanently losing a PG due to cascading failures. As far as
+data durability is concerned, in a cluster with fewer than 50 OSDs, it doesn't
+much matter whether there are 512 or 4096 PGs.
+
+.. note:: It can take a long time for an OSD that has been recently added to
+ the cluster to be populated with the PGs assigned to it. However, no object
+ degradation or impact on data durability will result from the slowness of
+ this process since Ceph populates data into the new PGs before removing it
+ from the old PGs.
+
+.. _object distribution:
+
+Object distribution within a pool
+---------------------------------
+
+Under ideal conditions, objects are evenly distributed across PGs. Because
+CRUSH computes the PG for each object but does not know how much data is stored
+in each OSD associated with the PG, the ratio between the number of PGs and the
+number of OSDs can have a significant influence on data distribution.
+
+For example, suppose that there is only a single PG for ten OSDs in a
+three-replica pool. In that case, only three OSDs would be used because CRUSH
+would have no other option. However, if more PGs are available, RADOS objects are
+more likely to be evenly distributed across OSDs. CRUSH makes every effort to
+distribute OSDs evenly across all existing PGs.
+
+As long as there are one or two orders of magnitude more PGs than OSDs, the
+distribution is likely to be even. For example: 256 PGs for 3 OSDs, 512 PGs for
+10 OSDs, or 1024 PGs for 10 OSDs.
+
+However, uneven data distribution can emerge due to factors other than the
+ratio of PGs to OSDs. For example, since CRUSH does not take into account the
+size of the RADOS objects, the presence of a few very large RADOS objects can
+create an imbalance. Suppose that one million 4 KB RADOS objects totaling 4 GB
+are evenly distributed among 1024 PGs on 10 OSDs. These RADOS objects will
+consume 4 GB / 10 = 400 MB on each OSD. If a single 400 MB RADOS object is then
+added to the pool, the three OSDs supporting the PG in which the RADOS object
+has been placed will each be filled with 400 MB + 400 MB = 800 MB but the seven
+other OSDs will still contain only 400 MB.
+
+.. _resource usage:
+
+Memory, CPU and network usage
+-----------------------------
+
+Every PG in the cluster imposes memory, network, and CPU demands upon OSDs and
+MONs. These needs must be met at all times and are increased during recovery.
+Indeed, one of the main reasons PGs were developed was to share this overhead
+by clustering objects together.
+
+For this reason, minimizing the number of PGs saves significant resources.
+
+.. _choosing-number-of-placement-groups:
+
+Choosing the Number of PGs
+==========================
+
+.. note:: It is rarely necessary to do the math in this section by hand.
+ Instead, use the ``ceph osd pool autoscale-status`` command in combination
+ with the ``target_size_bytes`` or ``target_size_ratio`` pool properties. For
+ more information, see :ref:`pg-autoscaler`.
+
+If you have more than 50 OSDs, we recommend approximately 50-100 PGs per OSD in
+order to balance resource usage, data durability, and data distribution. If you
+have fewer than 50 OSDs, follow the guidance in the `preselection`_ section.
+For a single pool, use the following formula to get a baseline value:
+
+ Total PGs = :math:`\frac{OSDs \times 100}{pool \: size}`
+
+Here **pool size** is either the number of replicas for replicated pools or the
+K+M sum for erasure-coded pools. To retrieve this sum, run the command ``ceph
+osd erasure-code-profile get``.
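+
+For example, for a hypothetical erasure-code profile ``myprofile`` with
+``k=4`` and ``m=2``, the **pool size** to use in the formula below would be
+:math:`4 + 2 = 6`. The exact output varies by profile:
+
+.. prompt:: bash #
+
+   ceph osd erasure-code-profile get myprofile
+
+::
+
+    k=4
+    m=2
+    plugin=jerasure
+    technique=reed_sol_van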
+
+Next, check whether the resulting baseline value is consistent with the way you
+designed your Ceph cluster to maximize `data durability`_ and `object
+distribution`_ and to minimize `resource usage`_.
+
+This value should be **rounded up to the nearest power of two**.
+
+Each pool's ``pg_num`` should be a power of two. Other values are likely to
+result in uneven distribution of data across OSDs. It is best to increase
+``pg_num`` for a pool only when it is feasible and desirable to set the next
+highest power of two. Note that this power of two rule is per-pool; it is
+neither necessary nor easy to align the sum of all pools' ``pg_num`` to a power
+of two.
+
+For example, if you have a cluster with 200 OSDs and a single pool with a size
+of 3 replicas, estimate the number of PGs as follows:
+
+ :math:`\frac{200 \times 100}{3} = 6667`. Rounded up to the nearest power of 2: 8192.
+
+When using multiple data pools to store objects, make sure that you balance the
+number of PGs per pool against the number of PGs per OSD so that you arrive at
+a reasonable total number of PGs. It is important to find a number that
+provides reasonably low variance per OSD without taxing system resources or
+making the peering process too slow.
+
+For example, suppose you have a cluster of 10 pools, each with 512 PGs on 10
+OSDs. That amounts to 5,120 PGs distributed across 10 OSDs, or 512 PGs per OSD.
+This cluster will not use too many resources. However, in a cluster of 1,000
+pools, each with 512 PGs on 10 OSDs, the OSDs will have to handle ~50,000 PGs
+each. This cluster will require significantly more resources and significantly
+more time for peering.
+
+For determining the optimal number of PGs per OSD, we recommend the `PGCalc`_
+tool.
+
+
+.. _setting the number of placement groups:
+
+Setting the Number of PGs
+=========================
+
+Setting the initial number of PGs in a pool must be done at the time you create
+the pool. See `Create a Pool`_ for details.
+
+However, even after a pool is created, if the ``pg_autoscaler`` is not being
+used to manage ``pg_num`` values, you can change the number of PGs by running a
+command of the following form:
+
+.. prompt:: bash #
+
+ ceph osd pool set {pool-name} pg_num {pg_num}
+
+If you increase the number of PGs, your cluster will not rebalance until you
+increase the number of PGs for placement (``pgp_num``). The ``pgp_num``
+parameter specifies the number of PGs that are to be considered for placement
+by the CRUSH algorithm. Increasing ``pg_num`` splits the PGs in your cluster,
+but data will not be migrated to the newer PGs until ``pgp_num`` is increased.
+The ``pgp_num`` parameter should be equal to the ``pg_num`` parameter. To
+increase the number of PGs for placement, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph osd pool set {pool-name} pgp_num {pgp_num}
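+
+For example, a minimal sketch of splitting a hypothetical pool ``mypool`` from
+64 to 128 PGs, followed by the ``pgp_num`` change that actually triggers the
+data migration:
+
+.. prompt:: bash #
+
+   ceph osd pool set mypool pg_num 128
+   ceph osd pool set mypool pgp_num 128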
+
+If you decrease the number of PGs, then ``pgp_num`` is adjusted automatically.
+In Nautilus and later releases, when the ``pg_autoscaler`` is not used,
+``pgp_num`` is automatically stepped to match ``pg_num``. This process
+manifests as periods of PG remapping and backfill, and is expected, normal
+behavior.
+
+.. _rados_ops_pgs_get_pg_num:
+
+Get the Number of PGs
+=====================
+
+To get the number of PGs in a pool, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph osd pool get {pool-name} pg_num
+
+
+Get a Cluster's PG Statistics
+=============================
+
+To see the details of the PGs in your cluster, run a command of the following
+form:
+
+.. prompt:: bash #
+
+ ceph pg dump [--format {format}]
+
+Valid formats are ``plain`` (default) and ``json``.
+
+
+Get Statistics for Stuck PGs
+============================
+
+To see the statistics for all PGs that are stuck in a specified state, run a
+command of the following form:
+
+.. prompt:: bash #
+
+ ceph pg dump_stuck inactive|unclean|stale|undersized|degraded [--format <format>] [-t|--threshold <seconds>]
+
+- **Inactive** PGs cannot process reads or writes because they are waiting for
+ enough OSDs with the most up-to-date data to come ``up`` and ``in``.
+
+- **Undersized** PGs contain objects that have not been replicated the desired
+ number of times. Under normal conditions, it can be assumed that these PGs
+ are recovering.
+
+- **Stale** PGs are in an unknown state -- the OSDs that host them have not
+ reported to the monitor cluster for a certain period of time (determined by
+ ``mon_osd_report_timeout``).
+
+Valid formats are ``plain`` (default) and ``json``. The threshold defines the
+minimum number of seconds the PG is stuck before it is included in the returned
+statistics (default: 300).
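+
+For example, to list the PGs that have been stuck in the ``stale`` state for
+at least ten minutes, using the ``-t``/``--threshold`` option shown above:
+
+.. prompt:: bash #
+
+   ceph pg dump_stuck stale -t 600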
+
+
+Get a PG Map
+============
+
+To get the PG map for a particular PG, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph pg map {pg-id}
+
+For example:
+
+.. prompt:: bash #
+
+ ceph pg map 1.6c
+
+Ceph will return the PG map, the PG, and the OSD status. The output resembles
+the following:
+
+::
+
+    osdmap e13 pg 1.6c (1.6c) -> up [1,0] acting [1,0]
+
+
+Get a PG's Statistics
+=====================
+
+To see statistics for a particular PG, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph pg {pg-id} query
+
+
+Scrub a PG
+==========
+
+To scrub a PG, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph pg scrub {pg-id}
+
+Ceph checks the primary and replica OSDs, generates a catalog of all objects in
+the PG, and compares the objects against each other in order to ensure that no
+objects are missing or mismatched and that their contents are consistent. If
+the replicas all match, then a final semantic sweep takes place to ensure that
+all snapshot-related object metadata is consistent. Errors are reported in
+logs.
+
+To scrub all PGs from a specific pool, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph osd pool scrub {pool-name}
+
+
+Prioritize backfill/recovery of PG(s)
+=====================================
+
+You might encounter a situation in which multiple PGs require recovery or
+backfill, but the data in some PGs is more important than the data in others
+(for example, some PGs hold data for images that are used by running machines
+and other PGs are used by inactive machines and hold data that is less
+relevant). In that case, you might want to prioritize recovery or backfill of
+the PGs with especially important data so that the performance of the cluster
+and the availability of that data are restored sooner. To designate specific
+PG(s) as prioritized during recovery, run a command of the following form:
+
+.. prompt:: bash #
+
+ ceph pg force-recovery {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+
+To mark specific PG(s) as prioritized during backfill, run a command of the
+following form:
+
+.. prompt:: bash #
+
+ ceph pg force-backfill {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+
+These commands instruct Ceph to perform recovery or backfill on the specified
+PGs before processing the other PGs. Prioritization does not interrupt current
+backfills or recovery, but places the specified PGs at the top of the queue so
+that they will be acted upon next. If you change your mind or realize that you
+have prioritized the wrong PGs, run one or both of the following commands:
+
+.. prompt:: bash #
+
+ ceph pg cancel-force-recovery {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+ ceph pg cancel-force-backfill {pg-id} [{pg-id #2}] [{pg-id #3} ...]
+
+These commands remove the ``force`` flag from the specified PGs, so that the
+PGs will be processed in their usual order. As in the case of adding the
+``force`` flag, this affects only those PGs that are still queued but does not
+affect PGs currently undergoing recovery.
+
+The ``force`` flag is cleared automatically after recovery or backfill of the
+PGs is complete.
+
+Similarly, to instruct Ceph to prioritize all PGs from a specified pool (that
+is, to perform recovery or backfill on those PGs first), run one or both of the
+following commands:
+
+.. prompt:: bash #
+
+ ceph osd pool force-recovery {pool-name}
+ ceph osd pool force-backfill {pool-name}
+
+These commands can also be cancelled. To revert to the default order, run one
+or both of the following commands:
+
+.. prompt:: bash #
+
+ ceph osd pool cancel-force-recovery {pool-name}
+ ceph osd pool cancel-force-backfill {pool-name}
+
+.. warning:: These commands can break the order of Ceph's internal priority
+ computations, so use them with caution! If you have multiple pools that are
+ currently sharing the same underlying OSDs, and if the data held by certain
+ pools is more important than the data held by other pools, then we recommend
+ that you run a command of the following form to arrange a custom
+ recovery/backfill priority for all pools:
+
+.. prompt:: bash #
+
+ ceph osd pool set {pool-name} recovery_priority {value}
+
+For example, if you have twenty pools, you could make the most important pool
+priority ``20``, and the next most important pool priority ``19``, and so on.
+
+Another option is to set the recovery/backfill priority for only a proper
+subset of pools. In such a scenario, three important pools might (all) be
+assigned priority ``1`` and all other pools would be left without an assigned
+recovery/backfill priority. Another possibility is to select three important
+pools and set their recovery/backfill priorities to ``3``, ``2``, and ``1``
+respectively.
+
+.. important:: Numbers of greater value have higher priority than numbers of
+ lesser value when using ``ceph osd pool set {pool-name} recovery_priority
+ {value}`` to set their recovery/backfill priority. For example, a pool with
+ the recovery/backfill priority ``30`` has a higher priority than a pool with
+ the recovery/backfill priority ``15``.
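+
+As a sketch of the three-pool approach described above (the pool names are
+hypothetical):
+
+.. prompt:: bash #
+
+   ceph osd pool set vm-images recovery_priority 3
+   ceph osd pool set databases recovery_priority 2
+   ceph osd pool set scratch recovery_priority 1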
+
+Reverting Lost RADOS Objects
+============================
+
+If the cluster has lost one or more RADOS objects and you have decided to
+abandon the search for the lost data, you must mark the unfound objects
+``lost``.
+
+If every possible location has been queried and all OSDs are ``up`` and ``in``,
+but certain RADOS objects are still lost, you might have to give up on those
+objects. This situation can arise when rare and unusual combinations of
+failures allow the cluster to learn about writes that were performed before the
+writes themselves were recovered.
+
+The command to mark a RADOS object ``lost`` supports two options: ``revert``
+and ``delete``. The ``revert`` option will either roll back to a previous
+version of the RADOS object (if it is old enough to have a previous version)
+or forget about it entirely (if it is too new to have a previous version). The
+``delete`` option forgets about the object entirely. To mark the "unfound"
+objects ``lost``, run a command of the following form:
+
+
+.. prompt:: bash #
+
+ ceph pg {pg-id} mark_unfound_lost revert|delete
+
+.. important:: Use this feature with caution. It might confuse applications
+ that expect the object(s) to exist.
+
+
+.. toctree::
+ :hidden:
+
+ pg-states
+ pg-concepts
+
+
+.. _Create a Pool: ../pools#createpool
+.. _Mapping PGs to OSDs: ../../../architecture#mapping-pgs-to-osds
+.. _pgcalc: https://old.ceph.com/pgcalc/
diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst
new file mode 100644
index 000000000..dda9e844e
--- /dev/null
+++ b/doc/rados/operations/pools.rst
@@ -0,0 +1,751 @@
+.. _rados_pools:
+
+=======
+ Pools
+=======
+Pools are logical partitions that are used to store objects.
+
+Pools provide:
+
+- **Resilience**: It is possible to set the number of OSDs that are allowed to
+ fail without any data being lost. If your cluster uses replicated pools, the
+ number of OSDs that can fail without data loss is equal to the number of
+ replicas.
+
+ For example: a typical configuration stores an object and two replicas
+ (copies) of each RADOS object (that is: ``size = 3``), but you can configure
+ the number of replicas on a per-pool basis. For `erasure-coded pools
+ <../erasure-code>`_, resilience is defined as the number of coding chunks
+ (for example, ``m = 2`` in the default **erasure code profile**).
+
+- **Placement Groups**: You can set the number of placement groups (PGs) for
+ the pool. In a typical configuration, the target number of PGs is
+ approximately one hundred PGs per OSD. This provides reasonable balancing
+ without consuming excessive computing resources. When setting up multiple
+ pools, be careful to set an appropriate number of PGs for each pool and for
+ the cluster as a whole. Each PG belongs to a specific pool: when multiple
+ pools use the same OSDs, make sure that the **sum** of PG replicas per OSD is
+ in the desired PG-per-OSD target range. To calculate an appropriate number of
+ PGs for your pools, use the `pgcalc`_ tool.
+
+- **CRUSH Rules**: When data is stored in a pool, the placement of the object
+ and its replicas (or chunks, in the case of erasure-coded pools) in your
+ cluster is governed by CRUSH rules. Custom CRUSH rules can be created for a
+ pool if the default rule does not fit your use case.
+
+- **Snapshots**: The command ``ceph osd pool mksnap`` creates a snapshot of a
+ pool.
+
+Pool Names
+==========
+
+Pool names beginning with ``.`` are reserved for use by Ceph's internal
+operations. Do not create or manipulate pools with these names.
+
+
+List Pools
+==========
+
+There are multiple ways to get the list of pools in your cluster.
+
+To list just your cluster's pool names (good for scripting), execute:
+
+.. prompt:: bash $
+
+ ceph osd pool ls
+
+::
+
+ .rgw.root
+ default.rgw.log
+ default.rgw.control
+ default.rgw.meta
+
+To list your cluster's pools with the pool number, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd lspools
+
+::
+
+ 1 .rgw.root
+ 2 default.rgw.log
+ 3 default.rgw.control
+ 4 default.rgw.meta
+
+To list your cluster's pools with additional information, execute:
+
+.. prompt:: bash $
+
+ ceph osd pool ls detail
+
+::
+
+ pool 1 '.rgw.root' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 19 flags hashpspool stripe_width 0 application rgw read_balance_score 4.00
+ pool 2 'default.rgw.log' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 21 flags hashpspool stripe_width 0 application rgw read_balance_score 4.00
+ pool 3 'default.rgw.control' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 23 flags hashpspool stripe_width 0 application rgw read_balance_score 4.00
+ pool 4 'default.rgw.meta' replicated size 3 min_size 1 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 25 flags hashpspool stripe_width 0 pg_autoscale_bias 4 application rgw read_balance_score 4.00
+
+To get even more information, you can execute this command with the ``--format`` (or ``-f``) option and the ``json``, ``json-pretty``, ``xml`` or ``xml-pretty`` value.
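+
+For example, to emit the same details as machine-readable JSON (any of the
+values listed above can be substituted):
+
+.. prompt:: bash $
+
+   ceph osd pool ls detail --format=json-pretty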
+
+.. _createpool:
+
+Creating a Pool
+===============
+
+Before creating a pool, consult `Pool, PG and CRUSH Config Reference`_. Your
+Ceph configuration file contains a setting (namely,
+``osd_pool_default_pg_num``) that determines the default number of PGs.
+However, this setting's default value is NOT appropriate for most systems. In
+most cases, you should override this default value when creating your pool.
+For details on PG numbers, see `setting the number of placement groups`_.
+
+For example, in the Ceph configuration file:
+
+::
+
+    osd_pool_default_pg_num = 128
+    osd_pool_default_pgp_num = 128
+
+.. note:: In Luminous and later releases, each pool must be associated with the
+ application that will be using the pool. For more information, see
+ `Associating a Pool with an Application`_ below.
+
+To create a pool, run one of the following commands:
+
+.. prompt:: bash $
+
+ ceph osd pool create {pool-name} [{pg-num} [{pgp-num}]] [replicated] \
+ [crush-rule-name] [expected-num-objects]
+
+or:
+
+.. prompt:: bash $
+
+ ceph osd pool create {pool-name} [{pg-num} [{pgp-num}]] erasure \
+ [erasure-code-profile] [crush-rule-name] [expected_num_objects] [--autoscale-mode=<on,off,warn>]
+
+For a brief description of the elements of the above commands, consult the
+following:
+
+.. describe:: {pool-name}
+
+ The name of the pool. It must be unique.
+
+ :Type: String
+ :Required: Yes.
+
+.. describe:: {pg-num}
+
+ The total number of PGs in the pool. For details on calculating an
+ appropriate number, see :ref:`placement groups`. The default value ``8`` is
+ NOT suitable for most systems.
+
+ :Type: Integer
+ :Required: Yes.
+ :Default: 8
+
+.. describe:: {pgp-num}
+
+ The total number of PGs for placement purposes. This **should be equal to
+ the total number of PGs**, except briefly while ``pg_num`` is being
+ increased or decreased.
+
+ :Type: Integer
+ :Required: Yes. If no value has been specified in the command, then the default value is used (unless a different value has been set in Ceph configuration).
+ :Default: 8
+
+.. describe:: {replicated|erasure}
+
+ The pool type. This can be either **replicated** (to recover from lost OSDs
+ by keeping multiple copies of the objects) or **erasure** (to achieve a kind
+ of `generalized parity RAID <../erasure-code>`_ capability). The
+ **replicated** pools require more raw storage but can implement all Ceph
+ operations. The **erasure** pools require less raw storage but can perform
+ only some Ceph tasks and may provide decreased performance.
+
+ :Type: String
+ :Required: No.
+ :Default: replicated
+
+.. describe:: [crush-rule-name]
+
+ The name of the CRUSH rule to use for this pool. The specified rule must
+ exist; otherwise the command will fail.
+
+ :Type: String
+ :Required: No.
+ :Default: For **replicated** pools, it is the rule specified by the :confval:`osd_pool_default_crush_rule` configuration variable. This rule must exist. For **erasure** pools, it is the ``erasure-code`` rule if the ``default`` `erasure code profile`_ is used or the ``{pool-name}`` rule if not. This rule will be created implicitly if it doesn't already exist.
+
+.. describe:: [erasure-code-profile=profile]
+
+ For **erasure** pools only. Instructs Ceph to use the specified `erasure
+ code profile`_. This profile must be an existing profile as defined by **osd
+ erasure-code-profile set**.
+
+ :Type: String
+ :Required: No.
+
+.. _erasure code profile: ../erasure-code-profile
+
+.. describe:: --autoscale-mode=<on,off,warn>
+
+   - ``on``: the Ceph cluster will automatically adjust the number of PGs in your pool based on actual usage.
+   - ``warn``: the Ceph cluster will only recommend changes to the number of PGs in your pool based on actual usage, raising a health check instead of applying them.
+   - ``off``: autoscaling is disabled for this pool; refer to :ref:`placement groups` for more information.
+
+ :Type: String
+ :Required: No.
+ :Default: The default behavior is determined by the :confval:`osd_pool_default_pg_autoscale_mode` option.
+
+.. describe:: [expected-num-objects]
+
+ The expected number of RADOS objects for this pool. By setting this value and
+ assigning a negative value to **filestore merge threshold**, you arrange
+ for the PG folder splitting to occur at the time of pool creation and
+ avoid the latency impact that accompanies runtime folder splitting.
+
+ :Type: Integer
+ :Required: No.
+ :Default: 0, no splitting at the time of pool creation.
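+
+Putting these elements together, a minimal sketch (the pool names, PG counts,
+and profile name here are hypothetical): the first command creates a
+replicated pool with an explicit PG count, and the second creates an
+erasure-coded pool that uses an existing profile:
+
+.. prompt:: bash $
+
+   ceph osd pool create mypool 128 128 replicated
+   ceph osd pool create ecpool 32 32 erasure myprofile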
+
+.. _associate-pool-to-application:
+
+Associating a Pool with an Application
+======================================
+
+Pools need to be associated with an application before they can be used. Pools
+that are intended for use with CephFS and pools that are created automatically
+by RGW are associated automatically. Pools that are intended for use with RBD
+should be initialized with the ``rbd`` tool (see `Block Device Commands`_ for
+more information).
+
+For other cases, you can manually associate a free-form application name to a
+pool by running the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool application enable {pool-name} {application-name}
+
+.. note:: CephFS uses the application name ``cephfs``, RBD uses the
+ application name ``rbd``, and RGW uses the application name ``rgw``.
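+
+For example, to associate a hypothetical pool ``mypool`` with the free-form
+application name ``myapp``:
+
+.. prompt:: bash $
+
+   ceph osd pool application enable mypool myapp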
+
+Setting Pool Quotas
+===================
+
+To set pool quotas for the maximum number of bytes and/or the maximum number of
+RADOS objects per pool, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set-quota {pool-name} [max_objects {obj-count}] [max_bytes {bytes}]
+
+For example:
+
+.. prompt:: bash $
+
+ ceph osd pool set-quota data max_objects 10000
+
+To remove a quota, set its value to ``0``.
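+
+For example, to cap the ``data`` pool shown above at 10 GiB (10737418240
+bytes) and then lift that cap again:
+
+.. prompt:: bash $
+
+   ceph osd pool set-quota data max_bytes 10737418240
+   ceph osd pool set-quota data max_bytes 0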
+
+
+Deleting a Pool
+===============
+
+To delete a pool, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool delete {pool-name} [{pool-name} --yes-i-really-really-mean-it]
+
+To remove a pool, you must set the ``mon_allow_pool_delete`` flag to ``true``
+in the monitor's configuration. Otherwise, monitors will refuse to remove
+pools.
+
+For more information, see `Monitor Configuration`_.
+
+.. _Monitor Configuration: ../../configuration/mon-config-ref
+
+If there are custom rules for a pool that is no longer needed, consider
+deleting those rules. To check which CRUSH rule a given pool uses, run a
+command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool get {pool-name} crush_rule
+
+For example, if the custom rule is "123", check all pools to see whether they
+use the rule by running the following command:
+
+.. prompt:: bash $
+
+ ceph osd dump | grep "^pool" | grep "crush_rule 123"
+
+If no pools use this custom rule, then it is safe to delete the rule from the
+cluster.
+
+Similarly, if there are users with permissions restricted to a pool that no
+longer exists, consider deleting those users by running commands of the
+following forms:
+
+.. prompt:: bash $
+
+ ceph auth ls | grep -C 5 {pool-name}
+ ceph auth del {user}
+
+
+Renaming a Pool
+===============
+
+To rename a pool, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool rename {current-pool-name} {new-pool-name}
+
+If you rename a pool for which an authenticated user has per-pool capabilities,
+you must update the user's capabilities ("caps") to refer to the new pool name.
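+
+For example, a minimal sketch of updating a hypothetical user ``client.app``
+whose OSD capabilities name the renamed pool (the capability strings shown
+here are illustrative):
+
+.. prompt:: bash $
+
+   ceph auth get client.app
+   ceph auth caps client.app mon 'allow r' osd 'allow rw pool=new-pool'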
+
+
+Showing Pool Statistics
+=======================
+
+To show a pool's utilization statistics, run the following command:
+
+.. prompt:: bash $
+
+ rados df
+
+To obtain I/O information for a specific pool or for all pools, run a command
+of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool stats [{pool-name}]
+
+
+Making a Snapshot of a Pool
+===========================
+
+To make a snapshot of a pool, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool mksnap {pool-name} {snap-name}
+
+Removing a Snapshot of a Pool
+=============================
+
+To remove a snapshot of a pool, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool rmsnap {pool-name} {snap-name}
+
+.. _setpoolvalues:
+
+Setting Pool Values
+===================
+
+To assign values to a pool's configuration keys, run a command of the following
+form:
+
+.. prompt:: bash $
+
+ ceph osd pool set {pool-name} {key} {value}
+
+You may set values for the following keys:
+
+.. _compression_algorithm:
+
+.. describe:: compression_algorithm
+
+ :Description: Sets the inline compression algorithm used in storing data on the underlying BlueStore back end. This key's setting overrides the global setting :confval:`bluestore_compression_algorithm`.
+ :Type: String
+ :Valid Settings: ``lz4``, ``snappy``, ``zlib``, ``zstd``
+
+.. describe:: compression_mode
+
+ :Description: Sets the policy for the inline compression algorithm used in storing data on the underlying BlueStore back end. This key's setting overrides the global setting :confval:`bluestore_compression_mode`.
+ :Type: String
+ :Valid Settings: ``none``, ``passive``, ``aggressive``, ``force``
+
+.. describe:: compression_min_blob_size
+
+
+ :Description: Sets the minimum size for the compression of chunks: that is, chunks smaller than this are not compressed. This key's setting overrides the following global settings:
+
+ * :confval:`bluestore_compression_min_blob_size`
+ * :confval:`bluestore_compression_min_blob_size_hdd`
+ * :confval:`bluestore_compression_min_blob_size_ssd`
+
+ :Type: Unsigned Integer
+
+
+.. describe:: compression_max_blob_size
+
+ :Description: Sets the maximum size for chunks: that is, chunks larger than this are broken into smaller blobs of this size before compression is performed.
+ :Type: Unsigned Integer
+
+.. _size:
+
+.. describe:: size
+
+ :Description: Sets the number of replicas for objects in the pool. For further details, see `Setting the Number of RADOS Object Replicas`_. Replicated pools only.
+ :Type: Integer
+
+.. _min_size:
+
+.. describe:: min_size
+
+ :Description: Sets the minimum number of replicas required for I/O. For further details, see `Setting the Number of RADOS Object Replicas`_. For erasure-coded pools, this should be set to a value greater than ``k``. If I/O is allowed with only ``k`` shards available, then there is no redundancy and data will be lost in the event of a permanent OSD failure. For more information, see `Erasure Code <../erasure-code>`_.
+ :Type: Integer
+ :Version: ``0.54`` and above
+
+.. _pg_num:
+
+.. describe:: pg_num
+
+ :Description: Sets the total number of placement groups (PGs) for the pool.
+ :Type: Integer
+ :Valid Range: ``0`` to ``mon_max_pool_pg_num``. If set to ``0``, the value of ``osd_pool_default_pg_num`` will be used.
+
+.. _pgp_num:
+
+.. describe:: pgp_num
+
+ :Description: Sets the number of PGs considered when calculating data placement. This value normally equals ``pg_num``; increasing ``pg_num`` without also increasing ``pgp_num`` defers the data migration that the PG split would otherwise trigger.
+ :Type: Integer
+ :Valid Range: Between ``1`` and the current value of ``pg_num``.
+
+.. _crush_rule:
+
+.. describe:: crush_rule
+
+ :Description: Sets the CRUSH rule that Ceph uses to map object placement within the pool.
+ :Type: String
+
+.. _allow_ec_overwrites:
+
+.. describe:: allow_ec_overwrites
+
+ :Description: Determines whether writes to an erasure-coded pool are allowed to update only part of a RADOS object. This allows CephFS and RBD to use an EC (erasure-coded) pool for user data (but not for metadata). For more details, see `Erasure Coding with Overwrites`_.
+ :Type: Boolean
+
+ .. versionadded:: 12.2.0
+
+.. describe:: hashpspool
+
+ :Description: Sets and unsets the HASHPSPOOL flag on a given pool.
+ :Type: Integer
+ :Valid Range: 1 sets flag, 0 unsets flag
+
+.. _nodelete:
+
+.. describe:: nodelete
+
+ :Description: Sets and unsets the NODELETE flag on a given pool.
+ :Type: Integer
+ :Valid Range: 1 sets flag, 0 unsets flag
+
+.. _nopgchange:
+
+.. describe:: nopgchange
+
+ :Description: Sets and unsets the NOPGCHANGE flag on a given pool.
+ :Type: Integer
+ :Valid Range: 1 sets flag, 0 unsets flag
+
+.. _nosizechange:
+
+.. describe:: nosizechange
+
+ :Description: Sets and unsets the NOSIZECHANGE flag on a given pool.
+ :Type: Integer
+ :Valid Range: 1 sets flag, 0 unsets flag
+
+.. _bulk:
+
+.. describe:: bulk
+
+ :Description: Sets and unsets the bulk flag on a given pool.
+ :Type: Boolean
+ :Valid Range: ``true``/``1`` sets flag, ``false``/``0`` unsets flag
+
+.. _write_fadvise_dontneed:
+
+.. describe:: write_fadvise_dontneed
+
+ :Description: Sets and unsets the WRITE_FADVISE_DONTNEED flag on a given pool.
+ :Type: Integer
+ :Valid Range: ``1`` sets flag, ``0`` unsets flag
+
+.. _noscrub:
+
+.. describe:: noscrub
+
+ :Description: Sets and unsets the NOSCRUB flag on a given pool.
+ :Type: Integer
+ :Valid Range: ``1`` sets flag, ``0`` unsets flag
+
+.. _nodeep-scrub:
+
+.. describe:: nodeep-scrub
+
+ :Description: Sets and unsets the NODEEP_SCRUB flag on a given pool.
+ :Type: Integer
+ :Valid Range: ``1`` sets flag, ``0`` unsets flag
+
+.. _target_max_bytes:
+
+.. describe:: target_max_bytes
+
+ :Description: Ceph will begin flushing or evicting objects when the
+ ``max_bytes`` threshold is triggered.
+ :Type: Integer
+ :Example: ``1000000000000`` (1 TB)
+
+.. _target_max_objects:
+
+.. describe:: target_max_objects
+
+ :Description: Ceph will begin flushing or evicting objects when the
+ ``max_objects`` threshold is triggered.
+ :Type: Integer
+ :Example: ``1000000`` (1 million objects)
+
+.. _fast_read:
+
+.. describe:: fast_read
+
+ :Description: For erasure-coded pools, if this flag is turned ``on``, the
+ read request issues "sub reads" to all shards, and then waits
+ until it receives enough shards to decode before it serves
+ the client. If *jerasure* or *isa* erasure plugins are in
+ use, then after the first *K* replies have returned, the
+ client's request is served immediately using the data decoded
+ from these replies. This approach sacrifices resources in
+ exchange for better performance. This flag is supported only
+ for erasure-coded pools.
+ :Type: Boolean
+ :Default: ``0``
+
+.. _scrub_min_interval:
+
+.. describe:: scrub_min_interval
+
+ :Description: Sets the minimum interval (in seconds) for successive scrubs of the pool's PGs when the load is low. If the default value of ``0`` is in effect, then the value of ``osd_scrub_min_interval`` from central config is used.
+
+ :Type: Double
+ :Default: ``0``
+
+.. _scrub_max_interval:
+
+.. describe:: scrub_max_interval
+
+ :Description: Sets the maximum interval (in seconds) for scrubs of the pool's PGs regardless of cluster load. If the value of ``scrub_max_interval`` is ``0``, then the value ``osd_scrub_max_interval`` from central config is used.
+
+ :Type: Double
+ :Default: ``0``
+
+.. _deep_scrub_interval:
+
+.. describe:: deep_scrub_interval
+
+ :Description: Sets the interval (in seconds) for pool “deep” scrubs of the pool's PGs. If the value of ``deep_scrub_interval`` is ``0``, the value ``osd_deep_scrub_interval`` from central config is used.
+
+ :Type: Double
+ :Default: ``0``
+
+.. _recovery_priority:
+
+.. describe:: recovery_priority
+
+ :Description: Setting this value adjusts a pool's computed reservation priority. This value must be in the range ``-10`` to ``10``. Any pool assigned a negative value will be given a lower priority than any new pools, so users are directed to assign negative values to low-priority pools.
+
+ :Type: Integer
+ :Default: ``0``
+
+
+.. _recovery_op_priority:
+
+.. describe:: recovery_op_priority
+
+ :Description: Sets the recovery operation priority for a specific pool's PGs. This overrides the general priority determined by :confval:`osd_recovery_op_priority`.
+
+ :Type: Integer
+ :Default: ``0``
+
+
+Getting Pool Values
+===================
+
+To get a value from a pool's key, run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph osd pool get {pool-name} {key}
+
+
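+For example, to read the replica count of a hypothetical pool named ``mypool``
+(the pool name is illustrative):
+
+.. prompt:: bash $
+
+ ceph osd pool get mypool size
+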
+You may get values from the following keys:
+
+
+``size``
+
+:Description: See size_.
+
+:Type: Integer
+
+
+``min_size``
+
+:Description: See min_size_.
+
+:Type: Integer
+:Version: ``0.54`` and above
+
+
+``pg_num``
+
+:Description: See pg_num_.
+
+:Type: Integer
+
+
+``pgp_num``
+
+:Description: See pgp_num_.
+
+:Type: Integer
+:Valid Range: Equal to or less than ``pg_num``.
+
+
+``crush_rule``
+
+:Description: See crush_rule_.
+
+
+``target_max_bytes``
+
+:Description: See target_max_bytes_.
+
+:Type: Integer
+
+
+``target_max_objects``
+
+:Description: See target_max_objects_.
+
+:Type: Integer
+
+
+``fast_read``
+
+:Description: See fast_read_.
+
+:Type: Boolean
+
+
+``scrub_min_interval``
+
+:Description: See scrub_min_interval_.
+
+:Type: Double
+
+
+``scrub_max_interval``
+
+:Description: See scrub_max_interval_.
+
+:Type: Double
+
+
+``deep_scrub_interval``
+
+:Description: See deep_scrub_interval_.
+
+:Type: Double
+
+
+``allow_ec_overwrites``
+
+:Description: See allow_ec_overwrites_.
+
+:Type: Boolean
+
+
+``recovery_priority``
+
+:Description: See recovery_priority_.
+
+:Type: Integer
+
+
+``recovery_op_priority``
+
+:Description: See recovery_op_priority_.
+
+:Type: Integer
+
+
+Setting the Number of RADOS Object Replicas
+===========================================
+
+To set the number of data replicas on a replicated pool, run a command of the
+following form:
+
+.. prompt:: bash $
+
+ ceph osd pool set {poolname} size {num-replicas}
+
+.. important:: The ``{num-replicas}`` argument includes the primary object
+ itself. For example, if you want there to be two replicas of the object in
+ addition to the original object (for a total of three instances of the
+ object), specify ``3`` by running the following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set data size 3
+
+You may run the above command for each pool.
+
+.. Note:: An object might accept I/Os in degraded mode with fewer than ``pool
+ size`` replicas. To set a minimum number of replicas required for I/O, you
+ should use the ``min_size`` setting. For example, you might run the
+ following command:
+
+.. prompt:: bash $
+
+ ceph osd pool set data min_size 2
+
+This command ensures that no object in the data pool will receive I/O if it has
+fewer than ``min_size`` (in this case, two) replicas.
+
+
+Getting the Number of Object Replicas
+=====================================
+
+To get the number of object replicas, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd dump | grep 'replicated size'
+
+Ceph will list pools and highlight the ``replicated size`` attribute. By
+default, Ceph creates two replicas of an object (a total of three copies, for a
+size of ``3``).
+
+Managing pools that are flagged with ``--bulk``
+===============================================
+
+See :ref:`managing_bulk_flagged_pools`.
+
+
+.. _pgcalc: https://old.ceph.com/pgcalc/
+.. _Pool, PG and CRUSH Config Reference: ../../configuration/pool-pg-config-ref
+.. _Bloom Filter: https://en.wikipedia.org/wiki/Bloom_filter
+.. _setting the number of placement groups: ../placement-groups#set-the-number-of-placement-groups
+.. _Erasure Coding with Overwrites: ../erasure-code#erasure-coding-with-overwrites
+.. _Block Device Commands: ../../../rbd/rados-rbd-cmds/#create-a-block-device-pool
diff --git a/doc/rados/operations/read-balancer.rst b/doc/rados/operations/read-balancer.rst
new file mode 100644
index 000000000..0833e4326
--- /dev/null
+++ b/doc/rados/operations/read-balancer.rst
@@ -0,0 +1,64 @@
+.. _read_balancer:
+
+=======================================
+Operating the Read (Primary) Balancer
+=======================================
+
+You might be wondering: How can I improve performance in my Ceph cluster?
+One important data point you can check is the ``read_balance_score`` on each
+of your replicated pools.
+
+This metric, available via ``ceph osd pool ls detail`` (see :ref:`rados_pools`
+for more details), indicates read performance, or how balanced the primaries are
+for each replicated pool. In most cases, if a ``read_balance_score`` is above 1
+(for instance, 1.5), this means that your pool has unbalanced primaries and that
+you may want to try improving your read performance with the read balancer.
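+
+For example, the scores can be inspected with a command of the following form
+(the ``grep`` filter is merely a convenience for picking the relevant lines
+out of the output):
+
+.. prompt:: bash $
+
+ ceph osd pool ls detail | grep read_balance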
+
+Online Optimization
+===================
+
+At present, there is no online option for the read balancer. However, we plan to add
+the read balancer as an option to the :ref:`balancer` in the next Ceph version
+so it can be enabled to run automatically in the background like the upmap balancer.
+
+Offline Optimization
+====================
+
+Primaries are updated with an offline optimizer that is built into the
+:ref:`osdmaptool`.
+
+#. Grab the latest copy of your osdmap:
+
+ .. prompt:: bash $
+
+ ceph osd getmap -o om
+
+#. Run the optimizer:
+
+ .. prompt:: bash $
+
+ osdmaptool om --read out.txt --read-pool <pool name> [--vstart]
+
+ It is highly recommended that you run the capacity balancer before running
+ the read balancer to ensure optimal results. See :ref:`upmap` for details on
+ how to balance capacity in a cluster.
+
+#. Apply the changes:
+
+ .. prompt:: bash $
+
+ source out.txt
+
+ In the above example, the proposed changes are written to the output file
+ ``out.txt``. The commands in this procedure are normal Ceph CLI commands
+ that can be run in order to apply the changes to the cluster.
+
+ If you are working in a vstart cluster, you may pass the ``--vstart`` parameter
+ as shown above so that the CLI commands are formatted with the ``./bin/`` prefix.
+
+ Note that whenever the number of PGs changes (for instance, when the PG
+ autoscaler [:ref:`pg-autoscaler`] kicks in), you should recheck the scores and
+ rerun the read balancer if needed.
+
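+Putting the steps together, a minimal end-to-end sketch, assuming a
+hypothetical replicated pool named ``rbdpool`` (the pool name is illustrative):
+
+.. prompt:: bash $
+
+ ceph osd getmap -o om
+ osdmaptool om --read out.txt --read-pool rbdpool
+ source out.txt
+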
+To see some details about what the tool is doing, you can pass
+``--debug-osd 10`` to ``osdmaptool``. To see even more details, pass
+``--debug-osd 20`` to ``osdmaptool``.
diff --git a/doc/rados/operations/stretch-mode.rst b/doc/rados/operations/stretch-mode.rst
new file mode 100644
index 000000000..f797b5b91
--- /dev/null
+++ b/doc/rados/operations/stretch-mode.rst
@@ -0,0 +1,262 @@
+.. _stretch_mode:
+
+================
+Stretch Clusters
+================
+
+
+A stretch cluster is a cluster that has servers in geographically separated
+data centers, distributed over a WAN. Stretch clusters have LAN-like high-speed
+and low-latency connections, but limited links. Stretch clusters have a higher
+likelihood of (possibly asymmetric) network splits, and a higher likelihood of
+temporary or complete loss of an entire data center (which can represent
+one-third to one-half of the total cluster).
+
+Ceph is designed with the expectation that all parts of its network and cluster
+will be reliable and that failures will be distributed randomly across the
+CRUSH map. Even if a switch goes down and causes the loss of many OSDs, Ceph is
+designed so that the remaining OSDs and monitors will route around such a loss.
+
+Sometimes this cannot be relied upon. If you have a "stretched-cluster"
+deployment in which much of your cluster is behind a single network component,
+you might need to use **stretch mode** to ensure data integrity.
+
+We will here consider two standard configurations: a configuration with two
+data centers (or, in clouds, two availability zones), and a configuration with
+three data centers (or, in clouds, three availability zones).
+
+In the two-site configuration, Ceph expects each of the sites to hold a copy of
+the data, and Ceph also expects there to be a third site that has a tiebreaker
+monitor. This tiebreaker monitor picks a winner if the network connection fails
+and both data centers remain alive.
+
+The tiebreaker monitor can be a VM. It can also have high latency relative to
+the two main sites.
+
+The standard Ceph configuration is able to survive MANY network failures or
+data-center failures without ever compromising data availability. If enough
+Ceph servers are brought back following a failure, the cluster *will* recover.
+If you lose a data center but are still able to form a quorum of monitors and
+still have all the data available, Ceph will maintain availability. (This
+assumes that the cluster has enough copies to satisfy the pools' ``min_size``
+configuration option, or (failing that) that the cluster has CRUSH rules in
+place that will cause the cluster to re-replicate the data until the
+``min_size`` configuration option has been met.)
+
+Stretch Cluster Issues
+======================
+
+Ceph does not permit the compromise of data integrity and data consistency
+under any circumstances. When service is restored after a network failure or a
+loss of Ceph nodes, Ceph will restore itself to a state of normal functioning
+without operator intervention.
+
+Ceph does not permit the compromise of data integrity or data consistency, but
+there are situations in which *data availability* is compromised. These
+situations can occur even though there are enough servers available to satisfy
+Ceph's consistency and sizing constraints. In some situations, you might
+discover that your cluster does not satisfy those constraints.
+
+The first category of these failures that we will discuss involves inconsistent
+networks -- if there is a netsplit (a disconnection between two servers that
+splits the network into two pieces), Ceph might be unable to mark OSDs ``down``
+and remove them from the acting PG sets. This failure to mark OSDs ``down``
+can occur even though the PG's primary OSD is unable to replicate data (a
+situation that, under normal non-netsplit circumstances, would result in the
+marking of the affected OSDs as ``down`` and their removal from the PG). If this
+happens, Ceph will be unable to satisfy its durability guarantees and
+consequently IO will not be permitted.
+
+The second category of failures that we will discuss involves the situation in
+which the constraints are not sufficient to guarantee the replication of data
+across data centers, though it might seem that the data is correctly replicated
+across data centers. For example, in a scenario in which there are two data
+centers named Data Center A and Data Center B, and the CRUSH rule targets three
+replicas and places a replica in each data center with a ``min_size`` of ``2``,
+the PG might go active with two replicas in Data Center A and zero replicas in
+Data Center B. In a situation of this kind, the loss of Data Center A means
+that the data is lost and Ceph will not be able to operate on it. This
+situation is surprisingly difficult to avoid using only standard CRUSH rules.
+
+
+Stretch Mode
+============
+
+Stretch mode is designed to handle deployments in which you cannot guarantee the
+replication of data across two data centers. This kind of situation can arise
+when the cluster's CRUSH rule specifies that three copies are to be made, but
+then a copy is placed in each data center with a ``min_size`` of 2. Under such
+conditions, a placement group can become active with two copies in the first
+data center and no copies in the second data center.
+
+
+Entering Stretch Mode
+---------------------
+
+To enable stretch mode, you must set the location of each monitor, matching
+your CRUSH map. This procedure shows how to do this.
+
+
+#. Place ``mon.a`` in your first data center:
+
+ .. prompt:: bash $
+
+ ceph mon set_location a datacenter=site1
+
+#. Generate a CRUSH rule that places two copies in each data center.
+ This requires editing the CRUSH map directly:
+
+ .. prompt:: bash $
+
+ ceph osd getcrushmap > crush.map.bin
+ crushtool -d crush.map.bin -o crush.map.txt
+
+#. Edit the ``crush.map.txt`` file to add a new rule. Here there is only one
+ other rule (``id 1``), but you might need to use a different rule ID. We
+ have two data-center buckets named ``site1`` and ``site2``:
+
+ ::
+
+ rule stretch_rule {
+ id 1
+ min_size 1
+ max_size 10
+ type replicated
+ step take site1
+ step chooseleaf firstn 2 type host
+ step emit
+ step take site2
+ step chooseleaf firstn 2 type host
+ step emit
+ }
+
+#. Inject the CRUSH map to make the rule available to the cluster:
+
+ .. prompt:: bash $
+
+ crushtool -c crush.map.txt -o crush2.map.bin
+ ceph osd setcrushmap -i crush2.map.bin
+
+#. Run the monitors in connectivity mode. See `Changing Monitor Elections`_.
+
+#. Command the cluster to enter stretch mode. In this example, ``mon.e`` is the
+ tiebreaker monitor and we are splitting across data centers. The tiebreaker
+ monitor must be assigned a data center that is neither ``site1`` nor
+ ``site2``. For this purpose you can create another data-center bucket named
+ ``site3`` in your CRUSH and place ``mon.e`` there:
+
+ .. prompt:: bash $
+
+ ceph mon set_location e datacenter=site3
+ ceph mon enable_stretch_mode e stretch_rule datacenter
+
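+The procedure above explicitly places only ``mon.a`` and the tiebreaker
+``mon.e``, but every monitor needs a location before the cluster can enter
+stretch mode. A minimal sketch, assuming five monitors named ``a`` through
+``e`` with two monitors at each main site (all names here are illustrative):
+
+.. prompt:: bash $
+
+ ceph mon set_location a datacenter=site1
+ ceph mon set_location b datacenter=site1
+ ceph mon set_location c datacenter=site2
+ ceph mon set_location d datacenter=site2
+ ceph mon set_location e datacenter=site3
+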
+When stretch mode is enabled, PGs will become active only when they peer
+across data centers (or across whichever CRUSH bucket type was specified),
+assuming both are alive. Pools will increase in size from the default ``3`` to
+``4``, and two copies will be expected in each site. OSDs will be allowed to
+connect to monitors only if they are in the same data center as the monitors.
+New monitors will not be allowed to join the cluster if they do not specify a
+location.
+
+If all OSDs and monitors in one of the data centers become inaccessible at once,
+the surviving data center enters a "degraded stretch mode". A warning will be
+issued, the ``min_size`` will be reduced to ``1``, and the cluster will be
+allowed to go active with the data in the single remaining site. The pool size
+does not change, so warnings will be generated that report that the pools are
+too small -- but a special stretch mode flag will prevent the OSDs from
+creating extra copies in the remaining data center. This means that the data
+center will keep only two copies, just as before.
+
+When the missing data center comes back, the cluster will enter a "recovery
+stretch mode". This changes the warning and allows peering, but requires OSDs
+only from the data center that was ``up`` throughout the duration of the
+downtime. When all PGs are in a known state, and are neither degraded nor
+incomplete, the cluster transitions back to regular stretch mode, ends the
+warning, restores ``min_size`` to its original value (``2``), requires both
+sites to peer, and no longer requires the site that was up throughout the
+duration of the downtime when peering (which makes failover to the other site
+possible, if needed).
+
+.. _Changing Monitor elections: ../change-mon-elections
+
+Limitations of Stretch Mode
+===========================
+
+When using stretch mode, OSDs must be located at exactly two sites.
+
+Two monitors should be run in each data center, plus a tiebreaker in a third
+(or in the cloud) for a total of five monitors. While in stretch mode, OSDs
+will connect only to monitors within the data center in which they are located.
+OSDs *DO NOT* connect to the tiebreaker monitor.
+
+Erasure-coded pools cannot be used with stretch mode: attempting to enable
+stretch mode on a cluster that has erasure-coded pools will fail, and
+erasure-coded pools cannot be created while stretch mode is active.
+
+To use stretch mode, you will need to create a CRUSH rule that provides two
+replicas in each data center. Ensure that there are four total replicas: two in
+each data center. If pools exist in the cluster that do not have the default
+``size`` or ``min_size``, Ceph will not enter stretch mode. An example of such
+a CRUSH rule is given above.
+
+Because stretch mode runs with ``min_size`` set to ``1`` while a site is down,
+we recommend enabling stretch mode only when using OSDs on SSDs (including
+NVMe OSDs). Hybrid SSD+HDD or HDD-only OSDs are not recommended, because of
+the long time they take to recover after connectivity between data centers is
+restored; minimizing recovery time reduces the potential for data loss.
+
+In the future, stretch mode might support erasure-coded pools and might support
+deployments that have more than two data centers.
+
+Other commands
+==============
+
+Replacing a failed tiebreaker monitor
+-------------------------------------
+
+Turn on a new monitor and run the following command:
+
+.. prompt:: bash $
+
+ ceph mon set_new_tiebreaker mon.<new_mon_name>
+
+This command protests if the new monitor is in the same location as the
+existing non-tiebreaker monitors. **This command WILL NOT remove the previous
+tiebreaker monitor.** Remove the previous tiebreaker monitor yourself.
+
+Using "--set-crush-location" and not "ceph mon set_location"
+------------------------------------------------------------
+
+If you write your own tooling for deploying Ceph, use the
+``--set-crush-location`` option when booting monitors instead of running ``ceph
+mon set_location``. This option accepts only a single ``bucket=loc`` pair (for
+example, ``ceph-mon --set-crush-location 'datacenter=a'``), and that pair must
+match the bucket type that was specified when running ``enable_stretch_mode``.
+
+Forcing recovery stretch mode
+-----------------------------
+
+When in degraded stretch mode, the cluster will go into "recovery" mode
+automatically when the disconnected data center comes back. If that does not
+happen or you want to enable recovery mode early, run the following command:
+
+.. prompt:: bash $
+
+ ceph osd force_recovery_stretch_mode --yes-i-really-mean-it
+
+Forcing normal stretch mode
+---------------------------
+
+When in recovery mode, the cluster should go back into normal stretch mode when
+the PGs are healthy. If this fails to happen or if you want to force the
+cross-data-center peering early and are willing to risk data downtime (or have
+verified separately that all the PGs can peer, even if they aren't fully
+recovered), run the following command:
+
+.. prompt:: bash $
+
+ ceph osd force_healthy_stretch_mode --yes-i-really-mean-it
+
+This command can be used to remove the ``HEALTH_WARN`` state that recovery
+mode generates.
diff --git a/doc/rados/operations/upmap.rst b/doc/rados/operations/upmap.rst
new file mode 100644
index 000000000..8541680d8
--- /dev/null
+++ b/doc/rados/operations/upmap.rst
@@ -0,0 +1,113 @@
+.. _upmap:
+
+=======================================
+Using pg-upmap
+=======================================
+
+In Luminous v12.2.z and later releases, there is a *pg-upmap* exception table
+in the OSDMap that allows the cluster to explicitly map specific PGs to
+specific OSDs. This allows the cluster to fine-tune the data distribution to,
+in most cases, uniformly distribute PGs across OSDs.
+
+However, there is an important caveat when it comes to this new feature: it
+requires all clients to understand the new *pg-upmap* structure in the OSDMap.
+
+Online Optimization
+===================
+
+Enabling
+--------
+
+In order to use ``pg-upmap``, the cluster cannot have any pre-Luminous clients.
+By default, new clusters enable the *balancer module*, which makes use of
+``pg-upmap``. If you want to use a different balancer or you want to make your
+own custom ``pg-upmap`` entries, you might want to turn off the balancer in
+order to avoid conflict:
+
+.. prompt:: bash $
+
+ ceph balancer off
+
+To allow use of the new feature on an existing cluster, you must restrict the
+cluster to supporting only Luminous (and newer) clients. To do so, run the
+following command:
+
+.. prompt:: bash $
+
+ ceph osd set-require-min-compat-client luminous
+
+This command will fail if any pre-Luminous clients or daemons are connected to
+the monitors. To see which client versions are in use, run the following
+command:
+
+.. prompt:: bash $
+
+ ceph features
+
+Balancer Module
+---------------
+
+The ``balancer`` module for ``ceph-mgr`` will automatically balance the number
+of PGs per OSD. See :ref:`balancer`.
+
+Offline Optimization
+====================
+
+Upmap entries are updated with an offline optimizer that is built into the
+:ref:`osdmaptool`.
+
+#. Grab the latest copy of your osdmap:
+
+ .. prompt:: bash $
+
+ ceph osd getmap -o om
+
+#. Run the optimizer:
+
+ .. prompt:: bash $
+
+ osdmaptool om --upmap out.txt [--upmap-pool <pool>] \
+ [--upmap-max <max-optimizations>] \
+ [--upmap-deviation <max-deviation>] \
+ [--upmap-active]
+
+ It is highly recommended that optimization be done for each pool
+ individually, or for sets of similarly utilized pools. You can specify the
+ ``--upmap-pool`` option multiple times. "Similarly utilized pools" means
+ pools that are mapped to the same devices and that store the same kind of
+ data (for example, RBD image pools are considered to be similarly utilized;
+ an RGW index pool and an RGW data pool are not considered to be similarly
+ utilized).
+
+ The ``max-optimizations`` value determines the maximum number of upmap
+ entries to identify. The default is ``10`` (as is the case with the
+ ``ceph-mgr`` balancer module), but you should use a larger number if you are
+ doing offline optimization. If it cannot find any additional changes to
+ make (that is, if the pool distribution is perfect), it will stop early.
+
+ The ``max-deviation`` value defaults to ``5``. If an OSD's PG count varies
+ from the computed target number by no more than this amount, it will be
+ considered perfect.
+
+ The ``--upmap-active`` option simulates the behavior of the active balancer
+ in upmap mode. It keeps cycling until the OSDs are balanced and reports how
+ many rounds have occurred and how long each round takes. The elapsed time
+ for rounds indicates the CPU load that ``ceph-mgr`` consumes when it computes
+ the next optimization plan.
+
+#. Apply the changes:
+
+ .. prompt:: bash $
+
+ source out.txt
+
+ In the above example, the proposed changes are written to the output file
+ ``out.txt``. The commands in this procedure are normal Ceph CLI commands
+ that can be run in order to apply the changes to the cluster.
+
+The above steps can be repeated as many times as necessary to achieve a perfect
+distribution of PGs for each set of pools.
+
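+Putting the steps together, a minimal end-to-end sketch, assuming a
+hypothetical pool named ``rbdpool`` (the pool name and the tuning values are
+illustrative):
+
+.. prompt:: bash $
+
+ ceph osd getmap -o om
+ osdmaptool om --upmap out.txt --upmap-pool rbdpool \
+   --upmap-max 100 --upmap-deviation 1
+ source out.txt
+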
+To see some (gory) details about what the tool is doing, you can pass
+``--debug-osd 10`` to ``osdmaptool``. To see even more details, pass
+``--debug-crush 10`` to ``osdmaptool``.
diff --git a/doc/rados/operations/user-management.rst b/doc/rados/operations/user-management.rst
new file mode 100644
index 000000000..130c02002
--- /dev/null
+++ b/doc/rados/operations/user-management.rst
@@ -0,0 +1,840 @@
+.. _user-management:
+
+=================
+ User Management
+=================
+
+This document describes :term:`Ceph Client` users, and describes the process by
+which they perform authentication and authorization so that they can access the
+:term:`Ceph Storage Cluster`. Users are either individuals or system actors
+(for example, applications) that use Ceph clients to interact with the Ceph
+Storage Cluster daemons.
+
+.. ditaa::
+ +-----+
+ | {o} |
+ | |
+ +--+--+ /---------\ /---------\
+ | | Ceph | | Ceph |
+ ---+---*----->| |<------------->| |
+ | uses | Clients | | Servers |
+ | \---------/ \---------/
+ /--+--\
+ | |
+ | |
+ actor
+
+
+When Ceph runs with authentication and authorization enabled (both are enabled
+by default), you must specify a user name and a keyring that contains the
+secret key of the specified user (usually these are specified via the command
+line). If you do not specify a user name, Ceph will use ``client.admin`` as the
+default user name. If you do not specify a keyring, Ceph will look for a
+keyring via the ``keyring`` setting in the Ceph configuration. For example, if
+you execute the ``ceph health`` command without specifying a user or a keyring,
+Ceph will assume that the keyring is in ``/etc/ceph/ceph.client.admin.keyring``
+and will attempt to use that keyring. The following illustrates this behavior:
+
+.. prompt:: bash $
+
+ ceph health
+
+Ceph will interpret the command like this:
+
+.. prompt:: bash $
+
+ ceph -n client.admin --keyring=/etc/ceph/ceph.client.admin.keyring health
+
+Alternatively, you may use the ``CEPH_ARGS`` environment variable to avoid
+re-entry of the user name and secret.
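+
+For example, the following is one way to set ``CEPH_ARGS`` (the user name and
+keyring path are illustrative):
+
+.. prompt:: bash $
+
+ export CEPH_ARGS="--id foo --keyring /path/to/keyring"
+ ceph health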
+
+For details on configuring the Ceph Storage Cluster to use authentication, see
+`Cephx Config Reference`_. For details on the architecture of Cephx, see
+`Architecture - High Availability Authentication`_.
+
+Background
+==========
+
+No matter what type of Ceph client is used (for example: Block Device, Object
+Storage, Filesystem, native API), Ceph stores all data as RADOS objects within
+`pools`_. Ceph users must have access to a given pool in order to read and
+write data, and Ceph users must have execute permissions in order to use Ceph's
+administrative commands. The following concepts will help you understand
+Ceph's user management.
+
+.. _rados-ops-user:
+
+User
+----
+
+A user is either an individual or a system actor (for example, an application).
+Creating users allows you to control who (or what) can access your Ceph Storage
+Cluster, its pools, and the data within those pools.
+
+Ceph has the concept of a ``type`` of user. For purposes of user management,
+the type will always be ``client``. Ceph identifies users in a
+period-delimited form that consists of the user type and the user ID: for
+example, ``TYPE.ID``, ``client.admin``, or ``client.user1``. The reason for user typing
+is that the Cephx protocol is used not only by clients but also non-clients,
+such as Ceph Monitors, OSDs, and Metadata Servers. Distinguishing the user type
+helps to distinguish between client users and other users. This distinction
+streamlines access control, user monitoring, and traceability.
+
+Sometimes Ceph's user type might seem confusing, because the Ceph command line
+allows you to specify a user with or without the type, depending upon your
+command line usage. If you specify ``--user`` or ``--id``, you can omit the
+type. For example, ``client.user1`` can be entered simply as ``user1``. On the
+other hand, if you specify ``--name`` or ``-n``, you must supply the type and
+name: for example, ``client.user1``. We recommend using the type and name as a
+best practice wherever possible.
+
+.. note:: A Ceph Storage Cluster user is not the same as a Ceph Object Storage
+ user or a Ceph File System user. The Ceph Object Gateway uses a Ceph Storage
+ Cluster user to communicate between the gateway daemon and the storage
+ cluster, but the Ceph Object Gateway has its own user-management
+ functionality for end users. The Ceph File System uses POSIX semantics, and
+ the user space associated with the Ceph File System is not the same as the
+ user space associated with a Ceph Storage Cluster user.
+
+Authorization (Capabilities)
+----------------------------
+
+Ceph uses the term "capabilities" (caps) to describe the permissions granted to
+an authenticated user to exercise the functionality of the monitors, OSDs, and
+metadata servers. Capabilities can also restrict access to data within a pool,
+a namespace within a pool, or a set of pools based on their application tags.
+A Ceph administrative user specifies the capabilities of a user when creating
+or updating that user.
+
+Capability syntax follows this form::
+
+ {daemon-type} '{cap-spec}[, {cap-spec} ...]'
+
+- **Monitor Caps:** Monitor capabilities include ``r``, ``w``, ``x`` access
+ settings, and can be applied in aggregate from pre-defined profiles with
+ ``profile {name}``. For example::
+
+ mon 'allow {access-spec} [network {network/prefix}]'
+
+ mon 'profile {name}'
+
+ The ``{access-spec}`` syntax is as follows: ::
+
+ * | all | [r][w][x]
+
+ The optional ``{network/prefix}`` is a standard network name and prefix
+ length in CIDR notation (for example, ``10.3.0.0/16``). If
+ ``{network/prefix}`` is present, the monitor capability can be used only by
+ clients that connect from the specified network.
+
+- **OSD Caps:** OSD capabilities include ``r``, ``w``, ``x``, and
+ ``class-read`` and ``class-write`` access settings. OSD capabilities can be
+ applied in aggregate from pre-defined profiles with ``profile {name}``. In
+ addition, OSD capabilities allow for pool and namespace settings. ::
+
+ osd 'allow {access-spec} [{match-spec}] [network {network/prefix}]'
+
+ osd 'profile {name} [pool={pool-name} [namespace={namespace-name}]] [network {network/prefix}]'
+
+ There are two alternative forms of the ``{access-spec}`` syntax: ::
+
+ * | all | [r][w][x] [class-read] [class-write]
+
+ class {class name} [{method name}]
+
+ There are two alternative forms of the optional ``{match-spec}`` syntax::
+
+ pool={pool-name} [namespace={namespace-name}] [object_prefix {prefix}]
+
+ [namespace={namespace-name}] tag {application} {key}={value}
+
+ The optional ``{network/prefix}`` is a standard network name and prefix
+ length in CIDR notation (for example, ``10.3.0.0/16``). If
+ ``{network/prefix}`` is present, the OSD capability can be used only by
+ clients that connect from the specified network.
+
+- **Manager Caps:** Manager (``ceph-mgr``) capabilities include ``r``, ``w``,
+ ``x`` access settings, and can be applied in aggregate from pre-defined
+ profiles with ``profile {name}``. For example::
+
+ mgr 'allow {access-spec} [network {network/prefix}]'
+
+ mgr 'profile {name} [{key1} {match-type} {value1} ...] [network {network/prefix}]'
+
+ Manager capabilities can also be specified for specific commands, for all
+ commands exported by a built-in manager service, or for all commands exported
+ by a specific add-on module. For example::
+
+ mgr 'allow command "{command-prefix}" [with {key1} {match-type} {value1} ...] [network {network/prefix}]'
+
+ mgr 'allow service {service-name} {access-spec} [network {network/prefix}]'
+
+ mgr 'allow module {module-name} [with {key1} {match-type} {value1} ...] {access-spec} [network {network/prefix}]'
+
+ The ``{access-spec}`` syntax is as follows: ::
+
+ * | all | [r][w][x]
+
+ The ``{service-name}`` is one of the following: ::
+
+ mgr | osd | pg | py
+
+ The ``{match-type}`` is one of the following: ::
+
+ = | prefix | regex
+
+- **Metadata Server Caps:** For administrators, use ``allow *``. For all other
+ users (for example, CephFS clients), consult :doc:`/cephfs/client-auth`
+
+.. note:: The Ceph Object Gateway daemon (``radosgw``) is a client of the
+ Ceph Storage Cluster. For this reason, it is not represented as
+ a Ceph Storage Cluster daemon type.
+
+The following entries describe access capabilities.
+
+``allow``
+
+:Description: Precedes access settings for a daemon. Implies ``rw``
+ for MDS only.
+
+
+``r``
+
+:Description: Gives the user read access. Required with monitors to retrieve
+ the CRUSH map.
+
+
+``w``
+
+:Description: Gives the user write access to objects.
+
+
+``x``
+
+:Description: Gives the user the capability to call class methods
+ (that is, both read and write) and to conduct ``auth``
+ operations on monitors.
+
+
+``class-read``
+
+:Description: Gives the user the capability to call class read methods.
+ Subset of ``x``.
+
+
+``class-write``
+
+:Description: Gives the user the capability to call class write methods.
+ Subset of ``x``.
+
+
+``*``, ``all``
+
+:Description: Gives the user read, write, and execute permissions for a
+ particular daemon/pool, as well as the ability to execute
+ admin commands.
+
+
+The following entries describe valid capability profiles:
+
+``profile osd`` (Monitor only)
+
+:Description: Gives a user permissions to connect as an OSD to other OSDs or
+ monitors. Conferred on OSDs in order to enable OSDs to handle replication
+ heartbeat traffic and status reporting.
+
+
+``profile mds`` (Monitor only)
+
+:Description: Gives a user permissions to connect as an MDS to other MDSs or
+ monitors.
+
+
+``profile bootstrap-osd`` (Monitor only)
+
+:Description: Gives a user permissions to bootstrap an OSD. Conferred on
+ deployment tools such as ``ceph-volume`` and ``cephadm``
+ so that they have permissions to add keys when
+ bootstrapping an OSD.
+
+
+``profile bootstrap-mds`` (Monitor only)
+
+:Description: Gives a user permissions to bootstrap a metadata server.
+ Conferred on deployment tools such as ``cephadm``
+ so that they have permissions to add keys when bootstrapping
+ a metadata server.
+
+``profile bootstrap-rbd`` (Monitor only)
+
+:Description: Gives a user permissions to bootstrap an RBD user.
+ Conferred on deployment tools such as ``cephadm``
+ so that they have permissions to add keys when bootstrapping
+ an RBD user.
+
+``profile bootstrap-rbd-mirror`` (Monitor only)
+
+:Description: Gives a user permissions to bootstrap an ``rbd-mirror`` daemon
+ user. Conferred on deployment tools such as ``cephadm`` so that
+ they have permissions to add keys when bootstrapping an
+ ``rbd-mirror`` daemon.
+
+``profile rbd`` (Manager, Monitor, and OSD)
+
+:Description: Gives a user permissions to manipulate RBD images. When used as a
+ Monitor cap, it provides the user with the minimal privileges
+ required by an RBD client application; such privileges include
+ the ability to blocklist other client users. When used as an OSD
+ cap, it provides an RBD client application with read-write access
+ to the specified pool. The Manager cap supports optional ``pool``
+ and ``namespace`` keyword arguments.
+
+``profile rbd-mirror`` (Monitor only)
+
+:Description: Gives a user permissions to manipulate RBD images and retrieve
+ RBD mirroring config-key secrets. It provides the minimal
+ privileges required for the user to manipulate the ``rbd-mirror``
+ daemon.
+
+``profile rbd-read-only`` (Manager and OSD)
+
+:Description: Gives a user read-only permissions to RBD images. The Manager cap
+ supports optional ``pool`` and ``namespace`` keyword arguments.
+
+``profile simple-rados-client`` (Monitor only)
+
+:Description: Gives a user read-only permissions for monitor, OSD, and PG data.
+ Intended for use by direct librados client applications.
+
+``profile simple-rados-client-with-blocklist`` (Monitor only)
+
+:Description: Gives a user read-only permissions for monitor, OSD, and PG data.
+ Intended for use by direct librados client applications. Also
+ includes permissions to add blocklist entries to build
+ high-availability (HA) applications.
+
+``profile fs-client`` (Monitor only)
+
+:Description: Gives a user read-only permissions for monitor, OSD, PG, and MDS
+ data. Intended for CephFS clients.
+
+``profile role-definer`` (Monitor and Auth)
+
+:Description: Gives a user **all** permissions for the auth subsystem, read-only
+ access to monitors, and nothing else. Useful for automation
+ tools. Do not assign this unless you really, **really** know what
+ you're doing, as the security ramifications are substantial and
+ pervasive.
+
+``profile crash`` (Monitor and MGR)
+
+:Description: Gives a user read-only access to monitors. Used in conjunction
+ with the manager ``crash`` module to upload daemon crash
+ dumps into monitor storage for later analysis.
+
+Pool
+----
+
+A pool is a logical partition where users store data.
+In Ceph deployments, it is common to create a pool as a logical partition for
+similar types of data. For example, when deploying Ceph as a back end for
+OpenStack, a typical deployment would have pools for volumes, images, backups
+and virtual machines, and such users as ``client.glance`` and ``client.cinder``.
+
+Application Tags
+----------------
+
+Access may be restricted to specific pools as defined by their application
+metadata. The ``*`` wildcard may be used for the ``key`` argument, the
+``value`` argument, or both. The ``all`` tag is a synonym for ``*``.
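+
+For example, a capability of the following form grants read and write access
+only to pools tagged with the ``cephfs`` application (the tag key and value
+shown are illustrative; ``*`` may stand in for either)::
+
+ osd 'allow rw tag cephfs data=cephfs'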
+
+Namespace
+---------
+
+Objects within a pool can be associated to a namespace: that is, to a logical group of
+objects within the pool. A user's access to a pool can be associated with a
+namespace so that reads and writes by the user can take place only within the
+namespace. Objects written to a namespace within the pool can be accessed only
+by users who have access to the namespace.
+
+.. note:: Namespaces are primarily useful for applications written on top of
+ ``librados``. In such situations, the logical grouping provided by
+ namespaces can obviate the need to create different pools. In Luminous and
+ later releases, Ceph Object Gateway uses namespaces for various metadata
+ objects.
+
+The rationale for namespaces is this: namespaces are less computationally
+expensive than pools, which are a relatively expensive method of segregating
+data sets between different authorized users.
+
+For example, a pool ought to host approximately 100 placement-group replicas
+per OSD, so a cluster with 1000 OSDs has a budget of roughly 100,000
+placement-group replicas. With 3× replication, this corresponds to a total of
+about 33,333 placement groups to be divided among all of the cluster's pools.
+
+By contrast, writing an object to a namespace simply associates the namespace
+to the object name without incurring the computational overhead of a separate
+pool. Instead of creating a separate pool for a user or set of users, you can
+use a namespace.
+
+.. note::
+
+ Namespaces are available only when using ``librados``.
+
+
+Access may be restricted to specific RADOS namespaces by use of the
+``namespace`` capability. Limited globbing of namespaces with the wildcard
+character ``*`` is supported: if the last character of the specified namespace
+is ``*``, then access is granted to any namespace starting with the provided
+argument.
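+
+For example, the following capability restricts a hypothetical user to
+namespaces beginning with ``tenant1`` within a hypothetical pool named
+``mypool`` (both names are illustrative)::
+
+ osd 'allow rw pool=mypool namespace=tenant1*'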
+
+Managing Users
+==============
+
+User management functionality provides Ceph Storage Cluster administrators with
+the ability to create, update, and delete users directly in the Ceph Storage
+Cluster.
+
+When you create or delete users in the Ceph Storage Cluster, you might need to
+distribute keys to clients so that they can be added to keyrings. For details, see `Keyring
+Management`_.
+
+Listing Users
+-------------
+
+To list the users in your cluster, run the following command:
+
+.. prompt:: bash $
+
+ ceph auth ls
+
+Ceph will list all users in your cluster. For example, in a two-node
+cluster, ``ceph auth ls`` will provide an output that resembles the following::
+
+ installed auth entries:
+
+ osd.0
+ key: AQCvCbtToC6MDhAATtuT70Sl+DymPCfDSsyV4w==
+ caps: [mon] allow profile osd
+ caps: [osd] allow *
+ osd.1
+ key: AQC4CbtTCFJBChAAVq5spj0ff4eHZICxIOVZeA==
+ caps: [mon] allow profile osd
+ caps: [osd] allow *
+ client.admin
+ key: AQBHCbtT6APDHhAA5W00cBchwkQjh3dkKsyPjw==
+ caps: [mds] allow
+ caps: [mon] allow *
+ caps: [osd] allow *
+ client.bootstrap-mds
+ key: AQBICbtTOK9uGBAAdbe5zcIGHZL3T/u2g6EBww==
+ caps: [mon] allow profile bootstrap-mds
+ client.bootstrap-osd
+ key: AQBHCbtT4GxqORAADE5u7RkpCN/oo4e5W0uBtw==
+ caps: [mon] allow profile bootstrap-osd
+
+Note that, according to the ``TYPE.ID`` notation for users, ``osd.0`` is a
+user of type ``osd`` with an ID of ``0``, and ``client.admin`` is a user of type
+``client`` with an ID of ``admin`` (that is, the default ``client.admin`` user).
+Note too that each entry has a ``key: <value>`` entry, and also has one or more
+``caps:`` entries.
+
+To save the output of ``ceph auth ls`` to a file, use the ``-o {filename}`` option.
+
+
+Getting a User
+--------------
+
+To retrieve a specific user, key, and capabilities, run the following command:
+
+.. prompt:: bash $
+
+ ceph auth get {TYPE.ID}
+
+For example:
+
+.. prompt:: bash $
+
+ ceph auth get client.admin
+
+To save the output of ``ceph auth get`` to a file, use the ``-o {filename}`` option. Developers may also run the following command:
+
+.. prompt:: bash $
+
+ ceph auth export {TYPE.ID}
+
+The ``auth export`` command is identical to ``auth get``.
+
+.. _rados_ops_adding_a_user:
+
+Adding a User
+-------------
+
+Adding a user creates a user name (that is, ``TYPE.ID``), a secret key, and
+any capabilities specified in the command that creates the user.
+
+A user's key allows the user to authenticate with the Ceph Storage Cluster.
+The user's capabilities authorize the user to read, write, or execute on Ceph
+monitors (``mon``), Ceph OSDs (``osd``) or Ceph Metadata Servers (``mds``).
+
+There are a few ways to add a user:
+
+- ``ceph auth add``: This command is the canonical way to add a user. It
+ will create the user, generate a key, and add any specified capabilities.
+
+- ``ceph auth get-or-create``: This command is often the most convenient way
+ to create a user, because it returns a keyfile format with the user name
+ (in brackets) and the key. If the user already exists, this command
+ simply returns the user name and key in the keyfile format. To save the output to
+ a file, use the ``-o {filename}`` option.
+
+- ``ceph auth get-or-create-key``: This command is a convenient way to create
+ a user and return the user's key and nothing else. This is useful for clients that
+ need only the key (for example, libvirt). If the user already exists, this command
+ simply returns the key. To save the output to
+ a file, use the ``-o {filename}`` option.
+
+It is possible, when creating client users, to create a user with no capabilities. A user
+with no capabilities is useless beyond mere authentication, because the client
+cannot retrieve the cluster map from the monitor. However, you might want to create a user
+with no capabilities and wait until later to add capabilities to the user by using the ``ceph auth caps`` command.
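+
+For example, a minimal sketch of this two-step approach, using an illustrative
+user name and pool name:
+
+.. prompt:: bash $
+
+ ceph auth get-or-create client.newuser
+ ceph auth caps client.newuser mon 'allow r' osd 'allow rw pool=mypool'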
+
+A typical user has at least read capabilities on the Ceph monitor and
+read and write capabilities on Ceph OSDs. A user's OSD permissions
+are often restricted so that the user can access only one particular pool.
+In the following example, the commands (1) add the user ``client.john`` with
+read capabilities on the Ceph monitor and read and write capabilities on the
+pool named ``liverpool``, (2) do the same for the user ``client.paul``, (3) do
+the same for the user ``client.george`` while writing the result to the
+keyring file ``george.keyring``, and (4) do the same for the user
+``client.ringo`` while writing only the key to the file ``ringo.key``:
+
+.. prompt:: bash $
+
+ ceph auth add client.john mon 'allow r' osd 'allow rw pool=liverpool'
+ ceph auth get-or-create client.paul mon 'allow r' osd 'allow rw pool=liverpool'
+ ceph auth get-or-create client.george mon 'allow r' osd 'allow rw pool=liverpool' -o george.keyring
+ ceph auth get-or-create-key client.ringo mon 'allow r' osd 'allow rw pool=liverpool' -o ringo.key
+
+.. important:: Any user that has capabilities on OSDs will have access to ALL pools in the cluster
+ unless that user's access has been restricted to a proper subset of the pools in the cluster.
+
+
+.. _modify-user-capabilities:
+
+Modifying User Capabilities
+---------------------------
+
+The ``ceph auth caps`` command allows you to specify a user and change that
+user's capabilities. Setting new capabilities will overwrite current capabilities.
+To view current capabilities, run ``ceph auth get USERTYPE.USERID``.
+To add capabilities, run a command of the following form (and be sure to specify the existing capabilities):
+
+.. prompt:: bash $
+
+ ceph auth caps USERTYPE.USERID {daemon} 'allow [r|w|x|*|...] [pool={pool-name}] [namespace={namespace-name}]' [{daemon} 'allow [r|w|x|*|...] [pool={pool-name}] [namespace={namespace-name}]']
+
+For example:
+
+.. prompt:: bash $
+
+ ceph auth get client.john
+ ceph auth caps client.john mon 'allow r' osd 'allow rw pool=liverpool'
+ ceph auth caps client.paul mon 'allow rw' osd 'allow rwx pool=liverpool'
+ ceph auth caps client.brian-manager mon 'allow *' osd 'allow *'
+
+For additional details on capabilities, see `Authorization (Capabilities)`_.
+
+Deleting a User
+---------------
+
+To delete a user, use ``ceph auth del``:
+
+.. prompt:: bash $
+
+ ceph auth del {TYPE}.{ID}
+
+Here ``{TYPE}`` is either ``client``, ``osd``, ``mon``, or ``mds``,
+and ``{ID}`` is the user name or the ID of the daemon.
+
+
+Printing a User's Key
+---------------------
+
+To print a user's authentication key to standard output, run the following command:
+
+.. prompt:: bash $
+
+ ceph auth print-key {TYPE}.{ID}
+
+Here ``{TYPE}`` is either ``client``, ``osd``, ``mon``, or ``mds``,
+and ``{ID}`` is the user name or the ID of the daemon.
+
+When it is necessary to populate client software with a user's key (as in the
+case of libvirt), you can embed the output of ``ceph auth print-key`` inline,
+as in the following example:
+
+.. prompt:: bash $
+
+ mount -t ceph serverhost:/ mountpoint -o name=client.user,secret=`ceph auth print-key client.user`
+
+Importing a User
+----------------
+
+To import one or more users, use ``ceph auth import`` and
+specify a keyring as follows:
+
+.. prompt:: bash $
+
+ ceph auth import -i /path/to/keyring
+
+For example:
+
+.. prompt:: bash $
+
+ sudo ceph auth import -i /etc/ceph/ceph.keyring
+
+.. note:: The Ceph storage cluster will add new users, their keys, and their
+ capabilities and will update existing users, their keys, and their
+ capabilities.
+
+Keyring Management
+==================
+
+When you access Ceph via a Ceph client, the Ceph client will look for a local
+keyring. Ceph presets the ``keyring`` setting with four keyring
+names by default. For this reason, you do not have to set the keyring names in your Ceph configuration file
+unless you want to override these defaults (which is not recommended). The four default keyring names are as follows:
+
+- ``/etc/ceph/$cluster.$name.keyring``
+- ``/etc/ceph/$cluster.keyring``
+- ``/etc/ceph/keyring``
+- ``/etc/ceph/keyring.bin``
+
+The ``$cluster`` metavariable found in the first two default keyring names above
+is your Ceph cluster name as defined by the name of the Ceph configuration
+file: for example, if the Ceph configuration file is named ``ceph.conf``,
+then your Ceph cluster name is ``ceph`` and the second name above would be
+``ceph.keyring``. The ``$name`` metavariable is the user type and user ID:
+for example, given the user ``client.admin``, the first name above would be
+``ceph.client.admin.keyring``.
+
+.. note:: When running commands that read or write to ``/etc/ceph``, you might
+ need to use ``sudo`` to run the command as ``root``.
+
+After you create a user (for example, ``client.ringo``), you must get the key and add
+it to a keyring on a Ceph client so that the user can access the Ceph Storage
+Cluster.
+
+The `User Management`_ section details how to list, get, add, modify, and delete
+users directly in the Ceph Storage Cluster. In addition, Ceph provides the
+``ceph-authtool`` utility to allow you to manage keyrings from a Ceph client.
+
+Creating a Keyring
+------------------
+
+When you use the procedures in the `Managing Users`_ section to create users,
+you must provide user keys to the Ceph client(s). This is required so that the Ceph client(s)
+can retrieve the key for the specified user and authenticate that user against the Ceph
+Storage Cluster. Ceph clients access keyrings in order to look up a user name and
+retrieve the user's key.
+
+The ``ceph-authtool`` utility allows you to create a keyring. To create an
+empty keyring, use ``--create-keyring`` or ``-C``. For example:
+
+.. prompt:: bash $
+
+ ceph-authtool --create-keyring /path/to/keyring
+
+When creating a keyring with multiple users, we recommend using the cluster name
+(of the form ``$cluster.keyring``) for the keyring filename and saving the keyring in the
+``/etc/ceph`` directory. By doing this, you ensure that the ``keyring`` configuration default setting
+will pick up the filename without requiring you to specify the filename in the local copy
+of your Ceph configuration file. For example, you can create ``ceph.keyring`` by
+running the following command:
+
+.. prompt:: bash $
+
+ sudo ceph-authtool -C /etc/ceph/ceph.keyring
+
+When creating a keyring with a single user, we recommend using the cluster name,
+the user type, and the user name, and saving the keyring in the ``/etc/ceph`` directory.
+For example, we recommend that the ``client.admin`` user use ``ceph.client.admin.keyring``.
+
+To create a keyring in ``/etc/ceph``, you must do so as ``root``. This means
+that the file will have ``rw`` permissions for the ``root`` user only, which is
+appropriate when the keyring contains administrator keys. However, if you
+intend to use the keyring for a particular user or group of users, be sure to use ``chown`` or ``chmod`` to establish appropriate keyring
+ownership and access.
+
+Adding a User to a Keyring
+--------------------------
+
+When you :ref:`Add a user<rados_ops_adding_a_user>` to the Ceph Storage
+Cluster, you can use the `Getting a User`_ procedure to retrieve a user, key,
+and capabilities and then save the user to a keyring.
+
+If you want to use only one user per keyring, the `Getting a User`_ procedure with
+the ``-o`` option will save the output in the keyring file format. For example,
+to create a keyring for the ``client.admin`` user, run the following command:
+
+.. prompt:: bash $
+
+ sudo ceph auth get client.admin -o /etc/ceph/ceph.client.admin.keyring
+
+Note that this command saves the output in the file format conventionally used when manipulating the keyrings of individual users.
+
+If you want to import users to a keyring, you can use ``ceph-authtool``
+to specify the destination keyring and the source keyring.
+For example:
+
+.. prompt:: bash $
+
+ sudo ceph-authtool /etc/ceph/ceph.keyring --import-keyring /etc/ceph/ceph.client.admin.keyring
+
+Creating a User
+---------------
+
+Ceph provides the `Adding a User`_ function to create a user directly in the Ceph
+Storage Cluster. However, you can also create a user, keys, and capabilities
+directly on a Ceph client keyring, and then import the user to the Ceph
+Storage Cluster. For example:
+
+.. prompt:: bash $
+
+ sudo ceph-authtool -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' /etc/ceph/ceph.keyring
+
+For additional details on capabilities, see `Authorization (Capabilities)`_.
+
+You can also create a keyring and add a new user to the keyring simultaneously.
+For example:
+
+.. prompt:: bash $
+
+ sudo ceph-authtool -C /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx' --gen-key
+
+In the above examples, the new user ``client.ringo`` has been added only to the
+keyring. The new user has not been added to the Ceph Storage Cluster.
+
+To add the new user ``client.ringo`` to the Ceph Storage Cluster, run the following command:
+
+.. prompt:: bash $
+
+ sudo ceph auth add client.ringo -i /etc/ceph/ceph.keyring
+
+Modifying a User
+----------------
+
+To modify the capabilities of a user record in a keyring, specify the keyring
+and the user, followed by the capabilities. For example:
+
+.. prompt:: bash $
+
+ sudo ceph-authtool /etc/ceph/ceph.keyring -n client.ringo --cap osd 'allow rwx' --cap mon 'allow rwx'
+
+To update the user in the Ceph Storage Cluster, you must import the updated
+user from the keyring into the user entry in the Ceph Storage Cluster. To do
+so, run the following command:
+
+.. prompt:: bash $
+
+ sudo ceph auth import -i /etc/ceph/ceph.keyring
+
+For details on updating a Ceph Storage Cluster user from a
+keyring, see `Importing a User`_.
+
+You may also :ref:`Modify user capabilities<modify-user-capabilities>` directly in the cluster, store the
+results to a keyring file, and then import the keyring into your main
+``ceph.keyring`` file.
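+
+One possible sketch of this workflow for the ``client.ringo`` user (the
+capabilities and the temporary path here are only illustrative):
+
+.. prompt:: bash $
+
+   sudo ceph auth caps client.ringo mon 'allow r' osd 'allow rw pool=liverpool'
+   sudo ceph auth get client.ringo -o /tmp/ringo.keyring
+   sudo ceph-authtool /etc/ceph/ceph.keyring --import-keyring /tmp/ringo.keyring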
+
+Command Line Usage
+==================
+
+Ceph supports the following options for specifying a user name and secret:
+
+``--id`` | ``--user``
+
+:Description: Ceph identifies users with a type and an ID: the form of this
+              user identification is ``TYPE.ID``, where examples of the type
+              and ID are ``client.admin`` and ``client.user1``. The ``--id``
+              and ``--user`` options allow you to specify only the ID portion
+              of the user name (for example, ``admin``, ``user1``, ``foo``)
+              and omit the type. For example, to specify the user
+              ``client.foo``, run the following commands:
+
+              .. prompt:: bash $
+
+                 ceph --id foo --keyring /path/to/keyring health
+                 ceph --user foo --keyring /path/to/keyring health
+
+
+``--name`` | ``-n``
+
+:Description: Ceph identifies users with a type and an ID: the form of this
+              user identification is ``TYPE.ID``, where examples of the type
+              and ID are ``client.admin`` and ``client.user1``. The ``--name``
+              and ``-n`` options allow you to specify the fully qualified user
+              name; you must specify the user type (typically ``client``)
+              together with the user ID. For example:
+
+              .. prompt:: bash $
+
+                 ceph --name client.foo --keyring /path/to/keyring health
+                 ceph -n client.foo --keyring /path/to/keyring health
+
+
+``--keyring``
+
+:Description: The path to the keyring that contains one or more user names and
+ secrets. The ``--secret`` option provides the same functionality,
+ but it does not work with Ceph RADOS Gateway, which uses
+ ``--secret`` for another purpose. You may retrieve a keyring with
+ ``ceph auth get-or-create`` and store it locally. This is a
+ preferred approach, because you can switch user names without
+ switching the keyring path. For example:
+
+ .. prompt:: bash $
+
+ sudo rbd map --id foo --keyring /path/to/keyring mypool/myimage
+
+
+.. _pools: ../pools
+
+Limitations
+===========
+
+The ``cephx`` protocol authenticates Ceph clients and servers to each other. It
+is not intended to handle authentication of human users or application programs
+that are run on their behalf. If your access control
+needs require that kind of authentication, you will need to have some other mechanism, which is likely to be specific to the
+front end that is used to access the Ceph object store. This other mechanism would ensure that only acceptable users and programs are able to run on the
+machine that Ceph permits to access its object store.
+
+The keys used to authenticate Ceph clients and servers are typically stored in
+a plain text file on a trusted host. Appropriate permissions must be set on the plain text file.
+
+.. important:: Storing keys in plaintext files has security shortcomings, but
+ they are difficult to avoid, given the basic authentication methods Ceph
+ uses in the background. Anyone setting up Ceph systems should be aware of
+ these shortcomings.
+
+In particular, user machines, especially portable machines, should not
+be configured to interact directly with Ceph, since that mode of use would
+require the storage of a plaintext authentication key on an insecure machine.
+Anyone who stole that machine or obtained access to it could
+obtain a key that allows them to authenticate their own machines to Ceph.
+
+Instead of permitting potentially insecure machines to access a Ceph object
+store directly, you should require users to sign in to a trusted machine in
+your environment, using a method that provides sufficient security for your
+purposes. That trusted machine will store the plaintext Ceph keys for the
+human users. A future version of Ceph might address these particular
+authentication issues more fully.
+
+At present, none of the Ceph authentication protocols provide secrecy for
+messages in transit. As a result, an eavesdropper on the wire can hear and understand
+all data sent between clients and servers in Ceph, even if the eavesdropper cannot create or
+alter the data. Similarly, Ceph does not include options to encrypt user data in the
+object store. Users can, of course, hand-encrypt and store their own data in the Ceph
+object store, but Ceph itself provides no features to perform object
+encryption. Anyone storing sensitive data in Ceph should consider
+encrypting their data before providing it to the Ceph system.
+
+
+.. _Architecture - High Availability Authentication: ../../../architecture#high-availability-authentication
+.. _Cephx Config Reference: ../../configuration/auth-config-ref
diff --git a/doc/rados/troubleshooting/community.rst b/doc/rados/troubleshooting/community.rst
new file mode 100644
index 000000000..c0d7be10c
--- /dev/null
+++ b/doc/rados/troubleshooting/community.rst
@@ -0,0 +1,37 @@
+====================
+ The Ceph Community
+====================
+
+Ceph-users email list
+=====================
+
+The Ceph community is an excellent source of information and help. For
+operational issues with Ceph we recommend that you `subscribe to the ceph-users
+email list`_. When you no longer want to receive emails, you can `unsubscribe
+from the ceph-users email list`_.
+
+Ceph-devel email list
+=====================
+
+You can also `subscribe to the ceph-devel email list`_. You should do so if
+your issue is:
+
+- Likely related to a bug
+- Related to a development release package
+- Related to a development testing package
+- Related to your own builds
+
+If you no longer want to receive emails from the ``ceph-devel`` email list, you
+can `unsubscribe from the ceph-devel email list`_.
+
+Ceph report
+===========
+
+.. tip:: Community members can help you if you provide them with detailed
+ information about your problem. Attach the output of the ``ceph report``
+ command to help people understand your issues.
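+
+For example, a sketch of capturing the report to a file that you can attach
+(the filename is only illustrative):
+
+.. prompt:: bash $
+
+   ceph report > ceph-report.json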
+
+.. _subscribe to the ceph-devel email list: mailto:dev-join@ceph.io
+.. _unsubscribe from the ceph-devel email list: mailto:dev-leave@ceph.io
+.. _subscribe to the ceph-users email list: mailto:ceph-users-join@ceph.io
+.. _unsubscribe from the ceph-users email list: mailto:ceph-users-leave@ceph.io
diff --git a/doc/rados/troubleshooting/cpu-profiling.rst b/doc/rados/troubleshooting/cpu-profiling.rst
new file mode 100644
index 000000000..b7fdd1d41
--- /dev/null
+++ b/doc/rados/troubleshooting/cpu-profiling.rst
@@ -0,0 +1,80 @@
+===============
+ CPU Profiling
+===============
+
+If you built Ceph from source and compiled Ceph for use with `oprofile`_,
+you can profile Ceph's CPU usage. See `Installing Oprofile`_ for details.
+
+
+Initializing oprofile
+=====================
+
+``oprofile`` must be initialized the first time it is used. Locate the
+``vmlinux`` image that corresponds to the kernel you are running:
+
+.. prompt:: bash $
+
+ ls /boot
+ sudo opcontrol --init
+ sudo opcontrol --setup --vmlinux={path-to-image} --separate=library --callgraph=6
+
+
+Starting oprofile
+=================
+
+Run the following command to start ``oprofile``:
+
+.. prompt:: bash $
+
+   sudo opcontrol --start
+
+
+Stopping oprofile
+=================
+
+Run the following command to stop ``oprofile``:
+
+.. prompt:: bash $
+
+   sudo opcontrol --stop
+
+
+Retrieving oprofile Results
+===========================
+
+Run the following command to retrieve the top ``ceph-mon`` results:
+
+.. prompt:: bash $
+
+   opreport -gal ./ceph-mon | less
+
+
+Run the following command to retrieve the top ``ceph-mon`` results, with call
+graphs attached:
+
+.. prompt:: bash $
+
+   opreport -cal ./ceph-mon | less
+
+.. important:: After you have reviewed the results, reset ``oprofile`` before
+ running it again. The act of resetting ``oprofile`` removes data from the
+ session directory.
+
+
+Resetting oprofile
+==================
+
+Run the following command to reset ``oprofile``:
+
+.. prompt:: bash $
+
+ sudo opcontrol --reset
+
+.. important:: Reset ``oprofile`` after analyzing data. This ensures that
+ results from prior tests do not get mixed in with the results of the current
+ test.
+
+.. _oprofile: http://oprofile.sourceforge.net/about/
+.. _Installing Oprofile: ../../../dev/cpu-profiler
+
+
diff --git a/doc/rados/troubleshooting/index.rst b/doc/rados/troubleshooting/index.rst
new file mode 100644
index 000000000..b481ee1dc
--- /dev/null
+++ b/doc/rados/troubleshooting/index.rst
@@ -0,0 +1,19 @@
+=================
+ Troubleshooting
+=================
+
+You may encounter situations that require you to examine your configuration,
+consult the documentation, modify your logging output, troubleshoot monitors
+and OSDs, profile memory and CPU usage, and, in the last resort, reach out to
+the Ceph community for help.
+
+.. toctree::
+ :maxdepth: 1
+
+ community
+ log-and-debug
+ troubleshooting-mon
+ troubleshooting-osd
+ troubleshooting-pg
+ memory-profiling
+ cpu-profiling
diff --git a/doc/rados/troubleshooting/log-and-debug.rst b/doc/rados/troubleshooting/log-and-debug.rst
new file mode 100644
index 000000000..929c3f53f
--- /dev/null
+++ b/doc/rados/troubleshooting/log-and-debug.rst
@@ -0,0 +1,430 @@
+=======================
+ Logging and Debugging
+=======================
+
+Ceph component debug log levels can be adjusted at runtime, while services are
+running. In some circumstances you might want to adjust debug log levels in
+``ceph.conf`` or in the central config store. Increased debug logging can be
+useful if you are encountering issues when operating your cluster. By default,
+Ceph log files are in ``/var/log/ceph``.
+
+.. tip:: Remember that debug output can slow down your system, and that this
+ latency sometimes hides race conditions.
+
+Debug logging is resource intensive. If you encounter a problem in a specific
+component of your cluster, begin troubleshooting by enabling logging for only
+that component of the cluster. For example, if your OSDs are running without
+errors, but your metadata servers are not, enable logging for any specific
+metadata server instances that are having problems. Continue by enabling
+logging for each subsystem only as needed.
+
+.. important:: Verbose logging sometimes generates over 1 GB of data per hour.
+ If the disk that your operating system runs on (your "OS disk") reaches its
+ capacity, the node associated with that disk will stop working.
+
+Whenever you enable or increase the rate of debug logging, make sure that you
+have ample capacity for log files, as this may dramatically increase their
+size. For details on rotating log files, see `Accelerating Log Rotation`_.
+When your system is running well again, remove unnecessary debugging settings
+in order to ensure that your cluster runs optimally. Logging debug-output
+messages is a slow process and a potential waste of your cluster's resources.
+
+For details on available settings, see `Subsystem, Log and Debug Settings`_.
+
+Runtime
+=======
+
+To see the configuration settings at runtime, log in to a host that has a
+running daemon and run a command of the following form:
+
+.. prompt:: bash $
+
+ ceph daemon {daemon-name} config show | less
+
+For example:
+
+.. prompt:: bash $
+
+ ceph daemon osd.0 config show | less
+
+To activate Ceph's debugging output (that is, the ``dout()`` logging function)
+at runtime, inject arguments into the runtime configuration by running a ``ceph
+tell`` command of the following form:
+
+.. prompt:: bash $
+
+ ceph tell {daemon-type}.{daemon id or *} config set {name} {value}
+
+Here ``{daemon-type}`` is ``osd``, ``mon``, or ``mds``. Apply the runtime
+setting either to a specific daemon (by specifying its ID) or to all daemons of
+a particular type (by using the ``*`` operator). For example, to increase
+debug logging for a specific ``ceph-osd`` daemon named ``osd.0``, run the
+following command:
+
+.. prompt:: bash $
+
+   ceph tell osd.0 config set debug_osd 20/20
+
+The ``ceph tell`` command goes through the monitors. However, if you are
+unable to connect to the monitor, there is another method that can be used to
+activate Ceph's debugging output: use the ``ceph daemon`` command to log in to
+the host of a specific daemon and change the daemon's configuration. For
+example:
+
+.. prompt:: bash $
+
+   sudo ceph daemon osd.0 config set debug_osd 20/20
+
+For details on available settings, see `Subsystem, Log and Debug Settings`_.
+
+
+Boot Time
+=========
+
+To activate Ceph's debugging output (that is, the ``dout()`` logging function)
+at boot time, you must add settings to your Ceph configuration file.
+Subsystems that are common to all daemons are set under ``[global]`` in the
+configuration file. Subsystems for a specific daemon are set under the relevant
+daemon section in the configuration file (for example, ``[mon]``, ``[osd]``,
+``[mds]``). Here is an example that shows possible debugging settings in a Ceph
+configuration file:
+
+.. code-block:: ini
+
+ [global]
+ debug_ms = 1/5
+
+ [mon]
+ debug_mon = 20
+ debug_paxos = 1/5
+ debug_auth = 2
+
+ [osd]
+ debug_osd = 1/5
+ debug_filestore = 1/5
+ debug_journal = 1
+ debug_monc = 5/20
+
+ [mds]
+ debug_mds = 1
+ debug_mds_balancer = 1
+
+
+For details, see `Subsystem, Log and Debug Settings`_.
+
+
+Accelerating Log Rotation
+=========================
+
+If your log filesystem is nearly full, you can accelerate log rotation by
+modifying the Ceph log rotation file at ``/etc/logrotate.d/ceph``. To increase
+the frequency of log rotation (which will guard against a filesystem reaching
+capacity), add a ``size`` directive after the ``weekly`` frequency directive.
+To smooth out volume spikes, consider changing ``weekly`` to ``daily`` and
+``rotate`` to ``30``. The procedure for adding the size setting is shown
+immediately below.
+
+#. Note the default settings of the ``/etc/logrotate.d/ceph`` file::
+
+ rotate 7
+ weekly
+ compress
+ sharedscripts
+
+#. Modify them by adding a ``size`` setting::
+
+ rotate 7
+ weekly
+ size 500M
+ compress
+ sharedscripts
+
+#. Start the crontab editor for your user space:
+
+ .. prompt:: bash $
+
+ crontab -e
+
+#. Add an entry to crontab that instructs cron to check the
+   ``/etc/logrotate.d/ceph`` file::
+
+       30 * * * * /usr/sbin/logrotate /etc/logrotate.d/ceph >/dev/null 2>&1
+
+In this example, the ``/etc/logrotate.d/ceph`` file will be checked every 30
+minutes.
+
+Valgrind
+========
+
+When you are debugging your cluster's performance, you might find it necessary
+to track down memory and threading issues. The Valgrind tool suite can be used
+to detect problems in a specific daemon, in a particular type of daemon, or in
+the entire cluster. Because Valgrind is computationally expensive, it should be
+used only when developing or debugging Ceph, and it will slow down your system
+if used at other times. Valgrind messages are logged to ``stderr``.
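+
+As an illustration only (the executable path, daemon ID, and options here are
+placeholders to adapt to your deployment), a single OSD might be run in the
+foreground under Valgrind's ``memcheck`` tool as follows:
+
+.. prompt:: bash $
+
+   sudo -u ceph valgrind --tool=memcheck /usr/bin/ceph-osd -f --cluster ceph --id 0 --setuser ceph --setgroup ceph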
+
+
+Subsystem, Log and Debug Settings
+=================================
+
+Debug logging output is typically enabled via subsystems.
+
+Ceph Subsystems
+---------------
+
+For each subsystem, there is a logging level for its output logs (a so-called
+"log level") and a logging level for its in-memory logs (a so-called "memory
+level"). Different values may be set for these two logging levels in each
+subsystem. Ceph's logging levels operate on a scale of ``0`` to ``20``, where
+``0`` is terse and ``20`` is verbose [#f1]_. As a general rule, the in-memory
+logs are not sent to the output log unless one or more of the following
+conditions obtain:
+
+- a fatal signal is raised,
+- an ``assert`` in source code is triggered, or
+- a dump of the in-memory logs is requested, for example via the admin socket.
+  For more details, consult the `admin socket documentation
+  <http://docs.ceph.com/en/latest/man/8/ceph/#daemon>`_.
+
+.. warning ::
+ .. [#f1] In certain rare cases, there are logging levels that can take a value greater than 20. The resulting logs are extremely verbose.
+
+Log levels and memory levels can be set either together or separately. If a
+subsystem is assigned a single value, then that value determines both the log
+level and the memory level. For example, ``debug ms = 5`` will give the ``ms``
+subsystem a log level of ``5`` and a memory level of ``5``. On the other hand,
+if a subsystem is assigned two values that are separated by a forward slash
+(/), then the first value determines the log level and the second value
+determines the memory level. For example, ``debug ms = 1/5`` will give the
+``ms`` subsystem a log level of ``1`` and a memory level of ``5``. See the
+following:
+
+.. code-block:: ini
+
+ debug {subsystem} = {log-level}/{memory-level}
+ #for example
+ debug mds balancer = 1/20
+
+The following table provides a list of Ceph subsystems and their default log and
+memory levels. Once you complete your logging efforts, restore the subsystems
+to their default level or to a level suitable for normal operations.
+
++--------------------------+-----------+--------------+
+| Subsystem | Log Level | Memory Level |
++==========================+===========+==============+
+| ``default`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``lockdep`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``context`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``crush`` | 1 | 1 |
++--------------------------+-----------+--------------+
+| ``mds`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``mds balancer`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``mds log`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``mds log expire`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``mds migrator`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``buffer`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``timer`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``filer`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``striper`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``objecter`` | 0 | 1 |
++--------------------------+-----------+--------------+
+| ``rados`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``rbd`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``rbd mirror`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``rbd replay`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``rbd pwl`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``journaler`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``objectcacher`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``immutable obj cache`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``client`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``osd`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``optracker`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``objclass`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``filestore`` | 1 | 3 |
++--------------------------+-----------+--------------+
+| ``journal`` | 1 | 3 |
++--------------------------+-----------+--------------+
+| ``ms`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``mon`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``monc`` | 0 | 10 |
++--------------------------+-----------+--------------+
+| ``paxos`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``tp`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``auth`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``crypto`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``finisher`` | 1 | 1 |
++--------------------------+-----------+--------------+
+| ``reserver`` | 1 | 1 |
++--------------------------+-----------+--------------+
+| ``heartbeatmap`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``perfcounter`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``rgw`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``rgw sync`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``rgw datacache`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``rgw access`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``rgw dbstore`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``javaclient`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``asok`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``throttle`` | 1 | 1 |
++--------------------------+-----------+--------------+
+| ``refs`` | 0 | 0 |
++--------------------------+-----------+--------------+
+| ``compressor`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``bluestore`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``bluefs`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``bdev`` | 1 | 3 |
++--------------------------+-----------+--------------+
+| ``kstore`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``rocksdb`` | 4 | 5 |
++--------------------------+-----------+--------------+
+| ``leveldb`` | 4 | 5 |
++--------------------------+-----------+--------------+
+| ``fuse`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``mgr`` | 2 | 5 |
++--------------------------+-----------+--------------+
+| ``mgrc`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``dpdk`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``eventtrace`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``prioritycache`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``test`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``cephfs mirror`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``cephsqlite``           | 0         | 5            |
++--------------------------+-----------+--------------+
+| ``seastore`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore onode`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore odata`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore omap``        | 0         | 5            |
++--------------------------+-----------+--------------+
+| ``seastore tm`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore t`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore cleaner`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore epm`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore lba`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore fixedkv tree``| 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore cache`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore journal`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore device`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``seastore backref`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``alienstore`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``mclock`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``cyanstore`` | 0 | 5 |
++--------------------------+-----------+--------------+
+| ``ceph exporter`` | 1 | 5 |
++--------------------------+-----------+--------------+
+| ``memstore`` | 1 | 5 |
++--------------------------+-----------+--------------+
+
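+For example, if you raised ``debug_osd`` in the central config store while
+debugging, a minimal sketch of restoring its default is to remove the
+override:
+
+.. prompt:: bash $
+
+   ceph config rm osd debug_osd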
+
+Logging Settings
+----------------
+
+It is not necessary to specify logging and debugging settings in the Ceph
+configuration file, but you may override default settings when needed. Ceph
+supports the following settings:
+
+.. confval:: log_file
+.. confval:: log_max_new
+.. confval:: log_max_recent
+.. confval:: log_to_file
+.. confval:: log_to_stderr
+.. confval:: err_to_stderr
+.. confval:: log_to_syslog
+.. confval:: err_to_syslog
+.. confval:: log_flush_on_exit
+.. confval:: clog_to_monitors
+.. confval:: clog_to_syslog
+.. confval:: mon_cluster_log_to_syslog
+.. confval:: mon_cluster_log_file
+
+OSD
+---
+
+.. confval:: osd_debug_drop_ping_probability
+.. confval:: osd_debug_drop_ping_duration
+
+Filestore
+---------
+
+.. confval:: filestore_debug_omap_check
+
+MDS
+---
+
+- :confval:`mds_debug_scatterstat`
+- :confval:`mds_debug_frag`
+- :confval:`mds_debug_auth_pins`
+- :confval:`mds_debug_subtrees`
+
+RADOS Gateway
+-------------
+
+- :confval:`rgw_log_nonexistent_bucket`
+- :confval:`rgw_log_object_name`
+- :confval:`rgw_log_object_name_utc`
+- :confval:`rgw_enable_ops_log`
+- :confval:`rgw_enable_usage_log`
+- :confval:`rgw_usage_log_flush_threshold`
+- :confval:`rgw_usage_log_tick_interval`
diff --git a/doc/rados/troubleshooting/memory-profiling.rst b/doc/rados/troubleshooting/memory-profiling.rst
new file mode 100644
index 000000000..8e58f2d76
--- /dev/null
+++ b/doc/rados/troubleshooting/memory-profiling.rst
@@ -0,0 +1,203 @@
+==================
+ Memory Profiling
+==================
+
+Ceph Monitor, OSD, and MDS can report ``TCMalloc`` heap profiles. Install
+``google-perftools`` if you want to generate these. Your OS distribution might
+package this under a different name (for example, ``gperftools``), and your OS
+distribution might use a different package manager. Run a command similar to
+this one to install ``google-perftools``:
+
+.. prompt:: bash
+
+ sudo apt-get install google-perftools
+
+The profiler dumps output to your log directory (``/var/log/ceph`` by
+default). See `Logging and Debugging`_ for details.
+
+To view the profiler logs with Google's performance tools, run the following
+command:
+
+.. prompt:: bash
+
+ google-pprof --text {path-to-daemon} {log-path/filename}
+
+For example::
+
+ $ ceph tell osd.0 heap start_profiler
+ $ ceph tell osd.0 heap dump
+ osd.0 tcmalloc heap stats:------------------------------------------------
+ MALLOC: 2632288 ( 2.5 MiB) Bytes in use by application
+ MALLOC: + 499712 ( 0.5 MiB) Bytes in page heap freelist
+ MALLOC: + 543800 ( 0.5 MiB) Bytes in central cache freelist
+ MALLOC: + 327680 ( 0.3 MiB) Bytes in transfer cache freelist
+ MALLOC: + 1239400 ( 1.2 MiB) Bytes in thread cache freelists
+ MALLOC: + 1142936 ( 1.1 MiB) Bytes in malloc metadata
+ MALLOC: ------------
+ MALLOC: = 6385816 ( 6.1 MiB) Actual memory used (physical + swap)
+ MALLOC: + 0 ( 0.0 MiB) Bytes released to OS (aka unmapped)
+ MALLOC: ------------
+ MALLOC: = 6385816 ( 6.1 MiB) Virtual address space used
+ MALLOC:
+ MALLOC: 231 Spans in use
+ MALLOC: 56 Thread heaps in use
+ MALLOC: 8192 Tcmalloc page size
+ ------------------------------------------------
+ Call ReleaseFreeMemory() to release freelist memory to the OS (via madvise()).
+ Bytes released to the OS take up virtual address space but no physical memory.
+ $ google-pprof --text \
+ /usr/bin/ceph-osd \
+ /var/log/ceph/ceph-osd.0.profile.0001.heap
+ Total: 3.7 MB
+ 1.9 51.1% 51.1% 1.9 51.1% ceph::log::Log::create_entry
+ 1.8 47.3% 98.4% 1.8 47.3% std::string::_Rep::_S_create
+ 0.0 0.4% 98.9% 0.0 0.6% SimpleMessenger::add_accept_pipe
+ 0.0 0.4% 99.2% 0.0 0.6% decode_message
+ ...
+
+Performing another heap dump on the same daemon creates another file. It is
+convenient to compare the new file to a file created by a previous heap dump to
+show what has grown in the interval. For example::
+
+ $ google-pprof --text --base out/osd.0.profile.0001.heap \
+ ceph-osd out/osd.0.profile.0003.heap
+ Total: 0.2 MB
+ 0.1 50.3% 50.3% 0.1 50.3% ceph::log::Log::create_entry
+ 0.1 46.6% 96.8% 0.1 46.6% std::string::_Rep::_S_create
+ 0.0 0.9% 97.7% 0.0 26.1% ReplicatedPG::do_op
+ 0.0 0.8% 98.5% 0.0 0.8% __gnu_cxx::new_allocator::allocate
+
+See `Google Heap Profiler`_ for additional details.
+
+After you have installed the heap profiler, start your cluster and begin using
+the heap profiler. You can enable or disable the heap profiler at runtime, or
+ensure that it runs continuously. When running commands based on the examples
+that follow, do the following:
+
+#. replace ``{daemon-type}`` with ``mon``, ``osd`` or ``mds``
+#. replace ``{daemon-id}`` with the OSD number or the MON ID or the MDS ID
+
+
+Starting the Profiler
+---------------------
+
+To start the heap profiler, run a command of the following form:
+
+.. prompt:: bash
+
+ ceph tell {daemon-type}.{daemon-id} heap start_profiler
+
+For example:
+
+.. prompt:: bash
+
+ ceph tell osd.1 heap start_profiler
+
+Alternatively, if the ``CEPH_HEAP_PROFILER_INIT=true`` variable is found in the
+environment, the profiler will be started when the daemon starts running.
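+
+As an illustration only (the daemon path, ID, and options are placeholders for
+your deployment), the variable might be set when running a daemon in the
+foreground:
+
+.. prompt:: bash
+
+   CEPH_HEAP_PROFILER_INIT=true /usr/bin/ceph-osd -f --cluster ceph --id 1 --setuser ceph --setgroup ceph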
+
+Printing Stats
+--------------
+
+To print out statistics, run a command of the following form:
+
+.. prompt:: bash
+
+ ceph tell {daemon-type}.{daemon-id} heap stats
+
+For example:
+
+.. prompt:: bash
+
+ ceph tell osd.0 heap stats
+
+.. note:: The reporting of stats with this command does not require the
+ profiler to be running and does not dump the heap allocation information to
+ a file.
+
+
+Dumping Heap Information
+------------------------
+
+To dump heap information, run a command of the following form:
+
+.. prompt:: bash
+
+ ceph tell {daemon-type}.{daemon-id} heap dump
+
+For example:
+
+.. prompt:: bash
+
+ ceph tell mds.a heap dump
+
+.. note:: Dumping heap information works only when the profiler is running.
+
+
+Releasing Memory
+----------------
+
+To release memory that ``tcmalloc`` has allocated but which is not being used
+by the Ceph daemon itself, run a command of the following form:
+
+.. prompt:: bash
+
+   ceph tell {daemon-type}.{daemon-id} heap release
+
+For example:
+
+.. prompt:: bash
+
+ ceph tell osd.2 heap release
+
+
+Stopping the Profiler
+---------------------
+
+To stop the heap profiler, run a command of the following form:
+
+.. prompt:: bash
+
+ ceph tell {daemon-type}.{daemon-id} heap stop_profiler
+
+For example:
+
+.. prompt:: bash
+
+ ceph tell osd.0 heap stop_profiler
+
+.. _Logging and Debugging: ../log-and-debug
+.. _Google Heap Profiler: http://goog-perftools.sourceforge.net/doc/heap_profiler.html
+
+Alternative Methods of Memory Profiling
+----------------------------------------
+
+Running Massif heap profiler with Valgrind
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Massif heap profiler tool can be used with Valgrind to measure how much
+heap memory is used. This method is well-suited to troubleshooting RadosGW.
+
+See the `Massif documentation
+<https://valgrind.org/docs/manual/ms-manual.html>`_ for more information.
+
+Install Valgrind from the package manager for your distribution, then start
+the Ceph daemon that you want to troubleshoot:
+
+.. prompt:: bash
+
+ sudo -u ceph valgrind --max-threads=1024 --tool=massif /usr/bin/radosgw -f --cluster ceph --name NAME --setuser ceph --setgroup ceph
+
+When this command has completed its run, a file with a name of the form
+``massif.out.<pid>`` will be saved in your current working directory. To run
+the command above, the user who runs it must have write permissions in the
+current directory.
+
+Run the ``ms_print`` command to get a graph and statistics from the collected
+data in the ``massif.out.<pid>`` file:
+
+.. prompt:: bash
+
+ ms_print massif.out.12345
+
+The output of this command is helpful when submitting a bug report.
diff --git a/doc/rados/troubleshooting/troubleshooting-mon.rst b/doc/rados/troubleshooting/troubleshooting-mon.rst
new file mode 100644
index 000000000..1170da7c3
--- /dev/null
+++ b/doc/rados/troubleshooting/troubleshooting-mon.rst
@@ -0,0 +1,713 @@
+.. _rados-troubleshooting-mon:
+
+==========================
+ Troubleshooting Monitors
+==========================
+
+.. index:: monitor, high availability
+
+Even if a cluster experiences monitor-related problems, the cluster is not
+necessarily in danger of going down. If a cluster has lost multiple monitors,
+it can still remain up and running as long as there are enough surviving
+monitors to form a quorum.
+
+If your cluster is having monitor-related problems, we recommend that you
+consult the following troubleshooting information.
+
+Initial Troubleshooting
+=======================
+
+The first steps in the process of troubleshooting Ceph Monitors involve making
+sure that the Monitors are running and that they can communicate over the
+network. Follow the steps in this section to rule out the simplest causes of
+Monitor malfunction.
+
+#. **Make sure that the Monitors are running.**
+
+   Make sure that the Monitor (*mon*) daemon processes (``ceph-mon``) are
+   running. It might be the case that the mons have not been restarted after
+   an upgrade. Checking for this simple oversight can save hours of
+   painstaking troubleshooting.
+
+ It is also important to make sure that the manager daemons (``ceph-mgr``)
+ are running. Remember that typical cluster configurations provide one
+ Manager (``ceph-mgr``) for each Monitor (``ceph-mon``).
+
+ .. note:: In releases prior to v1.12.5, Rook will not run more than two
+ managers.
+
+#. **Make sure that you can reach the Monitor nodes.**
+
+   In certain rare cases, ``iptables`` rules might be blocking access to
+   Monitor nodes or TCP ports. These rules might be left over from earlier
+   stress testing or rule development. To check for the presence of such
+   rules, SSH into each Monitor node and use ``telnet`` or ``nc`` or a similar
+   tool to attempt to connect to each of the other Monitor nodes on ports
+   ``tcp/3300`` and ``tcp/6789`` (see the connectivity sketch after this
+   list).
+
+#. **Make sure that the "ceph status" command runs and receives a reply from the cluster.**
+
+ If the ``ceph status`` command receives a reply from the cluster, then the
+ cluster is up and running. Monitors answer to a ``status`` request only if
+ there is a formed quorum. Confirm that one or more ``mgr`` daemons are
+ reported as running. In a cluster with no deficiencies, ``ceph status``
+ will report that all ``mgr`` daemons are running.
+
+ If the ``ceph status`` command does not receive a reply from the cluster,
+ then there are probably not enough Monitors ``up`` to form a quorum. If the
+ ``ceph -s`` command is run with no further options specified, it connects
+ to an arbitrarily selected Monitor. In certain cases, however, it might be
+ helpful to connect to a specific Monitor (or to several specific Monitors
+ in sequence) by adding the ``-m`` flag to the command: for example, ``ceph
+ status -m mymon1``.
+
+#. **None of this worked. What now?**
+
+ If the above solutions have not resolved your problems, you might find it
+ helpful to examine each individual Monitor in turn. Even if no quorum has
+ been formed, it is possible to contact each Monitor individually and
+ request its status by using the ``ceph tell mon.ID mon_status`` command
+ (here ``ID`` is the Monitor's identifier).
+
+ Run the ``ceph tell mon.ID mon_status`` command for each Monitor in the
+ cluster. For more on this command's output, see :ref:`Understanding
+ mon_status
+ <rados_troubleshoting_troubleshooting_mon_understanding_mon_status>`.
+
+ There is also an alternative method for contacting each individual Monitor:
+ SSH into each Monitor node and query the daemon's admin socket. See
+ :ref:`Using the Monitor's Admin
+ Socket<rados_troubleshoting_troubleshooting_mon_using_admin_socket>`.
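+
+For example, the connectivity check mentioned in step 2 might look like this,
+assuming a hypothetical peer Monitor node named ``mon2``:
+
+.. prompt:: bash $
+
+   nc -z -v mon2 3300
+   nc -z -v mon2 6789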
+
+.. _rados_troubleshoting_troubleshooting_mon_using_admin_socket:
+
+Using the monitor's admin socket
+================================
+
+A monitor's admin socket allows you to interact directly with a specific daemon
+by using a Unix socket file. This file is found in the monitor's ``run``
+directory. By default, the admin socket is located at
+``/var/run/ceph/ceph-mon.ID.asok``, but this can be overridden and the admin
+socket might be elsewhere, especially if your cluster's daemons are deployed in
+containers. If you cannot find it, either check your ``ceph.conf`` for an
+alternative path or run the following command:
+
+.. prompt:: bash $
+
+ ceph-conf --name mon.ID --show-config-value admin_socket
+
+The admin socket is available for use only when the monitor daemon is running.
+Whenever the monitor has been properly shut down, the admin socket is removed.
+However, if the monitor is not running and the admin socket persists, it is
+likely that the monitor has been improperly shut down. In any case, if the
+monitor is not running, it will be impossible to use the admin socket, and the
+``ceph`` command is likely to return ``Error 111: Connection Refused``.
+
+To access the admin socket, run a ``ceph tell`` command of the following form
+(specifying the daemon that you are interested in):
+
+.. prompt:: bash $
+
+ ceph tell mon.<id> mon_status
+
+This command passes the ``mon_status`` command to the specified running
+monitor daemon ``<id>`` via its admin socket. If you know the full path to the
+admin socket file, this can be done more directly by running the following
+command:
+
+.. prompt:: bash $
+
+ ceph --admin-daemon <full_path_to_asok_file> <command>
+
+Running ``ceph tell mon.<id> help`` shows all supported commands that are
+available through the admin socket. See especially ``config get``, ``config
+show``, ``mon stat``, and ``quorum_status``.
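+
+For example, a sketch that assumes the default socket path and a monitor
+named ``a``:
+
+.. prompt:: bash $
+
+   ceph --admin-daemon /var/run/ceph/ceph-mon.a.asok quorum_status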
+
+.. _rados_troubleshoting_troubleshooting_mon_understanding_mon_status:
+
+Understanding mon_status
+========================
+
+The status of the monitor (as reported by the ``ceph tell mon.X mon_status``
+command) can always be obtained via the admin socket. This command outputs a
+great deal of information about the monitor (including the information found in
+the output of the ``quorum_status`` command).
+
+To understand this command's output, let us consider the following example, in
+which we see the output of ``ceph tell mon.c mon_status``::
+
+ { "name": "c",
+ "rank": 2,
+ "state": "peon",
+ "election_epoch": 38,
+ "quorum": [
+ 1,
+ 2],
+ "outside_quorum": [],
+ "extra_probe_peers": [],
+ "sync_provider": [],
+ "monmap": { "epoch": 3,
+ "fsid": "5c4e9d53-e2e1-478a-8061-f543f8be4cf8",
+ "modified": "2013-10-30 04:12:01.945629",
+ "created": "2013-10-29 14:14:41.914786",
+ "mons": [
+ { "rank": 0,
+ "name": "a",
+ "addr": "127.0.0.1:6789\/0"},
+ { "rank": 1,
+ "name": "b",
+ "addr": "127.0.0.1:6790\/0"},
+ { "rank": 2,
+ "name": "c",
+ "addr": "127.0.0.1:6795\/0"}]}}
+
+It is clear that there are three monitors in the monmap (*a*, *b*, and *c*),
+the quorum is formed by only two monitors, and *c* is in the quorum as a
+*peon*.
+
+**Which monitor is out of the quorum?**
+
+ The answer is **a** (that is, ``mon.a``).
+
+**Why?**
+
+ When the ``quorum`` set is examined, there are clearly two monitors in the
+ set: *1* and *2*. But these are not monitor names. They are monitor ranks, as
+ established in the current ``monmap``. The ``quorum`` set does not include
+ the monitor that has rank 0, and according to the ``monmap`` that monitor is
+ ``mon.a``.
+
+**How are monitor ranks determined?**
+
+  Monitor ranks are calculated (or recalculated) whenever monitors are added
+  or removed. The calculation of ranks follows a simple rule: the **greater**
+  the ``IP:PORT`` combination, the **lower** the rank (that is, rank ``0`` is
+  the highest). In this case, because ``127.0.0.1:6789`` is lower than the
+  other two ``IP:PORT`` combinations, ``mon.a`` has the highest rank: namely,
+  rank 0.
+
+
+Most Common Monitor Issues
+===========================
+
+The Cluster Has Quorum but at Least One Monitor is Down
+-------------------------------------------------------
+
+When the cluster has quorum but at least one monitor is down, ``ceph health
+detail`` returns a message similar to the following::
+
+ $ ceph health detail
+ [snip]
+ mon.a (rank 0) addr 127.0.0.1:6789/0 is down (out of quorum)
+
+**How do I troubleshoot a Ceph cluster that has quorum but also has at least one monitor down?**
+
+ #. Make sure that ``mon.a`` is running.
+
+ #. Make sure that you can connect to ``mon.a``'s node from the
+ other Monitor nodes. Check the TCP ports as well. Check ``iptables`` and
+ ``nf_conntrack`` on all nodes and make sure that you are not
+ dropping/rejecting connections.
+
+ If this initial troubleshooting doesn't solve your problem, then further
+ investigation is necessary.
+
+ First, check the problematic monitor's ``mon_status`` via the admin
+ socket as explained in `Using the monitor's admin socket`_ and
+ `Understanding mon_status`_.
+
+ If the Monitor is out of the quorum, then its state will be one of the
+ following: ``probing``, ``electing`` or ``synchronizing``. If the state of
+ the Monitor is ``leader`` or ``peon``, then the Monitor believes itself to be
+ in quorum but the rest of the cluster believes that it is not in quorum. It
+ is possible that a Monitor that is in one of the ``probing``, ``electing``,
+ or ``synchronizing`` states has entered the quorum during the process of
+ troubleshooting. Check ``ceph status`` again to determine whether the Monitor
+ has entered quorum during your troubleshooting. If the Monitor remains out of
+ the quorum, then proceed with the investigations described in this section of
+ the documentation.
+
+
+**What does it mean when a Monitor's state is ``probing``?**
+
+ If ``ceph health detail`` shows that a Monitor's state is
+ ``probing``, then the Monitor is still looking for the other Monitors. Every
+ Monitor remains in this state for some time when it is started. When a
+ Monitor has connected to the other Monitors specified in the ``monmap``, it
+ ceases to be in the ``probing`` state. The amount of time that a Monitor is
+ in the ``probing`` state depends upon the parameters of the cluster of which
+ it is a part. For example, when a Monitor is a part of a single-monitor
+ cluster (never do this in production), the monitor passes through the probing
+ state almost instantaneously. In a multi-monitor cluster, the Monitors stay
+ in the ``probing`` state until they find enough monitors to form a quorum
+ |---| this means that if two out of three Monitors in the cluster are
+ ``down``, the one remaining Monitor stays in the ``probing`` state
+ indefinitely until you bring one of the other monitors up.
+
+ If quorum has been established, then the Monitor daemon should be able to
+ find the other Monitors quickly, as long as they can be reached. If a Monitor
+ is stuck in the ``probing`` state and you have exhausted the procedures above
+ that describe the troubleshooting of communications between the Monitors,
+ then it is possible that the problem Monitor is trying to reach the other
+ Monitors at a wrong address. ``mon_status`` outputs the ``monmap`` that is
+ known to the monitor: determine whether the other Monitors' locations as
+ specified in the ``monmap`` match the locations of the Monitors in the
+ network. If they do not, see `Recovering a Monitor's Broken monmap`_.
+ If the locations of the Monitors as specified in the ``monmap`` match the
+ locations of the Monitors in the network, then the persistent
+ ``probing`` state could be related to severe clock skews amongst the monitor
+ nodes. See `Clock Skews`_. If the information in `Clock Skews`_ does not
+ bring the Monitor out of the ``probing`` state, then prepare your system logs
+ and ask the Ceph community for help. See `Preparing your logs`_ for
+ information about the proper preparation of logs.
+
+
+**What does it mean when a Monitor's state is ``electing``?**
+
+ If ``ceph health detail`` shows that a Monitor's state is ``electing``, the
+ monitor is in the middle of an election. Elections typically complete
+ quickly, but sometimes the monitors can get stuck in what is known as an
+ *election storm*. See :ref:`Monitor Elections <dev_mon_elections>` for more
+ on monitor elections.
+
+  The presence of an election storm might indicate clock skew among the
+  monitor nodes. See `Clock Skews`_ for more information.
+
+ If your clocks are properly synchronized, search the mailing lists and bug
+ tracker for issues similar to your issue. The ``electing`` state is not
+ likely to persist. In versions of Ceph after the release of Cuttlefish, there
+ is no obvious reason other than clock skew that explains why an ``electing``
+ state would persist.
+
+ It is possible to investigate the cause of a persistent ``electing`` state if
+ you put the problematic Monitor into a ``down`` state while you investigate.
+ This is possible only if there are enough surviving Monitors to form quorum.
+
+**What does it mean when a Monitor's state is ``synchronizing``?**
+
+ If ``ceph health detail`` shows that the Monitor is ``synchronizing``, the
+ monitor is catching up with the rest of the cluster so that it can join the
+ quorum. The amount of time that it takes for the Monitor to synchronize with
+ the rest of the quorum is a function of the size of the cluster's monitor
+ store, the cluster's size, and the state of the cluster. Larger and degraded
+ clusters generally keep Monitors in the ``synchronizing`` state longer than
+ do smaller, new clusters.
+
+ A Monitor that changes its state from ``synchronizing`` to ``electing`` and
+ then back to ``synchronizing`` indicates a problem: the cluster state may be
+ advancing (that is, generating new maps) too fast for the synchronization
+ process to keep up with the pace of the creation of the new maps. This issue
+ presented more frequently prior to the Cuttlefish release than it does in
+ more recent releases, because the synchronization process has since been
+ refactored and enhanced to avoid this dynamic. If you experience this in
+ later versions, report the issue in the `Ceph bug tracker
+ <https://tracker.ceph.com>`_. Prepare and provide logs to substantiate any
+ bug you raise. See `Preparing your logs`_ for information about the proper
+ preparation of logs.
+
+**What does it mean when a Monitor's state is ``leader`` or ``peon``?**
+
+ If ``ceph health detail`` shows that the Monitor is in the ``leader`` state
+ or in the ``peon`` state, it is likely that clock skew is present. Follow the
+ instructions in `Clock Skews`_. If you have followed those instructions and
+ ``ceph health detail`` still shows that the Monitor is in the ``leader``
+ state or the ``peon`` state, report the issue in the `Ceph bug tracker
+ <https://tracker.ceph.com>`_. If you raise an issue, provide logs to
+ substantiate it. See `Preparing your logs`_ for information about the
+ proper preparation of logs.
+
+
+Recovering a Monitor's Broken ``monmap``
+----------------------------------------
+
+This is how a ``monmap`` usually looks, depending on the number of
+monitors::
+
+
+ epoch 3
+ fsid 5c4e9d53-e2e1-478a-8061-f543f8be4cf8
+ last_changed 2013-10-30 04:12:01.945629
+ created 2013-10-29 14:14:41.914786
+ 0: 127.0.0.1:6789/0 mon.a
+ 1: 127.0.0.1:6790/0 mon.b
+ 2: 127.0.0.1:6795/0 mon.c
+
+Your ``monmap``, however, might not look like this. For instance, in some
+early versions of Cuttlefish, a bug could cause the ``monmap`` to be
+nullified: filled completely with zeros, such that not even ``monmaptool``
+could make sense of it. It is also possible to end up with a monitor whose
+``monmap`` is severely outdated, notably if the node has been down for months.
+In that case, the ``ceph-mon`` daemon might be unable to find the surviving
+monitors (for example, say ``mon.c`` is down; you add a new monitor ``mon.d``,
+then remove ``mon.a``, then add a new monitor ``mon.e`` and remove ``mon.b``;
+``mon.c`` will end up with a monmap entirely different from the one it knows).
+
+In this situation you have two possible solutions:
+
+Scrap the monitor and redeploy
+
+  Take this route only if you are positive that you won't lose the
+  information kept by that monitor: that is, you must have other healthy
+  monitors running, so that the new monitor will be able to synchronize with
+  them. Keep in mind that destroying a monitor can lead to data loss if there
+  are no other copies of its contents.
+
+Inject a monmap into the monitor
+
+ These are the basic steps:
+
+ Retrieve the ``monmap`` from the surviving monitors and inject it into the
+ monitor whose ``monmap`` is corrupted or lost.
+
+ Implement this solution by carrying out the following procedure:
+
+ 1. Is there a quorum of monitors? If so, retrieve the ``monmap`` from the
+ quorum::
+
+ $ ceph mon getmap -o /tmp/monmap
+
+ 2. If there is no quorum, then retrieve the ``monmap`` directly from another
+ monitor that has been stopped (in this example, the other monitor has
+ the ID ``ID-FOO``)::
+
+ $ ceph-mon -i ID-FOO --extract-monmap /tmp/monmap
+
+ 3. Stop the monitor you are going to inject the monmap into.
+
+ 4. Inject the monmap::
+
+ $ ceph-mon -i ID --inject-monmap /tmp/monmap
+
+ 5. Start the monitor
+
+ .. warning:: Injecting ``monmaps`` can cause serious problems because doing
+ so will overwrite the latest existing ``monmap`` stored on the monitor. Be
+ careful!
+
+Clock Skews
+-----------
+
+The Paxos consensus algorithm requires close time synchronization, which means
+that clock skew among the monitors in the quorum can have a serious effect on
+monitor operation. The resulting behavior can be puzzling. To avoid this issue,
+run a clock synchronization tool on your monitor nodes: for example, use
+``Chrony`` or the legacy ``ntpd`` utility. Configure each monitor node so that
+the ``iburst`` option is in effect and so that each monitor has multiple peers,
+including the following:
+
+* Each other
+* Internal ``NTP`` servers
+* Multiple external, public pool servers
+
+.. note:: The ``iburst`` option sends a burst of eight packets instead of the
+ usual single packet, and is used during the process of getting two peers
+ into initial synchronization.
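+
+A minimal sketch of the corresponding ``chrony.conf`` entries (the hostnames
+here are placeholders for your own servers)::
+
+    server ntp1.internal.example.com iburst
+    server ntp2.internal.example.com iburst
+    pool pool.ntp.org iburst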
+
+Furthermore, it is advisable to synchronize *all* nodes in your cluster against
+internal and external servers, and perhaps even against your monitors. Run
+``NTP`` servers on bare metal: VM-virtualized clocks are not suitable for
+steady timekeeping. See `https://www.ntp.org <https://www.ntp.org>`_ for more
+information about the Network Time Protocol (NTP). Your organization might
+already have quality internal ``NTP`` servers available. Sources for ``NTP``
+server appliances include the following:
+
+* Microsemi (formerly Symmetricom) `https://microsemi.com <https://www.microsemi.com/product-directory/3425-timing-synchronization>`_
+* EndRun `https://endruntechnologies.com <https://endruntechnologies.com/products/ntp-time-servers>`_
+* Netburner `https://www.netburner.com <https://www.netburner.com/products/network-time-server/pk70-ex-ntp-network-time-server>`_
+
+Clock Skew Questions and Answers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+**What's the maximum tolerated clock skew?**
+
+ By default, monitors allow clocks to drift up to a maximum of 0.05 seconds
+ (50 milliseconds).
+
+**Can I increase the maximum tolerated clock skew?**
+
+ Yes, but we strongly recommend against doing so. The maximum tolerated clock
+ skew is configurable via the ``mon-clock-drift-allowed`` option, but it is
+ almost certainly a bad idea to make changes to this option. The clock skew
+ maximum is in place because clock-skewed monitors cannot be relied upon. The
+ current default value has proven its worth at alerting the user before the
+ monitors encounter serious problems. Changing this value might cause
+ unforeseen effects on the stability of the monitors and overall cluster
+ health.
+
+**How do I know whether there is a clock skew?**
+
+ The monitors will warn you via the cluster status ``HEALTH_WARN``. When clock
+ skew is present, the ``ceph health detail`` and ``ceph status`` commands
+ return an output resembling the following::
+
+ mon.c addr 10.10.0.1:6789/0 clock skew 0.08235s > max 0.05s (latency 0.0045s)
+
+ In this example, the monitor ``mon.c`` has been flagged as suffering from
+ clock skew.
+
+ In Luminous and later releases, it is possible to check for a clock skew by
+ running the ``ceph time-sync-status`` command. Note that the lead monitor
+ typically has the numerically lowest IP address. It will always show ``0``:
+ the reported offsets of other monitors are relative to the lead monitor, not
+ to any external reference source.
+
+**What should I do if there is a clock skew?**
+
+ Synchronize your clocks. Using an NTP client might help. However, if you
+ are already using an NTP client and you still encounter clock skew problems,
+ determine whether the NTP server that you are using is remote to your network
+ or instead hosted on your network. Hosting your own NTP servers tends to
+ mitigate clock skew problems.
+
+
+Client Can't Connect or Mount
+-----------------------------
+
+Check your ``iptables`` configuration. Some operating-system install utilities
+add a ``REJECT`` rule to ``iptables``. Such rules reject all clients other than
+``ssh`` that try to connect to the host. If your monitor host's ``iptables``
+have a ``REJECT`` rule in place, clients that are connecting from a separate
+node will fail, and will raise a timeout error. Any ``iptables`` rules that
+reject clients trying to connect to Ceph daemons must be addressed. For
+example::
+
+ REJECT all -- anywhere anywhere reject-with icmp-host-prohibited
+
+It might also be necessary to add rules to iptables on your Ceph hosts to
+ensure that clients are able to access the TCP ports associated with your Ceph
+monitors (default: port 6789) and Ceph OSDs (default: 6800 through 7300). For
+example::
+
+ iptables -A INPUT -m multiport -p tcp -s {ip-address}/{netmask} --dports 6789,6800:7300 -j ACCEPT
+
+
+Monitor Store Failures
+======================
+
+Symptoms of store corruption
+----------------------------
+
+Ceph monitors store the :term:`Cluster Map` in a key-value store. If key-value
+store corruption causes a monitor to fail, then the monitor log might contain
+one of the following error messages::
+
+ Corruption: error in middle of record
+
+or::
+
+ Corruption: 1 missing files; e.g.: /var/lib/ceph/mon/mon.foo/store.db/1234567.ldb
+
+Recovery using healthy monitor(s)
+---------------------------------
+
+If there are surviving monitors, we can always :ref:`replace
+<adding-and-removing-monitors>` the corrupted monitor with a new one. After the
+new monitor boots, it will synchronize with a healthy peer. After the new
+monitor is fully synchronized, it will be able to serve clients.
+
+.. _mon-store-recovery-using-osds:
+
+Recovery using OSDs
+-------------------
+
+Even if all monitors fail at the same time, it is possible to recover the
+monitor store by using information stored in OSDs. You are encouraged to deploy
+at least three (and preferably five) monitors in a Ceph cluster. In such a
+deployment, complete monitor failure is unlikely. However, unplanned power loss
+in a data center whose disk settings or filesystem settings are improperly
+configured could cause the underlying filesystem to fail and this could kill
+all of the monitors. In such a case, data in the OSDs can be used to recover
+the monitors. The following is such a script and can be used to recover the
+monitors:
+
+
+.. code-block:: bash
+
+ ms=/root/mon-store
+ mkdir $ms
+
+   # Collect the cluster map from stopped OSDs. Note that "hosts" must be
+   # set beforehand to the space-separated list of OSD hosts.
+ for host in $hosts; do
+ rsync -avz $ms/. user@$host:$ms.remote
+ rm -rf $ms
+ ssh user@$host <<EOF
+ for osd in /var/lib/ceph/osd/ceph-*; do
+ ceph-objectstore-tool --data-path \$osd --no-mon-config --op update-mon-db --mon-store-path $ms.remote
+ done
+ EOF
+ rsync -avz user@$host:$ms.remote/. $ms
+ done
+
+   # Rebuild the monitor store from the collected map. If the cluster does
+   # not use cephx authentication, you can skip the following steps that
+   # update the keyring with the caps, and there is no need to pass the
+   # "--keyring" option: just use "ceph-monstore-tool $ms rebuild" instead.
+ ceph-authtool /path/to/admin.keyring -n mon. \
+ --cap mon 'allow *'
+ ceph-authtool /path/to/admin.keyring -n client.admin \
+ --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow *'
+ # add the keys of one or more ceph-mgr daemons to the keyring. in this case,
+ # an encoded key for mgr.x is added; you can find the encoded key in
+ # /etc/ceph/${cluster}.${mgr_name}.keyring on the machine where ceph-mgr is
+ # deployed
+ ceph-authtool /path/to/admin.keyring --add-key 'AQDN8kBe9PLWARAAZwxXMr+n85SBYbSlLcZnMA==' -n mgr.x \
+ --cap mon 'allow profile mgr' --cap osd 'allow *' --cap mds 'allow *'
+ # If your monitors' ids are not sorted by IP address, specify them in order.
+ # For example, if mon 'a' is 10.0.0.3, mon 'b' is 10.0.0.2, and mon 'c' is
+ # 10.0.0.4, pass "--mon-ids b a c".
+ # In addition, if your monitors' ids are not single characters like 'a', 'b',
+ # and 'c', specify them on the command line as arguments of the "--mon-ids"
+ # option. If you are not sure, check your ceph.conf for sections named like
+ # '[mon.foo]'. Do not pass the "--mon-ids" option if you are using DNS SRV
+ # to look up monitors.
+ ceph-monstore-tool $ms rebuild -- --keyring /path/to/admin.keyring --mon-ids alpha beta gamma
+
+ # make a backup of the corrupted store.db just in case! repeat for
+ # all monitors.
+ mv /var/lib/ceph/mon/mon.foo/store.db /var/lib/ceph/mon/mon.foo/store.db.corrupted
+
+ # move the rebuilt store.db into place. repeat for all monitors.
+ mv $ms/store.db /var/lib/ceph/mon/mon.foo/store.db
+ chown -R ceph:ceph /var/lib/ceph/mon/mon.foo/store.db
+
+This script performs the following steps:
+
+#. Collects the map from each OSD host.
+#. Rebuilds the store.
+#. Fills the entities in the keyring file with appropriate capabilities.
+#. Replaces the corrupted store on ``mon.foo`` with the recovered copy.
+
+
+Known limitations
+~~~~~~~~~~~~~~~~~
+
+The above recovery tool is unable to recover the following information:
+
+- **Certain added keyrings**: All of the OSD keyrings added using the ``ceph
+ auth add`` command are recovered from the OSD's copy, and the
+ ``client.admin`` keyring is imported using ``ceph-monstore-tool``. However,
+ the MDS keyrings and all other keyrings will be missing in the recovered
+ monitor store. You might need to manually re-add them.
+
+- **Creating pools**: If any RADOS pools were in the process of being created,
+ that state is lost. The recovery tool operates on the assumption that all
+ pools have already been created. If there are PGs that are stuck in the
+ 'unknown' state after the recovery for a partially created pool, you can
+ force creation of the *empty* PG by running the ``ceph osd force-create-pg``
+ command. Note that this will create an *empty* PG, so take this action only
+ if you know the pool is empty.
+
+- **MDS Maps**: The MDS maps are lost.
+
+
+Everything Failed! Now What?
+============================
+
+Reaching out for help
+---------------------
+
+You can find help on IRC in #ceph and #ceph-devel on OFTC (server
+irc.oftc.net), or at ``dev@ceph.io`` and ``ceph-users@lists.ceph.com``. Make
+sure that you have prepared your logs and that you have them ready upon
+request.
+
+See https://ceph.io/en/community/connect/ for current (as of October 2023)
+information on getting in contact with the upstream Ceph community.
+
+
+Preparing your logs
+-------------------
+
+The default location for monitor logs is ``/var/log/ceph/ceph-mon.FOO.log*``.
+However, if they are not there, you can find their current location by running
+the following command:
+
+.. prompt:: bash
+
+ ceph-conf --name mon.FOO --show-config-value log_file
+
+The amount of information in the logs is determined by the debug levels in the
+cluster's configuration files. If Ceph is using the default debug levels, then
+your logs might be missing important information that would help the upstream
+Ceph community address your issue.
+
+To make sure your monitor logs contain relevant information, you can raise
+debug levels. Here we are interested in information from the monitors. As with
+other components, the monitors have different parts that output their debug
+information on different subsystems.
+
+If you are an experienced Ceph troubleshooter, we recommend raising the debug
+levels of the most relevant subsystems. Of course, this approach might not be
+easy for beginners. In most cases, however, the following debug levels will
+yield enough information to address the issue::
+
+ debug_mon = 10
+ debug_ms = 1
+
+Sometimes these debug levels do not yield enough information. In such cases,
+members of the upstream Ceph community might ask you to make additional changes
+to these or to other debug levels. In any case, it is better for us to receive
+at least some useful information than to receive an empty log.
+
+
+Do I need to restart a monitor to adjust debug levels?
+------------------------------------------------------
+
+No, restarting a monitor is not necessary. Debug levels may be adjusted by
+using two different methods, depending on whether or not there is a quorum:
+
+There is a quorum
+
+ Either inject the debug option into the specific monitor that needs to
+ be debugged::
+
+ ceph tell mon.FOO config set debug_mon 10/10
+
+ Or inject it into all monitors at once::
+
+ ceph tell mon.* config set debug_mon 10/10
+
+
+There is no quorum
+
+ Use the admin socket of the specific monitor that needs to be debugged
+ and directly adjust the monitor's configuration options::
+
+ ceph daemon mon.FOO config set debug_mon 10/10
+
+
+To return the debug levels to their default values, run the above commands
+using the debug level ``1/10`` rather than ``10/10``. To check a monitor's
+current values, use the admin socket and run either of the following commands:
+
+ .. prompt:: bash
+
+ ceph daemon mon.FOO config show
+
+or:
+
+ .. prompt:: bash
+
+ ceph daemon mon.FOO config get 'OPTION_NAME'
+
+
+
+I reproduced the problem with appropriate debug levels. Now what?
+-----------------------------------------------------------------
+
+We prefer that you send us only the portions of your logs that are relevant to
+your monitor problems. Of course, it might not be easy for you to determine
+which portions are relevant so we are willing to accept complete and
+unabridged logs. However, we request that you avoid sending logs containing
+hundreds of thousands of lines with no additional clarifying information. One
+common-sense way of making our task easier is to write down the current time
+and date when you are reproducing the problem and then extract portions of your
+logs based on that information.
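+
+For example, if you noted that the problem occurred at around 15:40 on
+2023-10-01, you might extract that window of your monitor log with a command
+like the following (a sketch only: the timestamp format and log path depend
+on your configuration and Ceph release):
+
+.. prompt:: bash
+
+   grep '2023-10-01T15:4' /var/log/ceph/ceph-mon.FOO.log > mon-extract.log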
+
+Finally, reach out to us on the mailing lists or IRC or Slack, or by filing a
+new issue on the `tracker`_.
+
+.. _tracker: http://tracker.ceph.com/projects/ceph/issues/new
+
+.. |---| unicode:: U+2014 .. EM DASH
+ :trim:
diff --git a/doc/rados/troubleshooting/troubleshooting-osd.rst b/doc/rados/troubleshooting/troubleshooting-osd.rst
new file mode 100644
index 000000000..035947d7e
--- /dev/null
+++ b/doc/rados/troubleshooting/troubleshooting-osd.rst
@@ -0,0 +1,787 @@
+======================
+ Troubleshooting OSDs
+======================
+
+Before troubleshooting the cluster's OSDs, check the monitors
+and the network.
+
+First, determine whether the monitors have a quorum. Run the ``ceph health``
+command or the ``ceph -s`` command: if Ceph shows ``HEALTH_OK``, then there
+is a monitor quorum.
+
+If the monitors don't have a quorum or if there are errors with the monitor
+status, address the monitor issues before proceeding by consulting the material
+in `Troubleshooting Monitors <../troubleshooting-mon>`_.
+
+Next, check your networks to make sure that they are running properly. Networks
+can have a significant impact on OSD operation and performance. Look for
+dropped packets on the host side and CRC errors on the switch side.
+
+
+Obtaining Data About OSDs
+=========================
+
+When troubleshooting OSDs, it is useful to collect different kinds of
+information about the OSDs. Some information comes from the practice of
+`monitoring OSDs`_ (for example, by running the ``ceph osd tree`` command).
+Additional information concerns the topology of your cluster, and is discussed
+in the following sections.
+
+
+Ceph Logs
+---------
+
+Ceph log files are stored under ``/var/log/ceph``. Unless the path has been
+changed (or you are in a containerized environment that stores logs in a
+different location), the log files can be listed by running the following
+command:
+
+.. prompt:: bash
+
+ ls /var/log/ceph
+
+If there is not enough log detail, change the logging level. To ensure that
+Ceph performs adequately under high logging volume, see `Logging and
+Debugging`_.
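+
+For example, you might raise the logging level of a specific OSD at runtime
+with a command like the following (``osd.0`` and the level ``5/5`` are
+illustrative):
+
+.. prompt:: bash
+
+   ceph tell osd.0 config set debug_osd 5/5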
+
+
+
+Admin Socket
+------------
+
+Use the admin socket tool to retrieve runtime information. First, list the
+sockets of Ceph's daemons by running the following command:
+
+.. prompt:: bash
+
+ ls /var/run/ceph
+
+Next, run a command of the following form (replacing ``{daemon-name}`` with the
+name of a specific daemon: for example, ``osd.0``):
+
+.. prompt:: bash
+
+ ceph daemon {daemon-name} help
+
+Alternatively, run the command with a ``{socket-file}`` specified (a "socket
+file" is a specific file in ``/var/run/ceph``):
+
+.. prompt:: bash
+
+ ceph daemon {socket-file} help
+
+The admin socket makes many tasks possible, including:
+
+- Listing Ceph configuration at runtime
+- Dumping historic operations
+- Dumping the operation priority queue state
+- Dumping operations in flight
+- Dumping perfcounters
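+
+For example, the following commands (shown for a hypothetical ``osd.0``)
+display the runtime configuration, the operations currently in flight, and
+the performance counters:
+
+.. prompt:: bash
+
+   ceph daemon osd.0 config show
+   ceph daemon osd.0 dump_ops_in_flight
+   ceph daemon osd.0 perf dump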
+
+Display Free Space
+------------------
+
+Filesystem issues may arise. To display your filesystems' free space, run the
+following command:
+
+.. prompt:: bash
+
+ df -h
+
+To see this command's supported syntax and options, run ``df --help``.
+
+I/O Statistics
+--------------
+
+The `iostat`_ tool can be used to identify I/O-related issues. Run the
+following command:
+
+.. prompt:: bash
+
+ iostat -x
+
+
+Diagnostic Messages
+-------------------
+
+To retrieve diagnostic messages from the kernel, run the ``dmesg`` command and
+filter its output with ``less``, ``more``, ``grep``, or ``tail``. For
+example:
+
+.. prompt:: bash
+
+ dmesg | grep scsi
+
+Stopping without Rebalancing
+============================
+
+It might occasionally be necessary to perform maintenance on a subset of your
+cluster or to resolve a problem that affects a failure domain (for example, a
+rack). However, when you stop OSDs for maintenance, you might want to prevent
+CRUSH from automatically rebalancing the cluster. To avert this rebalancing
+behavior, set the cluster to ``noout`` by running the following command:
+
+.. prompt:: bash
+
+ ceph osd set noout
+
+.. warning:: This is more a thought exercise offered for the purpose of giving
+ the reader a sense of failure domains and CRUSH behavior than a suggestion
+ that anyone in the post-Luminous world run ``ceph osd set noout``. When the
+ OSDs return to an ``up`` state, rebalancing will resume and the change
+ introduced by the ``ceph osd set noout`` command will be reverted.
+
+In Luminous and later releases, however, it is a safer approach to flag only
+affected OSDs. To add or remove a ``noout`` flag to a specific OSD, run a
+command like the following:
+
+.. prompt:: bash
+
+ ceph osd add-noout osd.0
+ ceph osd rm-noout osd.0
+
+It is also possible to flag an entire CRUSH bucket. For example, if you plan to
+take down ``prod-ceph-data1701`` in order to add RAM, you might run the
+following command:
+
+.. prompt:: bash
+
+ ceph osd set-group noout prod-ceph-data1701
+
+After the flag is set, stop the OSDs and any other colocated
+Ceph services within the failure domain that requires maintenance work::
+
+ systemctl stop ceph\*.service ceph\*.target
+
+.. note:: When an OSD is stopped, any placement groups within the OSD are
+ marked as ``degraded``.
+
+After the maintenance is complete, it will be necessary to restart the OSDs
+and any other daemons that have stopped. However, if the host was rebooted as
+part of the maintenance, the daemons do not need to be restarted manually and
+will come back up automatically. To restart OSDs or other daemons, use a
+command of the following
+form:
+
+.. prompt:: bash
+
+ sudo systemctl start ceph.target
+
+Finally, unset the ``noout`` flag as needed by running commands like the
+following:
+
+.. prompt:: bash
+
+ ceph osd unset noout
+ ceph osd unset-group noout prod-ceph-data1701
+
+Many contemporary Linux distributions employ ``systemd`` for service
+management. However, for certain operating systems (especially older ones) it
+might be necessary to issue equivalent ``service`` or ``start``/``stop``
+commands.
+
+
+.. _osd-not-running:
+
+OSD Not Running
+===============
+
+Under normal conditions, restarting a ``ceph-osd`` daemon will allow it to
+rejoin the cluster and recover.
+
+
+An OSD Won't Start
+------------------
+
+If the cluster has started but an OSD isn't starting, check the following:
+
+- **Configuration File:** If you were not able to get OSDs running from a new
+ installation, check your configuration file to ensure it conforms to the
+ standard (for example, make sure that it says ``host`` and not ``hostname``,
+ etc.).
+
+- **Check Paths:** Ensure that the paths specified in the configuration
+ correspond to the paths for data and metadata that actually exist (for
+ example, the paths to the journals, the WAL, and the DB). Separate the OSD
+ data from the metadata in order to see whether there are errors in the
+ configuration file and in the actual mounts. If so, these errors might
+ explain why OSDs are not starting. To store the metadata on a separate block
+ device, partition or LVM the drive and assign one partition per OSD.
+
+- **Check Max Threadcount:** If the cluster has a node with an especially high
+ number of OSDs, it might be hitting the default maximum number of threads
+ (usually 32,000). This is especially likely to happen during recovery.
+ Increasing the maximum number of threads to the maximum possible number of
+ threads allowed (4194303) might help with the problem. To increase the number
+ of threads to the maximum, run the following command:
+
+ .. prompt:: bash
+
+ sysctl -w kernel.pid_max=4194303
+
+ If this increase resolves the issue, you must make the increase permanent by
+ including a ``kernel.pid_max`` setting either in a file under
+ ``/etc/sysctl.d`` or within the master ``/etc/sysctl.conf`` file. For
+ example::
+
+ kernel.pid_max = 4194303
+
+- **Check ``nf_conntrack``:** This connection-tracking and connection-limiting
+ system causes problems for many production Ceph clusters. The problems often
+ emerge slowly and subtly. As cluster topology and client workload grow,
+ mysterious and intermittent connection failures and performance glitches
+ occur more and more, especially at certain times of the day. To begin taking
+ the measure of your problem, check the ``syslog`` history for "table full"
+ events. One way to address this kind of problem is as follows: First, use the
+ ``sysctl`` utility to assign ``nf_conntrack_max`` a much higher value. Next,
+ raise the value of ``nf_conntrack_buckets`` so that ``nf_conntrack_buckets``
+ × 8 = ``nf_conntrack_max``; this action might require running commands
+ outside of ``sysctl`` (for example, ``echo 131072 >
+ /sys/module/nf_conntrack/parameters/hashsize``); see the sketch after this
+ list. Another way to address the
+ problem is to blacklist the associated kernel modules in order to disable
+ processing altogether. This approach is powerful, but fragile. The modules
+ and the order in which the modules must be listed can vary among kernel
+ versions. Even when blacklisted, ``iptables`` and ``docker`` might sometimes
+ activate connection tracking anyway, so we advise a "set and forget" strategy
+ for the tunables. On modern systems, this approach will not consume
+ appreciable resources.
+
+- **Kernel Version:** Identify the kernel version and distribution that are in
+ use. By default, Ceph uses third-party tools that might be buggy or come into
+ conflict with certain distributions or kernel versions (for example, Google's
+ ``gperftools`` and ``TCMalloc``). Check the `OS recommendations`_ and the
+ release notes for each Ceph version in order to make sure that you have
+ addressed any issues related to your kernel.
+
+- **Segmentation Fault:** If there is a segmentation fault, increase log
+ levels and restart the problematic daemon(s). If segmentation faults recur,
+ search the Ceph bug tracker `https://tracker.ceph.com/projects/ceph
+ <https://tracker.ceph.com/projects/ceph/>`_ and the ``dev`` and
+ ``ceph-users`` mailing list archives `https://ceph.io/resources
+ <https://ceph.io/resources>`_ to see if others have experienced and reported
+ these issues. If this truly is a new and unique failure, post to the ``dev``
+ email list and provide the following information: the specific Ceph release
+ being run, ``ceph.conf`` (with secrets XXX'd out), your monitor status
+ output, and excerpts from your log file(s).
+
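+The following is a minimal sketch of the ``nf_conntrack`` tuning described
+above. The values are illustrative and must be sized for your environment,
+and the exact sysctl names can vary across kernel versions:
+
+.. prompt:: bash
+
+   # raise the connection-tracking table limit (illustrative value)
+   sysctl -w net.netfilter.nf_conntrack_max=1048576
+   # raise the bucket count so that buckets x 8 = nf_conntrack_max
+   echo 131072 > /sys/module/nf_conntrack/parameters/hashsize
+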
+
+An OSD Failed
+-------------
+
+When an OSD fails, this means that a ``ceph-osd`` process is unresponsive or
+has died and that the corresponding OSD has been marked ``down``. Surviving
+``ceph-osd`` daemons will report to the monitors that the OSD appears to be
+down, and a new status will be visible in the output of the ``ceph health``
+command, as in the following example:
+
+.. prompt:: bash
+
+ ceph health
+
+::
+
+ HEALTH_WARN 1/3 in osds are down
+
+This health alert is raised whenever there are one or more OSDs marked ``in``
+and ``down``. To see which OSDs are ``down``, add ``detail`` to the command as in
+the following example:
+
+.. prompt:: bash
+
+ ceph health detail
+
+::
+
+ HEALTH_WARN 1/3 in osds are down
+ osd.0 is down since epoch 23, last address 192.168.106.220:6800/11080
+
+Alternatively, run the following command:
+
+.. prompt:: bash
+
+ ceph osd tree down
+
+If there is a drive failure or another fault that is preventing a given
+``ceph-osd`` daemon from functioning or restarting, then there should be an
+error message present in its log file under ``/var/log/ceph``.
+
+If the ``ceph-osd`` daemon stopped because of a heartbeat failure or a
+``suicide timeout`` error, then the underlying drive or filesystem might be
+unresponsive. Check ``dmesg`` output and `syslog` output for drive errors or
+kernel errors. It might be necessary to specify certain flags (for example,
+``dmesg -T`` to see human-readable timestamps) in order to avoid mistaking old
+errors for new errors.
+
+If an entire host's OSDs are ``down``, check to see if there is a network
+error or a hardware issue with the host.
+
+If the OSD problem is the result of a software error (for example, a failed
+assertion or another unexpected error), search for reports of the issue in the
+`bug tracker <https://tracker.ceph.com/projects/ceph>`_, the `dev mailing list
+archives <https://lists.ceph.io/hyperkitty/list/dev@ceph.io/>`_, and the
+`ceph-users mailing list archives
+<https://lists.ceph.io/hyperkitty/list/ceph-users@ceph.io/>`_. If there is no
+clear fix or existing bug, then :ref:`report the problem to the ceph-devel
+email list <Get Involved>`.
+
+
+.. _no-free-drive-space:
+
+No Free Drive Space
+-------------------
+
+If an OSD is full, Ceph prevents data loss by ensuring that no new data is
+written to the OSD. In a properly running cluster, health checks are raised
+when the cluster's OSDs and pools approach certain "fullness" ratios. The
+``mon_osd_full_ratio`` threshold defaults to ``0.95`` (or 95% of capacity):
+this is the point above which clients are prevented from writing data. The
+``mon_osd_backfillfull_ratio`` threshold defaults to ``0.90`` (or 90% of
+capacity): this is the point above which backfills will not start. The
+``mon_osd_nearfull_ratio`` threshold defaults to ``0.85`` (or 85% of capacity):
+this is the point at which the cluster raises the ``OSD_NEARFULL`` health
+check.
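+
+To view the currently configured ratios, inspect the OSD map. For example:
+
+.. prompt:: bash
+
+   ceph osd dump | grep -E 'full_ratio|backfillfull_ratio|nearfull_ratio'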
+
+OSDs within a cluster will vary in how much data is allocated to them by Ceph.
+To check "fullness" by displaying data utilization for every OSD, run the
+following command:
+
+.. prompt:: bash
+
+ ceph osd df
+
+To check "fullness" by displaying a cluster’s overall data usage and data
+distribution among pools, run the following command:
+
+.. prompt:: bash
+
+ ceph df
+
+When examining the output of the ``ceph df`` command, pay special attention to
+the **most full** OSDs, as opposed to the percentage of raw space used. If a
+single outlier OSD becomes full, all writes to this OSD's pool might fail as a
+result. When ``ceph df`` reports the space available to a pool, it considers
+the ratio settings relative to the *most full* OSD that is part of the pool. To
+flatten the distribution, two approaches are available: (1) Using the
+``reweight-by-utilization`` command to progressively move data from excessively
+full OSDs or move data to insufficiently full OSDs, and (2) in later revisions
+of Luminous and subsequent releases, exploiting the ``ceph-mgr`` ``balancer``
+module to perform the same task automatically.
+
+To adjust the "fullness" ratios, run a command or commands of the following
+form:
+
+.. prompt:: bash
+
+ ceph osd set-nearfull-ratio <float[0.0-1.0]>
+ ceph osd set-full-ratio <float[0.0-1.0]>
+ ceph osd set-backfillfull-ratio <float[0.0-1.0]>
+
+Sometimes full cluster issues arise because an OSD has failed. This can happen
+either because of a test or because the cluster is small, very full, or
+unbalanced. When an OSD or node holds an excessive percentage of the cluster's
+data, component failures or natural growth can result in the ``nearfull`` and
+``full`` ratios being exceeded. When testing Ceph's resilience to OSD failures
+on a small cluster, it is advised to leave ample free disk space and to
+consider temporarily lowering the OSD ``full ratio``, OSD ``backfillfull
+ratio``, and OSD ``nearfull ratio``.
+
+The "fullness" status of OSDs is visible in the output of the ``ceph health``
+command, as in the following example:
+
+.. prompt:: bash
+
+ ceph health
+
+::
+
+ HEALTH_WARN 1 nearfull osd(s)
+
+For details, add the ``detail`` command as in the following example:
+
+.. prompt:: bash
+
+ ceph health detail
+
+::
+
+ HEALTH_ERR 1 full osd(s); 1 backfillfull osd(s); 1 nearfull osd(s)
+ osd.3 is full at 97%
+ osd.4 is backfill full at 91%
+ osd.2 is near full at 87%
+
+To address full cluster issues, it is recommended to add capacity by adding
+OSDs. Adding new OSDs allows the cluster to redistribute data to the newly
+available storage. Also search for orphaned ``rados bench`` objects that may
+be wasting space, as shown in the sketch below.
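+
+The objects written by ``rados bench`` are named with a ``benchmark_data``
+prefix by default, so a command like the following (with a hypothetical pool
+name) can reveal leftovers:
+
+.. prompt:: bash
+
+   rados -p {pool-name} ls | grep benchmark_data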
+
+If a legacy Filestore OSD cannot be started because it is full, it is possible
+to reclaim space by deleting a small number of placement group directories in
+the full OSD.
+
+.. important:: If you choose to delete a placement group directory on a full
+ OSD, **DO NOT** delete the same placement group directory on another full
+ OSD. **OTHERWISE YOU WILL LOSE DATA**. You **MUST** maintain at least one
+ copy of your data on at least one OSD. Deleting placement group directories
+ is a rare and extreme intervention. It is not to be undertaken lightly.
+
+See `Monitor Config Reference`_ for more information.
+
+
+OSDs are Slow/Unresponsive
+==========================
+
+OSDs are sometimes slow or unresponsive. When troubleshooting this common
+problem, it is advised to eliminate other possibilities before investigating
+OSD performance issues. For example, be sure to confirm that your network(s)
+are working properly, to verify that your OSDs are running, and to check
+whether OSDs are throttling recovery traffic.
+
+.. tip:: In pre-Luminous releases of Ceph, ``up`` and ``in`` OSDs were
+ sometimes not available or were otherwise slow because recovering OSDs were
+ consuming system resources. Newer releases provide better recovery handling
+ by preventing this phenomenon.
+
+
+Networking Issues
+-----------------
+
+As a distributed storage system, Ceph relies upon networks for OSD peering and
+replication, recovery from faults, and periodic heartbeats. Networking issues
+can cause OSD latency and flapping OSDs. For more information, see `Flapping
+OSDs`_.
+
+To make sure that Ceph processes and Ceph-dependent processes are connected and
+listening, run the following commands:
+
+.. prompt:: bash
+
+ netstat -a | grep ceph
+ netstat -l | grep ceph
+ sudo netstat -p | grep ceph
+
+To check network statistics, run the following command:
+
+.. prompt:: bash
+
+ netstat -s
+
+Drive Configuration
+-------------------
+
+A SAS or SATA storage drive should house only one OSD, but an NVMe drive can
+easily house two or more. However, it is possible for read and write throughput
+to bottleneck if other processes share the drive. Such processes include:
+journals / metadata, operating systems, Ceph monitors, ``syslog`` logs, other
+OSDs, and non-Ceph processes.
+
+Because Ceph acknowledges writes *after* journaling, fast SSDs are an
+attractive option for accelerating response time -- particularly when using the
+``XFS`` or ``ext4`` filesystems for legacy FileStore OSDs. By contrast, the
+``Btrfs`` file system can write and journal simultaneously. (However, use of
+``Btrfs`` is not recommended for production deployments.)
+
+.. note:: Partitioning a drive does not change its total throughput or
+ sequential read/write limits. Throughput might be improved somewhat by
+ running a journal in a separate partition, but it is better still to run
+ such a journal in a separate physical drive.
+
+.. warning:: Reef does not support FileStore. Releases after Reef do not
+ support FileStore. Any information that mentions FileStore is pertinent only
+ to the Quincy release of Ceph and to releases prior to Quincy.
+
+
+Bad Sectors / Fragmented Disk
+-----------------------------
+
+Check your drives for bad blocks, fragmentation, and other errors that can
+cause significantly degraded performance. Tools that are useful in checking for
+drive errors include ``dmesg``, ``syslog`` logs, and ``smartctl`` (found in the
+``smartmontools`` package).
+
+.. note:: ``smartmontools`` 7.0 and later provide NVMe stat passthrough and
+ JSON output.
+
+
+Co-resident Monitors/OSDs
+-------------------------
+
+Although monitors are relatively lightweight processes, performance issues can
+result when monitors are run on the same host machine as an OSD. Monitors issue
+many ``fsync()`` calls and this can interfere with other workloads. The danger
+of performance issues is especially acute when the monitors are co-resident on
+the same storage drive as an OSD. In addition, if the monitors are running an
+older kernel (pre-3.0) or a kernel with no ``syncfs(2)`` syscall, then multiple
+OSDs running on the same host might make so many commits as to undermine each
+other's performance. This problem sometimes results in what is called "bursty
+writes".
+
+
+Co-resident Processes
+---------------------
+
+Significant OSD latency can result from processes that write data to Ceph (for
+example, cloud-based solutions and virtual machines) while operating on the
+same hardware as OSDs. For this reason, making such processes co-resident with
+OSDs is not generally recommended. Instead, the recommended practice is to
+optimize certain hosts for use with Ceph and use other hosts for other
+processes. This practice of separating Ceph operations from other applications
+might help improve performance and might also streamline troubleshooting and
+maintenance.
+
+Running co-resident processes on the same hardware is sometimes called
+"convergence". When using Ceph, engage in convergence only with expertise and
+after consideration.
+
+
+Logging Levels
+--------------
+
+Performance issues can result from high logging levels. Operators sometimes
+raise logging levels in order to track an issue and then forget to lower them
+afterwards. In such a situation, OSDs might consume valuable system resources
+writing needlessly verbose logs to disk. If you do need to run at high logging
+levels, consider mounting a separate drive at the default logging path (for
+example, ``/var/log/ceph/$cluster-$name.log``).
+
+Recovery Throttling
+-------------------
+
+Depending upon your configuration, Ceph may reduce recovery rates to maintain
+client or OSD performance, or it may increase recovery rates to the point that
+recovery impacts client or OSD performance. Check to see whether recovery is
+in progress and, if so, whether it is competing with client traffic.
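+
+For example, you can check whether recovery or backfill is in progress and,
+as a hedged sketch, throttle it by lowering the relevant options (values
+illustrative; on recent releases that use the mClock scheduler, these
+settings may be overridden by the scheduler):
+
+.. prompt:: bash
+
+   ceph -s | grep -i recover
+   ceph config set osd osd_max_backfills 1
+   ceph config set osd osd_recovery_max_active 1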
+
+
+Kernel Version
+--------------
+
+Check the kernel version that you are running. Older kernels may lack updates
+that improve Ceph performance.
+
+
+Kernel Issues with SyncFS
+-------------------------
+
+If you have kernel issues with SyncFS, try running one OSD per host to see if
+performance improves. Old kernels might not have a recent enough version of
+``glibc`` to support ``syncfs(2)``.
+
+
+Filesystem Issues
+-----------------
+
+In post-Luminous releases, we recommend deploying clusters with the BlueStore
+back end. When running a pre-Luminous release, or if you have a specific
+reason to deploy OSDs with the previous Filestore backend, we recommend
+``XFS``.
+
+We recommend against using ``Btrfs`` or ``ext4``. The ``Btrfs`` filesystem has
+many attractive features, but bugs may lead to performance issues and spurious
+ENOSPC errors. We do not recommend ``ext4`` for Filestore OSDs because
+``xattr`` limitations break support for long object names, which are needed for
+RGW.
+
+For more information, see `Filesystem Recommendations`_.
+
+.. _Filesystem Recommendations: ../configuration/filesystem-recommendations
+
+Insufficient RAM
+----------------
+
+We recommend a *minimum* of 4GB of RAM per OSD daemon and we suggest rounding
+up from 6GB to 8GB. During normal operations, you may notice that ``ceph-osd``
+processes use only a fraction of that amount. You might be tempted to use the
+excess RAM for co-resident applications or to skimp on each node's memory
+capacity. However, when OSDs experience recovery their memory utilization
+spikes. If there is insufficient RAM available during recovery, OSD performance
+will slow considerably and the daemons may even crash or be killed by the Linux
+``OOM Killer``.
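+
+If you need to bound an OSD daemon's memory consumption explicitly, the
+``osd_memory_target`` option sets its target; for example (value
+illustrative):
+
+.. prompt:: bash
+
+   ceph config set osd osd_memory_target 8589934592   # 8 GiB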
+
+
+Blocked Requests or Slow Requests
+---------------------------------
+
+When a ``ceph-osd`` daemon is slow to respond to a request, the cluster log
+receives messages reporting ops that are taking too long. The warning threshold
+defaults to 30 seconds and is configurable via the ``osd_op_complaint_time``
+setting.
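+
+For example, to raise the warning threshold to 60 seconds (an illustrative
+value):
+
+.. prompt:: bash
+
+   ceph config set osd osd_op_complaint_time 60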
+
+Legacy versions of Ceph complain about ``old requests``::
+
+ osd.0 192.168.106.220:6800/18813 312 : [WRN] old request osd_op(client.5099.0:790 fatty_26485_object789 [write 0~4096] 2.5e54f643) v4 received at 2012-03-06 15:42:56.054801 currently waiting for sub ops
+
+Newer versions of Ceph complain about ``slow requests``::
+
+ {date} {osd.num} [WRN] 1 slow requests, 1 included below; oldest blocked for > 30.005692 secs
+ {date} {osd.num} [WRN] slow request 30.005692 seconds old, received at {date-time}: osd_op(client.4240.0:8 benchmark_data_ceph-1_39426_object7 [write 0~4194304] 0.69848840) v4 currently waiting for subops from [610]
+
+Possible causes include:
+
+- A failing drive (check ``dmesg`` output)
+- A bug in the kernel file system (check ``dmesg`` output)
+- An overloaded cluster (check system load, iostat, etc.)
+- A bug in the ``ceph-osd`` daemon.
+
+Possible solutions:
+
+- Remove VMs from Ceph hosts
+- Upgrade kernel
+- Upgrade Ceph
+- Restart OSDs
+- Replace failed or failing components
+
+Debugging Slow Requests
+-----------------------
+
+If you run ``ceph daemon osd.<id> dump_historic_ops`` or ``ceph daemon osd.<id>
+dump_ops_in_flight``, you will see a set of operations and a list of events
+each operation went through. These are briefly described below.
+
+Events from the Messenger layer:
+
+- ``header_read``: The time that the messenger first started reading the message off the wire.
+- ``throttled``: The time that the messenger tried to acquire memory throttle space to read
+ the message into memory.
+- ``all_read``: The time that the messenger finished reading the message off the wire.
+- ``dispatched``: The time that the messenger gave the message to the OSD.
+- ``initiated``: This is identical to ``header_read``. The existence of both is a
+ historical oddity.
+
+Events from the OSD as it processes ops:
+
+- ``queued_for_pg``: The op has been put into the queue for processing by its PG.
+- ``reached_pg``: The PG has started performing the op.
+- ``waiting for \*``: The op is waiting for some other work to complete before
+ it can proceed (for example, a new OSDMap; the scrubbing of its object
+ target; the completion of a PG's peering; all as specified in the message).
+- ``started``: The op has been accepted as something the OSD should do and
+ is now being performed.
+- ``waiting for subops from``: The op has been sent to replica OSDs.
+
+Events from ``Filestore``:
+
+- ``commit_queued_for_journal_write``: The op has been given to the FileStore.
+- ``write_thread_in_journal_buffer``: The op is in the journal's buffer and is waiting
+ to be persisted (as the next disk write).
+- ``journaled_completion_queued``: The op was journaled to disk and its callback
+ has been queued for invocation.
+
+Events from the OSD after data has been given to underlying storage:
+
+- ``op_commit``: The op has been committed (that is, written to journal) by the
+ primary OSD.
+- ``op_applied``: The op has been `write()'en
+ <https://www.freebsd.org/cgi/man.cgi?write(2)>`_ to the backing FS (that is,
+ applied in memory but not flushed out to disk) on the primary.
+- ``sub_op_applied``: ``op_applied``, but for a replica's "subop".
+- ``sub_op_committed``: ``op_commit``, but for a replica's subop (only for EC pools).
+- ``sub_op_commit_rec/sub_op_apply_rec from <X>``: The primary marks this when it
+ hears about the above, but for a particular replica (i.e. ``<X>``).
+- ``commit_sent``: We sent a reply back to the client (or primary OSD, for sub ops).
+
+Some of these events may appear redundant, but they cross important boundaries
+in the internal code (such as passing data across locks into new threads).
+
+
+Flapping OSDs
+=============
+
+"Flapping" is the term for the phenomenon of an OSD being repeatedly marked
+``up`` and then ``down`` in rapid succession. This section explains how to
+recognize flapping, and how to mitigate it.
+
+When OSDs peer and check heartbeats, they use the cluster (back-end) network
+when it is available. See `Monitor/OSD Interaction`_ for details.
+
+The upstream Ceph community has traditionally recommended separate *public*
+(front-end) and *private* (cluster / back-end / replication) networks. This
+provides the following benefits:
+
+#. Segregation of (1) heartbeat traffic and replication/recovery traffic
+ (private) from (2) traffic from clients and between OSDs and monitors
+ (public). This helps keep one stream of traffic from DoS-ing the other,
+ which could in turn result in a cascading failure.
+
+#. Additional throughput for both public and private traffic.
+
+In the past, when common networking technologies were measured in a range
+encompassing 100Mb/s and 1Gb/s, this separation was often critical. But with
+today's 10Gb/s, 40Gb/s, and 25/50/100Gb/s networks, the above capacity concerns
+are often diminished or even obviated. For example, if your OSD nodes have two
+network ports, dedicating one to the public and the other to the private
+network means that you have no path redundancy. This degrades your ability to
+endure network maintenance and network failures without significant cluster or
+client impact. In situations like this, consider instead using both links for
+only a public network: with bonding (LACP) or equal-cost routing (for example,
+FRR) you reap the benefits of increased throughput headroom, fault tolerance,
+and reduced OSD flapping.
+
+When a private network (or even a single host link) fails or degrades while the
+public network continues operating normally, OSDs may not handle this situation
+well. In such situations, OSDs use the public network to report each other
+``down`` to the monitors, while marking themselves ``up``. The monitors then
+send out (again on the public network) an updated cluster map with the
+affected OSDs marked ``down``. These OSDs reply to the monitors "I'm not dead
+yet!", and the cycle repeats. We call this scenario "flapping", and it can be
+difficult to isolate and remediate. Without a private network, this irksome
+dynamic is avoided: OSDs are generally either ``up`` or ``down`` without
+flapping.
+
+If something does cause OSDs to 'flap' (repeatedly being marked ``down`` and
+then ``up`` again), you can force the monitors to halt the flapping by
+temporarily freezing their states:
+
+.. prompt:: bash
+
+ ceph osd set noup # prevent OSDs from getting marked up
+ ceph osd set nodown # prevent OSDs from getting marked down
+
+These flags are recorded in the osdmap:
+
+.. prompt:: bash
+
+ ceph osd dump | grep flags
+
+::
+
+ flags no-up,no-down
+
+You can clear these flags with:
+
+.. prompt:: bash
+
+ ceph osd unset noup
+ ceph osd unset nodown
+
+Two other flags are available, ``noin`` and ``noout``, which prevent booting
+OSDs from being marked ``in`` (allocated data) or protect OSDs from eventually
+being marked ``out`` (regardless of the current value of
+``mon_osd_down_out_interval``).
+
+.. note:: ``noup``, ``noout``, and ``nodown`` are temporary in the sense that
+ after the flags are cleared, the action that they were blocking should be
+ possible shortly thereafter. But the ``noin`` flag prevents OSDs from being
+ marked ``in`` on boot, and any daemons that started while the flag was set
+ will remain that way.
+
+.. note:: The causes and effects of flapping can be mitigated somewhat by
+ making careful adjustments to ``mon_osd_down_out_subtree_limit``,
+ ``mon_osd_reporter_subtree_level``, and ``mon_osd_min_down_reporters``.
+ Derivation of optimal settings depends on cluster size, topology, and the
+ Ceph release in use. The interaction of all of these factors is subtle and
+ is beyond the scope of this document.
+
+
+.. _iostat: https://en.wikipedia.org/wiki/Iostat
+.. _Ceph Logging and Debugging: ../../configuration/ceph-conf#ceph-logging-and-debugging
+.. _Logging and Debugging: ../log-and-debug
+.. _Debugging and Logging: ../debug
+.. _Monitor/OSD Interaction: ../../configuration/mon-osd-interaction
+.. _Monitor Config Reference: ../../configuration/mon-config-ref
+.. _monitoring your OSDs: ../../operations/monitoring-osd-pg
+
+.. _monitoring OSDs: ../../operations/monitoring-osd-pg/#monitoring-osds
+
+.. _subscribe to the ceph-devel email list: mailto:majordomo@vger.kernel.org?body=subscribe+ceph-devel
+.. _unsubscribe from the ceph-devel email list: mailto:majordomo@vger.kernel.org?body=unsubscribe+ceph-devel
+.. _subscribe to the ceph-users email list: mailto:ceph-users-join@lists.ceph.com
+.. _unsubscribe from the ceph-users email list: mailto:ceph-users-leave@lists.ceph.com
+.. _OS recommendations: ../../../start/os-recommendations
+.. _ceph-devel: ceph-devel@vger.kernel.org
diff --git a/doc/rados/troubleshooting/troubleshooting-pg.rst b/doc/rados/troubleshooting/troubleshooting-pg.rst
new file mode 100644
index 000000000..74d04bd9f
--- /dev/null
+++ b/doc/rados/troubleshooting/troubleshooting-pg.rst
@@ -0,0 +1,782 @@
+====================
+ Troubleshooting PGs
+====================
+
+Placement Groups Never Get Clean
+================================
+
+If, after you have created your cluster, any Placement Groups (PGs) remain in
+the ``active`` status, the ``active+remapped`` status, or the
+``active+degraded`` status and never achieve an ``active+clean`` status, you
+likely have a problem with your configuration.
+
+In such a situation, it may be necessary to review the settings in the `Pool,
+PG and CRUSH Config Reference`_ and make appropriate adjustments.
+
+As a general rule, run your cluster with more than one OSD and a pool size
+greater than two object replicas.
+
+.. _one-node-cluster:
+
+One Node Cluster
+----------------
+
+Ceph no longer provides documentation for operating on a single node. Systems
+designed for distributed computing by definition do not run on a single node.
+The mounting of client kernel modules on a single node that contains a Ceph
+daemon may cause a deadlock due to issues with the Linux kernel itself (unless
+VMs are used as clients). You can experiment with Ceph in a one-node
+configuration, in spite of the limitations as described herein.
+
+To create a cluster on a single node, you must change the
+``osd_crush_chooseleaf_type`` setting from the default of ``1`` (meaning
+``host`` or ``node``) to ``0`` (meaning ``osd``) in your Ceph configuration
+file before you create your monitors and OSDs. This tells Ceph that an OSD is
+permitted to place another OSD on the same host. If you are trying to set up a
+single-node cluster and ``osd_crush_chooseleaf_type`` is greater than ``0``,
+Ceph will attempt to place the PGs of one OSD with the PGs of another OSD on
+another node, chassis, rack, row, or datacenter depending on the setting.
+
+.. tip:: DO NOT mount kernel clients directly on the same node as your Ceph
+ Storage Cluster. Kernel conflicts can arise. However, you can mount kernel
+ clients within virtual machines (VMs) on a single node.
+
+If you are creating OSDs using a single disk, you must manually create
+directories for the data first.
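+
+As a hedged sketch (the path and OSD id here are illustrative and depend on
+your deployment):
+
+.. prompt:: bash
+
+   sudo mkdir /var/lib/ceph/osd/ceph-0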
+
+
+Fewer OSDs than Replicas
+------------------------
+
+If two OSDs are in an ``up`` and ``in`` state, but the placement groups are
+not in an ``active + clean`` state, you may have ``osd_pool_default_size``
+set to a value greater than ``2``.
+
+There are a few ways to address this situation. If you want to operate your
+cluster in an ``active + degraded`` state with two replicas, you can set the
+``osd_pool_default_min_size`` to ``2`` so that you can write objects in an
+``active + degraded`` state. You may also set the ``osd_pool_default_size``
+setting to ``2`` so that you have only two stored replicas (the original and
+one replica). In such a case, the cluster should achieve an ``active + clean``
+state.
+
+.. note:: You can make the changes while the cluster is running. If you make
+ the changes in your Ceph configuration file, you might need to restart your
+ cluster.
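+
+For example, to apply these settings to an existing pool (pool name
+illustrative), run commands of the following form:
+
+.. prompt:: bash
+
+   ceph osd pool set mypool min_size 2
+   ceph osd pool set mypool size 2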
+
+
+Pool Size = 1
+-------------
+
+If you have ``osd_pool_default_size`` set to ``1``, you will have only one copy
+of the object. OSDs rely on other OSDs to tell them which objects they should
+have. If one OSD has a copy of an object and there is no second copy, then
+there is no second OSD to tell the first OSD that it should have that copy. For
+each placement group mapped to the first OSD (see ``ceph pg dump``), you can
+force the first OSD to notice the placement groups it needs by running a
+command of the following form:
+
+.. prompt:: bash
+
+ ceph osd force-create-pg <pgid>
+
+
+CRUSH Map Errors
+----------------
+
+If any placement groups in your cluster are unclean, then there might be errors
+in your CRUSH map.
+
+
+Stuck Placement Groups
+======================
+
+It is normal for placement groups to enter "degraded" or "peering" states after
+a component failure. Normally, these states reflect the expected progression
+through the failure recovery process. However, a placement group that stays in
+one of these states for a long time might be an indication of a larger problem.
+For this reason, the Ceph Monitors will warn when placement groups get "stuck"
+in a non-optimal state. Specifically, we check for:
+
+* ``inactive`` - The placement group has not been ``active`` for too long (that
+ is, it hasn't been able to service read/write requests).
+
+* ``unclean`` - The placement group has not been ``clean`` for too long (that
+ is, it hasn't been able to completely recover from a previous failure).
+
+* ``stale`` - The placement group status has not been updated by a
+ ``ceph-osd``. This indicates that all nodes storing this placement group may
+ be ``down``.
+
+List stuck placement groups by running one of the following commands:
+
+.. prompt:: bash
+
+ ceph pg dump_stuck stale
+ ceph pg dump_stuck inactive
+ ceph pg dump_stuck unclean
+
+- Stuck ``stale`` placement groups usually indicate that key ``ceph-osd``
+ daemons are not running.
+- Stuck ``inactive`` placement groups usually indicate a peering problem (see
+ :ref:`failures-osd-peering`).
+- Stuck ``unclean`` placement groups usually indicate that something is
+ preventing recovery from completing, possibly unfound objects (see
+ :ref:`failures-osd-unfound`);
+
+
+
+.. _failures-osd-peering:
+
+Placement Group Down - Peering Failure
+======================================
+
+In certain cases, the ``ceph-osd`` `peering` process can run into problems,
+which can prevent a PG from becoming active and usable. In such a case, running
+the command ``ceph health detail`` will report something similar to the following:
+
+.. prompt:: bash
+
+ ceph health detail
+
+::
+
+ HEALTH_ERR 7 pgs degraded; 12 pgs down; 12 pgs peering; 1 pgs recovering; 6 pgs stuck unclean; 114/3300 degraded (3.455%); 1/3 in osds are down
+ ...
+ pg 0.5 is down+peering
+ pg 1.4 is down+peering
+ ...
+ osd.1 is down since epoch 69, last address 192.168.106.220:6801/8651
+
+Query the cluster to determine exactly why the PG is marked ``down`` by running a command of the following form:
+
+.. prompt:: bash
+
+ ceph pg 0.5 query
+
+.. code-block:: javascript
+
+ { "state": "down+peering",
+ ...
+ "recovery_state": [
+ { "name": "Started\/Primary\/Peering\/GetInfo",
+ "enter_time": "2012-03-06 14:40:16.169679",
+ "requested_info_from": []},
+ { "name": "Started\/Primary\/Peering",
+ "enter_time": "2012-03-06 14:40:16.169659",
+ "probing_osds": [
+ 0,
+ 1],
+ "blocked": "peering is blocked due to down osds",
+ "down_osds_we_would_probe": [
+ 1],
+ "peering_blocked_by": [
+ { "osd": 1,
+ "current_lost_at": 0,
+ "comment": "starting or marking this osd lost may let us proceed"}]},
+ { "name": "Started",
+ "enter_time": "2012-03-06 14:40:16.169513"}
+ ]
+ }
+
+The ``recovery_state`` section tells us that peering is blocked due to down
+``ceph-osd`` daemons, specifically ``osd.1``. In this case, we can start that
+particular ``ceph-osd`` and recovery will proceed.
+
+Alternatively, if there is a catastrophic failure of ``osd.1`` (for example, if
+there has been a disk failure), the cluster can be informed that the OSD is
+``lost`` and the cluster can be instructed that it must cope as best it can.
+
+.. important:: Informing the cluster that an OSD has been lost is dangerous
+ because the cluster cannot guarantee that the other copies of the data are
+ consistent and up to date.
+
+To report an OSD ``lost`` and to instruct Ceph to continue to attempt recovery
+anyway, run a command of the following form:
+
+.. prompt:: bash
+
+ ceph osd lost 1
+
+Recovery will proceed.
+
+
+.. _failures-osd-unfound:
+
+Unfound Objects
+===============
+
+Under certain combinations of failures, Ceph may complain about ``unfound``
+objects, as in this example:
+
+.. prompt:: bash
+
+ ceph health detail
+
+::
+
+ HEALTH_WARN 1 pgs degraded; 78/3778 unfound (2.065%)
+ pg 2.4 is active+degraded, 78 unfound
+
+This means that the storage cluster knows that some objects (or newer copies of
+existing objects) exist, but it hasn't found copies of them. Here is an
+example of how this might come about for a PG whose data is on two OSDs, which
+we will call "1" and "2":
+
+* 1 goes down
+* 2 handles some writes, alone
+* 1 comes up
+* 1 and 2 re-peer, and the objects missing on 1 are queued for recovery.
+* Before the new objects are copied, 2 goes down.
+
+At this point, 1 knows that these objects exist, but there is no live
+``ceph-osd`` that has a copy of the objects. In this case, IO to those objects
+will block, and the cluster will hope that the failed node comes back soon.
+This is assumed to be preferable to returning an IO error to the user.
+
+.. note:: The situation described immediately above is one reason that setting
+ ``size=2`` on a replicated pool and ``m=1`` on an erasure coded pool risks
+ data loss.
+
+Identify which objects are unfound by running a command of the following form:
+
+.. prompt:: bash
+
+ ceph pg 2.4 list_unfound [starting offset, in json]
+
+.. code-block:: javascript
+
+ {
+ "num_missing": 1,
+ "num_unfound": 1,
+ "objects": [
+ {
+ "oid": {
+ "oid": "object",
+ "key": "",
+ "snapid": -2,
+ "hash": 2249616407,
+ "max": 0,
+ "pool": 2,
+ "namespace": ""
+ },
+ "need": "43'251",
+ "have": "0'0",
+ "flags": "none",
+ "clean_regions": "clean_offsets: [], clean_omap: 0, new_object: 1",
+ "locations": [
+ "0(3)",
+ "4(2)"
+ ]
+ }
+ ],
+ "state": "NotRecovering",
+ "available_might_have_unfound": true,
+ "might_have_unfound": [
+ {
+ "osd": "2(4)",
+ "status": "osd is down"
+ }
+ ],
+ "more": false
+ }
+
+If there are too many objects to list in a single result, the ``more`` field
+will be true and you can query for more. (Eventually the command line tool
+will hide this from you, but not yet.)
+
+Now you can identify which OSDs have been probed or might contain data.
+
+At the end of the listing (before ``more: false``), ``might_have_unfound`` is
+provided when ``available_might_have_unfound`` is true. This is equivalent to
+the output of ``ceph pg #.# query``. This eliminates the need to use ``query``
+directly. The ``might_have_unfound`` information behaves the same way as the
+output of ``query``, which is described below. The only difference is that
+OSDs that have the status of ``already probed`` are ignored.
+
+Use of ``query``:
+
+.. prompt:: bash
+
+ ceph pg 2.4 query
+
+.. code-block:: javascript
+
+ "recovery_state": [
+ { "name": "Started\/Primary\/Active",
+ "enter_time": "2012-03-06 15:15:46.713212",
+ "might_have_unfound": [
+ { "osd": 1,
+ "status": "osd is down"}]},
+
+In this case, the cluster knows that ``osd.1`` might have data, but it is
+``down``. Here is the full range of possible states:
+
+* already probed
+* querying
+* OSD is down
+* not queried (yet)
+
+Sometimes it simply takes some time for the cluster to query possible
+locations.
+
+It is possible that there are other locations where the object might exist that
+are not listed. For example: if an OSD is stopped and taken out of the cluster
+and then the cluster fully recovers, and then through a subsequent set of
+failures the cluster ends up with an unfound object, the cluster will ignore
+the removed OSD. (This scenario, however, is unlikely.)
+
+If all possible locations have been queried and objects are still lost, you may
+have to give up on the lost objects. This, again, is possible only when unusual
+combinations of failures have occurred that allow the cluster to learn about
+writes that were performed before the writes themselves have been recovered. To
+mark the "unfound" objects as "lost", run a command of the following form:
+
+.. prompt:: bash
+
+ ceph pg 2.5 mark_unfound_lost revert|delete
+
+Here the final argument (``revert|delete``) specifies how the cluster should
+deal with lost objects.
+
+The ``delete`` option will cause the cluster to forget about them entirely.
+
+The ``revert`` option (which is not available for erasure coded pools) will
+either roll back to a previous version of the object or (if it was a new
+object) forget about the object entirely. Use ``revert`` with caution, as it
+may confuse applications that expect the object to exist.
+
+Homeless Placement Groups
+=========================
+
+It is possible that every OSD that has copies of a given placement group fails.
+If this happens, then the subset of the object store that contains those
+placement groups becomes unavailable and the monitor will receive no status
+updates for those placement groups. The monitor marks as ``stale`` any
+placement group whose primary OSD has failed. For example:
+
+.. prompt:: bash
+
+ ceph health
+
+::
+
+ HEALTH_WARN 24 pgs stale; 3/300 in osds are down
+
+Identify which placement groups are ``stale`` and which were the last OSDs to
+store the ``stale`` placement groups by running the following command:
+
+.. prompt:: bash
+
+ ceph health detail
+
+::
+
+ HEALTH_WARN 24 pgs stale; 3/300 in osds are down
+ ...
+ pg 2.5 is stuck stale+active+remapped, last acting [2,0]
+ ...
+ osd.10 is down since epoch 23, last address 192.168.106.220:6800/11080
+ osd.11 is down since epoch 13, last address 192.168.106.220:6803/11539
+ osd.12 is down since epoch 24, last address 192.168.106.220:6806/11861
+
+This output indicates that placement group 2.5 (``pg 2.5``) was last managed by
+``osd.0`` and ``osd.2``. Restart those OSDs to allow the cluster to recover
+that placement group.
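+
+For example, on a host that runs traditional systemd units, you might restart
+the affected daemons as follows (unit names vary; cephadm-based clusters use
+units of the form ``ceph-<fsid>@osd.<id>``):
+
+.. prompt:: bash
+
+   sudo systemctl restart ceph-osd@0 ceph-osd@2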
+
+
+Only a Few OSDs Receive Data
+============================
+
+If only a few of the nodes in the cluster are receiving data, check the number
+of placement groups in the pool as instructed in the :ref:`Placement Groups
+<rados_ops_pgs_get_pg_num>` documentation. Since placement groups get mapped to
+OSDs in an operation involving dividing the number of placement groups in the
+cluster by the number of OSDs in the cluster, a small number of placement
+groups (the remainder, in this operation) are sometimes not distributed across
+the cluster. In situations like this, create a pool with a placement group
+count that is a multiple of the number of OSDs. See `Placement Groups`_ for
+details. See the :ref:`Pool, PG, and CRUSH Config Reference
+<rados_config_pool_pg_crush_ref>` for instructions on changing the default
+values used to determine how many placement groups are assigned to each pool.
+
+
+Can't Write Data
+================
+
+If the cluster is up, but some OSDs are down and you cannot write data, make
+sure that you have the minimum number of OSDs running in the pool. If you don't
+have the minimum number of OSDs running in the pool, Ceph will not allow you to
+write data to it because there is no guarantee that Ceph can replicate your
+data. See ``osd_pool_default_min_size`` in the :ref:`Pool, PG, and CRUSH
+Config Reference <rados_config_pool_pg_crush_ref>` for details.
+
+
+PGs Inconsistent
+================
+
+If the command ``ceph health detail`` returns an ``active + clean +
+inconsistent`` state, this might indicate an error during scrubbing. Identify
+the inconsistent placement group or placement groups by running the following
+command:
+
+.. prompt:: bash
+
+ ceph health detail
+
+::
+
+ HEALTH_ERR 1 pgs inconsistent; 2 scrub errors
+ pg 0.6 is active+clean+inconsistent, acting [0,1,2]
+ 2 scrub errors
+
+Alternatively, run this command if you prefer to inspect the output in a
+programmatic way:
+
+.. prompt:: bash
+
+ rados list-inconsistent-pg rbd
+
+::
+
+ ["0.6"]
+
+There is only one consistent state, but in the worst case there could be
+multiple kinds of inconsistency, observed from multiple perspectives and in
+more than one object. If an object named ``foo`` in PG ``0.6`` is truncated,
+the output of ``rados list-inconsistent-obj 0.6`` will look something like
+this:
+
+.. prompt:: bash
+
+ rados list-inconsistent-obj 0.6 --format=json-pretty
+
+.. code-block:: javascript
+
+ {
+ "epoch": 14,
+ "inconsistents": [
+ {
+ "object": {
+ "name": "foo",
+ "nspace": "",
+ "locator": "",
+ "snap": "head",
+ "version": 1
+ },
+ "errors": [
+ "data_digest_mismatch",
+ "size_mismatch"
+ ],
+ "union_shard_errors": [
+ "data_digest_mismatch_info",
+ "size_mismatch_info"
+ ],
+ "selected_object_info": "0:602f83fe:::foo:head(16'1 client.4110.0:1 dirty|data_digest|omap_digest s 968 uv 1 dd e978e67f od ffffffff alloc_hint [0 0 0])",
+ "shards": [
+ {
+ "osd": 0,
+ "errors": [],
+ "size": 968,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0xe978e67f"
+ },
+ {
+ "osd": 1,
+ "errors": [],
+ "size": 968,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0xe978e67f"
+ },
+ {
+ "osd": 2,
+ "errors": [
+ "data_digest_mismatch_info",
+ "size_mismatch_info"
+ ],
+ "size": 0,
+ "omap_digest": "0xffffffff",
+ "data_digest": "0xffffffff"
+ }
+ ]
+ }
+ ]
+ }
+
+In this case, the output indicates the following:
+
+* The only inconsistent object is named ``foo``, and its head has
+ inconsistencies.
+* The inconsistencies fall into two categories:
+
+ #. ``errors``: these errors indicate inconsistencies between shards, without
+ an indication of which shard(s) are bad. Check for the ``errors`` in the
+ ``shards`` array, if available, to pinpoint the problem.
+
+ * ``data_digest_mismatch``: the digest of the replica read from ``OSD.2``
+ is different from the digests of the replica reads of ``OSD.0`` and
+     ``OSD.1``.
+ * ``size_mismatch``: the size of the replica read from ``OSD.2`` is ``0``,
+ but the size reported by ``OSD.0`` and ``OSD.1`` is ``968``.
+
+ #. ``union_shard_errors``: the union of all shard-specific ``errors`` in the
+ ``shards`` array. The ``errors`` are set for the shard with the problem.
+ These errors include ``read_error`` and other similar errors. The
+     ``errors`` ending in ``_info`` indicate a comparison with
+ ``selected_object_info``. Examine the ``shards`` array to determine
+ which shard has which error or errors.
+
+     * ``data_digest_mismatch_info``: the digest stored in the ``object-info``
+       differs from the digest ``0xffffffff``, which was calculated from the
+       shard read from ``OSD.2``.
+ * ``size_mismatch_info``: the size stored in the ``object-info`` is
+ different from the size read from ``OSD.2``. The latter is ``0``.
+
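+To locate the failing shard programmatically, a short sketch like the
+following can walk the JSON report shown above (the PG ID ``0.6`` is taken
+from this example):
+
+.. code-block:: python
+
+   import json
+   import subprocess
+
+   # Collect the shard-specific errors for each inconsistent object.
+   out = subprocess.check_output(
+       ["rados", "list-inconsistent-obj", "0.6", "--format=json"])
+   report = json.loads(out)
+   for item in report["inconsistents"]:
+       name = item["object"]["name"]
+       for shard in item["shards"]:
+           if shard["errors"]:
+               print(f"object {name}: osd.{shard['osd']} -> {shard['errors']}")
+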
+.. warning:: If ``read_error`` is listed in a shard's ``errors`` attribute, the
+ inconsistency is likely due to physical storage errors. In cases like this,
+ check the storage used by that OSD.
+
+ Examine the output of ``dmesg`` and ``smartctl`` before attempting a drive
+ repair.
+
+To repair the inconsistent placement group, run a command of the following
+form:
+
+.. prompt:: bash
+
+ ceph pg repair {placement-group-ID}
+
+.. warning:: This command overwrites the "bad" copies with "authoritative"
+ copies. In most cases, Ceph is able to choose authoritative copies from all
+ the available replicas by using some predefined criteria. This, however,
+ does not work in every case. For example, it might be the case that the
+ stored data digest is missing, which means that the calculated digest is
+ ignored when Ceph chooses the authoritative copies. Be aware of this, and
+ use the above command with caution.
+
+
+If you receive ``active + clean + inconsistent`` states periodically due to
+clock skew, consider configuring the `NTP
+<https://en.wikipedia.org/wiki/Network_Time_Protocol>`_ daemons on your monitor
+hosts to act as peers. See `The Network Time Protocol <http://www.ntp.org>`_
+and Ceph :ref:`Clock Settings <mon-config-ref-clock>` for more information.
+
+
+Erasure Coded PGs are not active+clean
+======================================
+
+If CRUSH fails to find enough OSDs to map to a PG, the missing OSD slots will
+show the value ``2147483647``, which is ``ITEM_NONE`` and means "no OSD
+found". For example::
+
+ [2,1,6,0,5,8,2147483647,7,4]
+
+Not enough OSDs
+---------------
+
+If the Ceph cluster has only eight OSDs and an erasure coded pool needs nine
+OSDs, the cluster will show "Not enough OSDs". In this case, you can either
+create another erasure coded pool that requires fewer OSDs, by running
+commands of the following form:
+
+.. prompt:: bash
+
+ ceph osd erasure-code-profile set myprofile k=5 m=3
+ ceph osd pool create erasurepool erasure myprofile
+
+or add new OSDs, and the PG will automatically use them.
+
+CRUSH constraints cannot be satisfied
+-------------------------------------
+
+If the cluster has enough OSDs, it is possible that the CRUSH rule is imposing
+constraints that cannot be satisfied. If there are ten OSDs on two hosts and
+the CRUSH rule requires that no two OSDs from the same host are used in the
+same PG, the mapping may fail because only two OSDs will be found. Check the
+constraint by displaying ("dumping") the rule, as shown here:
+
+.. prompt:: bash
+
+ ceph osd crush rule ls
+
+::
+
+    [
+        "replicated_rule",
+        "erasurepool"]
+
+.. prompt:: bash
+
+   ceph osd crush rule dump erasurepool
+
+::
+
+    { "rule_id": 1,
+      "rule_name": "erasurepool",
+      "type": 3,
+      "steps": [
+            { "op": "take",
+              "item": -1,
+              "item_name": "default"},
+            { "op": "chooseleaf_indep",
+              "num": 0,
+              "type": "host"},
+            { "op": "emit"}]}
+
+
+Resolve this problem by creating a new pool in which PGs are allowed to have
+OSDs residing on the same host by running the following commands:
+
+.. prompt:: bash
+
+ ceph osd erasure-code-profile set myprofile crush-failure-domain=osd
+ ceph osd pool create erasurepool erasure myprofile
+
+CRUSH gives up too soon
+-----------------------
+
+If the Ceph cluster has just enough OSDs to map the PG (for instance a cluster
+with a total of nine OSDs and an erasure coded pool that requires nine OSDs per
+PG), it is possible that CRUSH gives up before finding a mapping. This problem
+can be resolved by:
+
+* lowering the erasure coded pool requirements to use fewer OSDs per PG (this
+ requires the creation of another pool, because erasure code profiles cannot
+ be modified dynamically).
+
+* adding more OSDs to the cluster (this does not require the erasure coded pool
+ to be modified, because it will become clean automatically)
+
+* using a handmade CRUSH rule that tries more times to find a good mapping.
+  This can be done by modifying an existing CRUSH rule so that it sets
+  ``set_choose_tries`` to a value greater than the default.
+
+First, verify the problem by using ``crushtool`` after extracting the crushmap
+from the cluster. This ensures that your experiments do not modify the Ceph
+cluster and that they operate only on local files:
+
+.. prompt:: bash
+
+ ceph osd crush rule dump erasurepool
+
+::
+
+ { "rule_id": 1,
+ "rule_name": "erasurepool",
+ "type": 3,
+ "steps": [
+ { "op": "take",
+ "item": -1,
+ "item_name": "default"},
+ { "op": "chooseleaf_indep",
+ "num": 0,
+ "type": "host"},
+ { "op": "emit"}]}
+
+.. prompt:: bash
+
+   ceph osd getcrushmap > crush.map
+
+::
+
+    got crush map from osdmap epoch 13
+
+.. prompt:: bash
+
+   crushtool -i crush.map --test --show-bad-mappings \
+      --rule 1 \
+      --num-rep 9 \
+      --min-x 1 --max-x $((1024 * 1024))
+
+::
+
+    bad mapping rule 8 x 43 num_rep 9 result [3,2,7,1,2147483647,8,5,6,0]
+    bad mapping rule 8 x 79 num_rep 9 result [6,0,2,1,4,7,2147483647,5,8]
+    bad mapping rule 8 x 173 num_rep 9 result [0,4,6,8,2,1,3,7,2147483647]
+
+Here, ``--num-rep`` is the number of OSDs that the erasure code CRUSH rule
+needs, and ``--rule`` is the value of the ``rule_id`` field that was displayed
+by ``ceph osd crush rule dump``. This test will attempt to map one million
+values (in this example, the range defined by ``[--min-x,--max-x]``) and will
+display at least one bad mapping if the problem exists. If this test outputs
+nothing, all mappings have been successful and you can be assured that the
+problem with your cluster is not caused by bad mappings.
+
+Changing the value of set_choose_tries
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. Decompile the CRUSH map to edit the CRUSH rule by running the following
+ command:
+
+ .. prompt:: bash
+
+ crushtool --decompile crush.map > crush.txt
+
+#. Add the following line to the rule::
+
+ step set_choose_tries 100
+
+ The relevant part of the ``crush.txt`` file will resemble this::
+
+ rule erasurepool {
+ id 1
+ type erasure
+ step set_chooseleaf_tries 5
+ step set_choose_tries 100
+ step take default
+ step chooseleaf indep 0 type host
+ step emit
+ }
+
+#. Recompile and retest the CRUSH rule:
+
+ .. prompt:: bash
+
+ crushtool --compile crush.txt -o better-crush.map
+
+#. When all mappings succeed, display a histogram of the number of tries that
+   were necessary to find all of the mappings by using the
+ ``--show-choose-tries`` option of the ``crushtool`` command, as in the
+ following example:
+
+ .. prompt:: bash
+
+ crushtool -i better-crush.map --test --show-bad-mappings \
+ --show-choose-tries \
+ --rule 1 \
+ --num-rep 9 \
+ --min-x 1 --max-x $((1024 * 1024))
+ ...
+ 11: 42
+ 12: 44
+ 13: 54
+ 14: 45
+ 15: 35
+ 16: 34
+ 17: 30
+ 18: 25
+ 19: 19
+ 20: 22
+ 21: 20
+ 22: 17
+ 23: 13
+ 24: 16
+ 25: 13
+ 26: 11
+ 27: 11
+ 28: 13
+ 29: 11
+ 30: 10
+ 31: 6
+ 32: 5
+ 33: 10
+ 34: 3
+ 35: 7
+ 36: 5
+ 37: 2
+ 38: 5
+ 39: 5
+ 40: 2
+ 41: 5
+ 42: 4
+ 43: 1
+ 44: 2
+ 45: 2
+ 46: 3
+ 47: 1
+ 48: 0
+ ...
+ 102: 0
+ 103: 1
+ 104: 0
+ ...
+
+   This output indicates that it took eleven tries to map forty-two PGs,
+   twelve tries to map forty-four PGs, and so on. The highest number of tries
+   is the minimum value of ``set_choose_tries`` that prevents bad mappings
+ ``103`` in the above output, because it did not take more than 103 tries for
+ any PG to be mapped).
+
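+A sketch of how the minimum safe value could be extracted from this histogram
+programmatically, assuming the ``--show-choose-tries`` output has been saved
+to a hypothetical file named ``choose-tries.txt`` (one ``tries: count`` pair
+per line):
+
+.. code-block:: python
+
+   # The highest number of tries actually needed is the minimum
+   # set_choose_tries value that avoids bad mappings.
+   max_tries = 0
+   with open("choose-tries.txt") as f:  # hypothetical capture of the output
+       for line in f:
+           tries, _, count = line.strip().partition(":")
+           if tries.strip().isdigit() and count.strip().isdigit():
+               if int(count) > 0:
+                   max_tries = max(max_tries, int(tries))
+   print("set_choose_tries should be at least", max_tries)
+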
+.. _check: ../../operations/placement-groups#get-the-number-of-placement-groups
+.. _Placement Groups: ../../operations/placement-groups
+.. _Pool, PG and CRUSH Config Reference: ../../configuration/pool-pg-config-ref
diff --git a/doc/radosgw/STS.rst b/doc/radosgw/STS.rst
new file mode 100644
index 000000000..bc89b89da
--- /dev/null
+++ b/doc/radosgw/STS.rst
@@ -0,0 +1,297 @@
+===========
+STS in Ceph
+===========
+
+The Secure Token Service (STS) is a web service in AWS that returns a set of temporary security credentials for authenticating federated users.
+The official AWS documentation can be found here: https://docs.aws.amazon.com/STS/latest/APIReference/Welcome.html.
+
+Ceph Object Gateway implements a subset of STS APIs that provide temporary credentials for identity and access management.
+These temporary credentials can be used to make subsequent S3 calls which will be authenticated by the STS engine in Ceph Object Gateway.
+Permissions of the temporary credentials can be further restricted via an IAM policy passed as a parameter to the STS APIs.
+
+STS REST APIs
+=============
+
+The following STS REST APIs have been implemented in Ceph Object Gateway:
+
+1. AssumeRole: Returns a set of temporary credentials that can be used for
+cross-account access. The temporary credentials will have permissions that are
+allowed by both the permission policies attached to the role and the policy
+passed as a parameter to the AssumeRole API call.
+
+Parameters:
+ **RoleArn** (String/ Required): ARN of the Role to Assume.
+
+ **RoleSessionName** (String/ Required): An Identifier for the assumed role
+ session.
+
+ **Policy** (String/ Optional): An IAM Policy in JSON format.
+
+ **DurationSeconds** (Integer/ Optional): The duration in seconds of the session.
+ Its default value is 3600.
+
+ **ExternalId** (String/ Optional): A unique Id that might be used when a role is
+ assumed in another account.
+
+ **SerialNumber** (String/ Optional): The Id number of the MFA device associated
+ with the user making the AssumeRole call.
+
+ **TokenCode** (String/ Optional): The value provided by the MFA device, if the
+ trust policy of the role being assumed requires MFA.
+
+2. AssumeRoleWithWebIdentity: Returns a set of temporary credentials for users of
+a web/mobile app that have been authenticated by an OpenID Connect/OAuth2.0
+Identity Provider. Currently, Keycloak has been tested and integrated with RGW.
+
+Parameters:
+ **RoleArn** (String/ Required): ARN of the Role to Assume.
+
+ **RoleSessionName** (String/ Required): An Identifier for the assumed role
+ session.
+
+ **Policy** (String/ Optional): An IAM Policy in JSON format.
+
+ **DurationSeconds** (Integer/ Optional): The duration in seconds of the session.
+ Its default value is 3600.
+
+ **ProviderId** (String/ Optional): Fully qualified host component of the domain name
+ of the IDP. Valid only for OAuth2.0 tokens (not for OpenID Connect tokens).
+
+ **WebIdentityToken** (String/ Required): The OpenID Connect/ OAuth2.0 token, which the
+ application gets in return after authenticating its user with an IDP.
+
+Before invoking AssumeRoleWithWebIdentity, an OpenID Connect Provider entity (which the web application
+authenticates with) needs to be created in RGW.
+
+The trust between the IDP and the role is created by adding a condition to the role's trust policy, which
+allows access only to applications which satisfy the given condition.
+All claims of the JWT are supported in the condition of the role's trust policy.
+An example of a policy that uses the 'aud' claim in the condition is of the form::
+
+ '''{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Federated":["arn:aws:iam:::oidc-provider/<URL of IDP>"]},"Action":["sts:AssumeRoleWithWebIdentity"],"Condition":{"StringEquals":{"<URL of IDP> :app_id":"<aud>"}}}]}'''
+
+The app_id in the condition above must match the 'aud' claim of the incoming token.
+
+An example of a policy that uses the 'sub' claim in the condition is of the form::
+
+ "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"Federated\":[\"arn:aws:iam:::oidc-provider/<URL of IDP>\"]},\"Action\":[\"sts:AssumeRoleWithWebIdentity\"],\"Condition\":{\"StringEquals\":{\"<URL of IDP> :sub\":\"<sub>\"\}\}\}\]\}"
+
+Similarly, an example of a policy that uses 'azp' claim in the condition is of the form::
+
+ "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"Federated\":[\"arn:aws:iam:::oidc-provider/<URL of IDP>\"]},\"Action\":[\"sts:AssumeRoleWithWebIdentity\"],\"Condition\":{\"StringEquals\":{\"<URL of IDP> :azp\":\"<azp>\"\}\}\}\]\}"
+
+A shadow user is created for every federated user. The user id is derived from
+the 'sub' field of the incoming web token. The user is created in a separate
+namespace, 'oidc', so that the user id doesn't clash with any other user ids in
+RGW. The format of the user id is <tenant>$<user-namespace>$<sub>, where
+user-namespace is 'oidc' for users that authenticate with OIDC providers.
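+
+As an illustration, the claims that these conditions and the user id are
+matched against can be inspected by base64-decoding the token's payload. This
+sketch skips signature verification and assumes ``token`` holds the raw JWT
+string:
+
+.. code-block:: python
+
+   import base64
+   import json
+
+   def jwt_claims(token):
+       # The payload is the second dot-separated segment of the JWT;
+       # base64 padding must be restored before decoding.
+       payload = token.split(".")[1]
+       payload += "=" * (-len(payload) % 4)
+       return json.loads(base64.urlsafe_b64decode(payload))
+
+   claims = jwt_claims(token)  # 'token' assumed to hold the raw JWT
+   print(claims.get("aud"), claims.get("sub"), claims.get("azp"))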
+
+RGW now supports session tags that can be passed in the web token to the
+AssumeRoleWithWebIdentity call. More information about session tags can be found at
+:doc:`session-tags`.
+
+STS Configuration
+=================
+
+The following configurable options have to be added for STS integration::
+
+ [client.{your-rgw-name}]
+ rgw_sts_key = {sts key for encrypting the session token}
+ rgw_s3_auth_use_sts = true
+
+Notes:
+
+* By default, STS and S3 APIs co-exist in the same namespace, and both S3
+ and STS APIs can be accessed via the same endpoint in Ceph Object Gateway.
+* The ``rgw_sts_key`` needs to be a hex-string consisting of exactly 16 characters.
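+
+For example, a suitable value can be generated with Python's standard library
+(8 random bytes encode to 16 hex characters):
+
+.. code-block:: python
+
+   import secrets
+
+   # 8 random bytes -> a 16-character hex string for rgw_sts_key.
+   print(secrets.token_hex(8))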
+
+Examples
+========
+1. In order to get the example to work, make sure that the user TESTER has the ``roles`` capability assigned:
+
+.. code-block:: console
+
+ radosgw-admin caps add --uid="TESTER" --caps="roles=*"
+
+2. The following is an example of the AssumeRole API call, which shows how to create a role, attach a policy to it
+   (one that allows access to S3 resources), assume the role to get temporary credentials, and access S3 resources
+   using those credentials. In this example, TESTER1 assumes a role created by TESTER, to access S3 resources owned
+   by TESTER, according to the permission policy attached to the role.
+
+.. code-block:: python
+
+ import boto3
+
+ iam_client = boto3.client('iam',
+ aws_access_key_id=<access_key of TESTER>,
+ aws_secret_access_key=<secret_key of TESTER>,
+ endpoint_url=<IAM URL>,
+ region_name=''
+ )
+
+ policy_document = '''{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["arn:aws:iam:::user/TESTER1"]},"Action":["sts:AssumeRole"]}]}'''
+
+ role_response = iam_client.create_role(
+ AssumeRolePolicyDocument=policy_document,
+ Path='/',
+ RoleName='S3Access',
+ )
+
+ role_policy = '''{"Version":"2012-10-17","Statement":{"Effect":"Allow","Action":"s3:*","Resource":"arn:aws:s3:::*"}}'''
+
+ response = iam_client.put_role_policy(
+ RoleName='S3Access',
+ PolicyName='Policy1',
+ PolicyDocument=role_policy
+ )
+
+ sts_client = boto3.client('sts',
+ aws_access_key_id=<access_key of TESTER1>,
+ aws_secret_access_key=<secret_key of TESTER1>,
+ endpoint_url=<STS URL>,
+ region_name='',
+ )
+
+ response = sts_client.assume_role(
+ RoleArn=role_response['Role']['Arn'],
+ RoleSessionName='Bob',
+ DurationSeconds=3600
+ )
+
+ s3client = boto3.client('s3',
+ aws_access_key_id = response['Credentials']['AccessKeyId'],
+ aws_secret_access_key = response['Credentials']['SecretAccessKey'],
+ aws_session_token = response['Credentials']['SessionToken'],
+ endpoint_url=<S3 URL>,
+ region_name='',)
+
+ bucket_name = 'my-bucket'
+ s3bucket = s3client.create_bucket(Bucket=bucket_name)
+ resp = s3client.list_buckets()
+
+3. The following is an example of the AssumeRoleWithWebIdentity API call, in which an external app whose users have
+been authenticated with an OpenID Connect/OAuth2 IDP (Keycloak in this example) assumes a role to get temporary
+credentials and access S3 resources according to the permission policy of the role.
+
+.. code-block:: python
+
+ import boto3
+
+ iam_client = boto3.client('iam',
+ aws_access_key_id=<access_key of TESTER>,
+ aws_secret_access_key=<secret_key of TESTER>,
+ endpoint_url=<IAM URL>,
+ region_name=''
+ )
+
+ oidc_response = iam_client.create_open_id_connect_provider(
+                    Url=<URL of the OpenID Connect Provider>,
+ ClientIDList=[
+ <Client id registered with the IDP>
+ ],
+ ThumbprintList=[
+ <Thumbprint of the IDP>
+ ]
+ )
+
+ policy_document = '''{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Federated":["arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/demo"]},"Action":["sts:AssumeRoleWithWebIdentity"],"Condition":{"StringEquals":{"localhost:8080/auth/realms/demo:app_id":"customer-portal"}}}]}'''
+ role_response = iam_client.create_role(
+ AssumeRolePolicyDocument=policy_document,
+ Path='/',
+ RoleName='S3Access',
+ )
+
+ role_policy = '''{"Version":"2012-10-17","Statement":{"Effect":"Allow","Action":"s3:*","Resource":"arn:aws:s3:::*"}}'''
+
+ response = iam_client.put_role_policy(
+ RoleName='S3Access',
+ PolicyName='Policy1',
+ PolicyDocument=role_policy
+ )
+
+ sts_client = boto3.client('sts',
+ aws_access_key_id=<access_key of TESTER1>,
+ aws_secret_access_key=<secret_key of TESTER1>,
+ endpoint_url=<STS URL>,
+ region_name='',
+ )
+
+    response = sts_client.assume_role_with_web_identity(
+ RoleArn=role_response['Role']['Arn'],
+ RoleSessionName='Bob',
+ DurationSeconds=3600,
+ WebIdentityToken=<Web Token>
+ )
+
+ s3client = boto3.client('s3',
+ aws_access_key_id = response['Credentials']['AccessKeyId'],
+ aws_secret_access_key = response['Credentials']['SecretAccessKey'],
+ aws_session_token = response['Credentials']['SessionToken'],
+ endpoint_url=<S3 URL>,
+ region_name='',)
+
+ bucket_name = 'my-bucket'
+ s3bucket = s3client.create_bucket(Bucket=bucket_name)
+ resp = s3client.list_buckets()
+
+How to obtain thumbprint of an OpenID Connect Provider IDP
+==========================================================
+1. Take the OpenID Connect provider's URL and append /.well-known/openid-configuration
+to it to form the URL of the IDP's configuration document. For example, if the URL
+of the IDP is http://localhost:8000/auth/realms/quickstart, then the URL of the
+configuration document is http://localhost:8000/auth/realms/quickstart/.well-known/openid-configuration
+
+2. Use the following curl command to get the configuration document from the URL described
+in step 1::
+
+ curl -k -v \
+ -X GET \
+ -H "Content-Type: application/x-www-form-urlencoded" \
+ "http://localhost:8000/auth/realms/quickstart/.well-known/openid-configuration" \
+ | jq .
+
+3. From the response of step 2, use the value of "jwks_uri" to get the
+certificate of the IDP, using the following command::
+
+    curl -k -v \
+         -X GET \
+         -H "Content-Type: application/x-www-form-urlencoded" \
+         "http://$KC_SERVER/$KC_CONTEXT/realms/$KC_REALM/protocol/openid-connect/certs" \
+         | jq .
+
+4. Copy the value of "x5c" from the response above into a file named
+certificate.crt, and add '-----BEGIN CERTIFICATE-----' at the beginning and
+'-----END CERTIFICATE-----' at the end.
+
+5. Use the following OpenSSL command to get the certificate thumbprint::
+
+ openssl x509 -in certificate.crt -fingerprint -noout
+
+6. The result of the command in step 5 will be a SHA1 fingerprint, like the following::
+
+ SHA1 Fingerprint=F7:D7:B3:51:5D:D0:D3:19:DD:21:9A:43:A9:EA:72:7A:D6:06:52:87
+
+7. Remove the colons from the result above to get the final thumbprint, which
+can be used as input when creating the OpenID Connect Provider entity in IAM::
+
+ F7D7B3515DD0D319DD219A43A9EA727AD6065287
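+
+The same thumbprint can be computed directly from the certificate file with a
+few lines of Python (a sketch that assumes the ``certificate.crt`` assembled
+in step 4):
+
+.. code-block:: python
+
+   import hashlib
+   import ssl
+
+   # SHA1 over the DER form of the certificate yields the thumbprint.
+   with open("certificate.crt") as f:
+       der = ssl.PEM_cert_to_DER_cert(f.read())
+   print(hashlib.sha1(der).hexdigest().upper())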
+
+Roles in RGW
+============
+
+More information about role manipulation can be found at
+:doc:`role`.
+
+OpenID Connect Provider in RGW
+==============================
+
+More information about OpenID Connect Provider entity manipulation
+can be found at
+:doc:`oidc`.
+
+Keycloak integration with Radosgw
+=================================
+
+Steps for integrating Radosgw with Keycloak can be found at
+:doc:`keycloak`.
+
+STSLite
+=======
+STSLite has been built on STS, and its documentation can be found at
+:doc:`STSLite`.
\ No newline at end of file
diff --git a/doc/radosgw/STSLite.rst b/doc/radosgw/STSLite.rst
new file mode 100644
index 000000000..7880e373f
--- /dev/null
+++ b/doc/radosgw/STSLite.rst
@@ -0,0 +1,196 @@
+=========
+STS Lite
+=========
+
+Ceph Object Gateway provides support for a subset of Amazon Secure Token Service
+(STS) APIs. STS Lite is an extension of STS and builds upon one of its APIs to
+decrease the load on external IDPs like Keystone and LDAP.
+
+A set of temporary security credentials is returned after authenticating
+a set of AWS credentials with the external IDP. These temporary credentials can be used
+to make subsequent S3 calls which will be authenticated by the STS engine in Ceph,
+resulting in less load on the Keystone/LDAP server.
+
+Temporary and limited-privilege credentials can also be obtained for a local
+user by using the STS Lite API.
+
+STS Lite REST APIs
+==================
+
+The following STS Lite REST API is part of STS Lite in Ceph Object Gateway:
+
+1. GetSessionToken: Returns a set of temporary credentials for a set of AWS
+credentials. After initial authentication with Keystone/LDAP, the temporary
+credentials returned can be used to make subsequent S3 calls. The temporary
+credentials will have the same permissions as the AWS credentials.
+
+Parameters:
+  **DurationSeconds** (Integer/ Optional): The duration in seconds for which the
+  credentials should remain valid. Its default value is 3600, and its default
+  maximum value is 43200, which can be configured using
+  ``rgw_sts_max_session_duration``.
+
+ **SerialNumber** (String/ Optional): The Id number of the MFA device associated
+ with the user making the GetSessionToken call.
+
+ **TokenCode** (String/ Optional): The value provided by the MFA device, if MFA is required.
+
+An administrative user needs to attach a policy that allows invocation of the
+GetSessionToken API using the user's permanent credentials, and that allows
+subsequent S3 operations to be invoked using only the temporary credentials
+returned by GetSessionToken.
+
+The user attaching the policy needs to have admin caps. For example::
+
+ radosgw-admin caps add --uid="TESTER" --caps="user-policy=*"
+
+The following is the policy that needs to be attached to a user 'TESTER1'::
+
+ user_policy = "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Deny\",\"Action\":\"s3:*\",\"Resource\":[\"*\"],\"Condition\":{\"BoolIfExists\":{\"sts:authentication\":\"false\"}}},{\"Effect\":\"Allow\",\"Action\":\"sts:GetSessionToken\",\"Resource\":\"*\",\"Condition\":{\"BoolIfExists\":{\"sts:authentication\":\"false\"}}}]}"
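+
+One way to attach this policy is via the IAM PutUserPolicy API with boto3; the
+following is a sketch in which the endpoint URL and policy name are
+assumptions, and ``user_policy`` is the JSON string shown above:
+
+.. code-block:: python
+
+   import boto3
+
+   # TESTER (who holds the admin cap added above) attaches the policy
+   # to TESTER1 via the IAM PutUserPolicy API.
+   iam_client = boto3.client('iam',
+                             aws_access_key_id=<access_key of TESTER>,
+                             aws_secret_access_key=<secret_key of TESTER>,
+                             endpoint_url=<IAM URL>,
+                             region_name='')
+
+   iam_client.put_user_policy(UserName='TESTER1',
+                              PolicyName='Policy1',
+                              PolicyDocument=user_policy)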
+
+
+STS Lite Configuration
+======================
+
+The following configurable options are available for STS Lite integration::
+
+ [client.radosgw.gateway]
+ rgw sts key = {sts key for encrypting the session token}
+ rgw s3 auth use sts = true
+
+The above STS configurables can be used with the Keystone configurables if one
+needs to use STS Lite in conjunction with Keystone. The complete set of
+configurable options will be::
+
+ [client.{your-rgw-name}]
+ rgw_sts_key = {sts key for encrypting/ decrypting the session token, exactly 16 hex characters}
+ rgw_s3_auth_use_sts = true
+
+ rgw keystone url = {keystone server url:keystone server admin port}
+ rgw keystone admin project = {keystone admin project name}
+ rgw keystone admin tenant = {keystone service tenant name}
+ rgw keystone admin domain = {keystone admin domain name}
+ rgw keystone api version = {keystone api version}
+ rgw keystone implicit tenants = {true for private tenant for each new user}
+    rgw keystone admin user = {keystone service tenant user name}
+    rgw keystone admin password = {keystone service tenant user password}
+ rgw keystone accepted roles = {accepted user roles}
+ rgw keystone token cache size = {number of tokens to cache}
+ rgw s3 auth use keystone = true
+
+The details of integrating Keystone with Ceph Object Gateway can be found here:
+:doc:`keystone`
+
+The complete set of configurables to use STS Lite with LDAP are::
+
+ [client.{your-rgw-name}]
+ rgw_sts_key = {sts key for encrypting/ decrypting the session token, exactly 16 hex characters}
+ rgw_s3_auth_use_sts = true
+
+ rgw_s3_auth_use_ldap = true
+ rgw_ldap_uri = {LDAP server to use}
+ rgw_ldap_binddn = {Distinguished Name (DN) of the service account}
+ rgw_ldap_secret = {password for the service account}
+ rgw_ldap_searchdn = {base in the directory information tree for searching users}
+ rgw_ldap_dnattr = {attribute being used in the constructed search filter to match a username}
+ rgw_ldap_searchfilter = {search filter}
+
+The details of integrating LDAP with Ceph Object Gateway can be found here:
+:doc:`ldap-auth`
+
+Note: By default, STS and S3 APIs co-exist in the same namespace, and both S3
+and STS APIs can be accessed via the same endpoint in Ceph Object Gateway.
+
+Example showing how to Use STS Lite with Keystone
+=================================================
+
+The following are the steps needed to use STS Lite with Keystone. Boto 3.x has
+been used to write example code showing the integration of STS Lite with
+Keystone.
+
+1. Generate EC2 credentials:
+
+.. code-block:: console
+
+ openstack ec2 credentials create
+ +------------+--------------------------------------------------------+
+ | Field | Value |
+ +------------+--------------------------------------------------------+
+ | access | b924dfc87d454d15896691182fdeb0ef |
+ | links | {u'self': u'http://192.168.0.15/identity/v3/users/ |
+ | | 40a7140e424f493d8165abc652dc731c/credentials/ |
+ | | OS-EC2/b924dfc87d454d15896691182fdeb0ef'} |
+ | project_id | c703801dccaf4a0aaa39bec8c481e25a |
+ | secret | 6a2142613c504c42a94ba2b82147dc28 |
+ | trust_id | None |
+ | user_id | 40a7140e424f493d8165abc652dc731c |
+ +------------+--------------------------------------------------------+
+
+2. Use the credentials created in step 1 to get back a set of temporary
+   credentials using the GetSessionToken API.
+
+.. code-block:: python
+
+ import boto3
+
+ access_key = <ec2 access key>
+ secret_key = <ec2 secret key>
+
+ client = boto3.client('sts',
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ endpoint_url=<STS URL>,
+ region_name='',
+ )
+
+ response = client.get_session_token(
+ DurationSeconds=43200
+ )
+
+3. The temporary credentials obtained in step 2 can be used for making S3 calls:
+
+.. code-block:: python
+
+ s3client = boto3.client('s3',
+ aws_access_key_id = response['Credentials']['AccessKeyId'],
+ aws_secret_access_key = response['Credentials']['SecretAccessKey'],
+ aws_session_token = response['Credentials']['SessionToken'],
+ endpoint_url=<S3 URL>,
+ region_name='')
+
+ bucket = s3client.create_bucket(Bucket='my-new-shiny-bucket')
+ response = s3client.list_buckets()
+ for bucket in response["Buckets"]:
+ print("{name}\t{created}".format(
+ name = bucket['Name'],
+ created = bucket['CreationDate'],
+ ))
+
+Similar steps can be performed for using GetSessionToken with LDAP.
+
+Limitations and Workarounds
+===========================
+
+1. Keystone currently supports only S3 requests. Hence, in order to
+successfully authenticate an STS request, the following workaround needs to be
+added to boto, in the file botocore/auth.py.
+
+Lines 13-16 of the code block below have been added as the workaround:
+
+.. code-block:: python
+
+ class SigV4Auth(BaseSigner):
+ """
+ Sign a request with Signature V4.
+ """
+ REQUIRES_REGION = True
+
+ def __init__(self, credentials, service_name, region_name):
+ self.credentials = credentials
+ # We initialize these value here so the unit tests can have
+ # valid values. But these will get overridden in ``add_auth``
+ # later for real requests.
+ self._region_name = region_name
+ if service_name == 'sts':
+ self._service_name = 's3'
+ else:
+ self._service_name = service_name
+
diff --git a/doc/radosgw/admin.rst b/doc/radosgw/admin.rst
new file mode 100644
index 000000000..8d70252fe
--- /dev/null
+++ b/doc/radosgw/admin.rst
@@ -0,0 +1,715 @@
+=============
+ Admin Guide
+=============
+
+Once you have your Ceph Object Storage service up and running, you may
+administer the service with user management, access controls, quotas
+and usage tracking among other features.
+
+
+User Management
+===============
+
+Ceph Object Storage user management refers to users of the Ceph Object Storage
+service (i.e., not the Ceph Object Gateway as a user of the Ceph Storage
+Cluster). You must create a user, access key and secret to enable end users to
+interact with Ceph Object Gateway services.
+
+There are two user types:
+
+- **User:** The term 'user' reflects a user of the S3 interface.
+
+- **Subuser:** The term 'subuser' reflects a user of the Swift interface. A subuser
+  is associated with a user.
+
+.. ditaa::
+ +---------+
+ | User |
+ +----+----+
+ |
+ | +-----------+
+ +-----+ Subuser |
+ +-----------+
+
+You can create, modify, view, suspend and remove users and subusers. In addition
+to user and subuser IDs, you may add a display name and an email address for a
+user. You can specify a key and secret, or generate a key and secret
+automatically. When generating or specifying keys, note that user IDs correspond
+to an S3 key type and subuser IDs correspond to a swift key type. Swift keys
+also have access levels of ``read``, ``write``, ``readwrite`` and ``full``.
+
+
+Create a User
+-------------
+
+To create a user (S3 interface), execute the following::
+
+ radosgw-admin user create --uid={username} --display-name="{display-name}" [--email={email}]
+
+For example::
+
+ radosgw-admin user create --uid=johndoe --display-name="John Doe" --email=john@example.com
+
+.. code-block:: javascript
+
+ { "user_id": "johndoe",
+ "display_name": "John Doe",
+ "email": "john@example.com",
+ "suspended": 0,
+ "max_buckets": 1000,
+ "subusers": [],
+ "keys": [
+ { "user": "johndoe",
+ "access_key": "11BS02LGFB6AL6H1ADMW",
+ "secret_key": "vzCEkuryfn060dfee4fgQPqFrncKEIkh3ZcdOANY"}],
+ "swift_keys": [],
+ "caps": [],
+ "op_mask": "read, write, delete",
+ "default_placement": "",
+ "placement_tags": [],
+ "bucket_quota": { "enabled": false,
+ "max_size_kb": -1,
+ "max_objects": -1},
+ "user_quota": { "enabled": false,
+ "max_size_kb": -1,
+ "max_objects": -1},
+ "temp_url_keys": []}
+
+Creating a user also creates an ``access_key`` and ``secret_key`` entry for use
+with any S3 API-compatible client.
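+
+For example, the key pair shown above can be used with boto3 against the
+gateway's S3 endpoint (a sketch; the endpoint URL is an assumption):
+
+.. code-block:: python
+
+   import boto3
+
+   # Use the access/secret key returned by 'radosgw-admin user create'.
+   s3client = boto3.client('s3',
+                           aws_access_key_id='11BS02LGFB6AL6H1ADMW',
+                           aws_secret_access_key='vzCEkuryfn060dfee4fgQPqFrncKEIkh3ZcdOANY',
+                           endpoint_url=<S3 URL>,
+                           region_name='')
+
+   s3client.create_bucket(Bucket='my-bucket')
+   print([b['Name'] for b in s3client.list_buckets()['Buckets']])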
+
+.. important:: Check the key output. Sometimes ``radosgw-admin``
+   generates a JSON escape (``\``) character, and some clients
+   do not know how to handle JSON escape characters. Remedies include
+   removing the JSON escape character (``\``), encapsulating the string
+   in quotes, regenerating the key to ensure that it
+   does not have a JSON escape character, or specifying the key and secret
+   manually.
+
+
+Create a Subuser
+----------------
+
+To create a subuser (Swift interface) for the user, you must specify the user ID
+(``--uid={username}``), a subuser ID and the access level for the subuser. ::
+
+ radosgw-admin subuser create --uid={uid} --subuser={uid} --access=[ read | write | readwrite | full ]
+
+For example::
+
+ radosgw-admin subuser create --uid=johndoe --subuser=johndoe:swift --access=full
+
+
+.. note:: ``full`` is not ``readwrite``, as it also includes the access control policy.
+
+.. code-block:: javascript
+
+ { "user_id": "johndoe",
+ "display_name": "John Doe",
+ "email": "john@example.com",
+ "suspended": 0,
+ "max_buckets": 1000,
+ "subusers": [
+ { "id": "johndoe:swift",
+ "permissions": "full-control"}],
+ "keys": [
+ { "user": "johndoe",
+ "access_key": "11BS02LGFB6AL6H1ADMW",
+ "secret_key": "vzCEkuryfn060dfee4fgQPqFrncKEIkh3ZcdOANY"}],
+ "swift_keys": [],
+ "caps": [],
+ "op_mask": "read, write, delete",
+ "default_placement": "",
+ "placement_tags": [],
+ "bucket_quota": { "enabled": false,
+ "max_size_kb": -1,
+ "max_objects": -1},
+ "user_quota": { "enabled": false,
+ "max_size_kb": -1,
+ "max_objects": -1},
+ "temp_url_keys": []}
+
+
+Get User Info
+-------------
+
+To get information about a user, you must specify ``user info`` and the user ID
+(``--uid={username}``). ::
+
+ radosgw-admin user info --uid=johndoe
+
+
+
+Modify User Info
+----------------
+
+To modify information about a user, you must specify the user ID (``--uid={username}``)
+and the attributes you want to modify. Typical modifications are to keys and secrets,
+email addresses, display names and access levels. For example::
+
+ radosgw-admin user modify --uid=johndoe --display-name="John E. Doe"
+
+To modify subuser values, specify ``subuser modify``, user ID and the subuser ID. For example::
+
+ radosgw-admin subuser modify --uid=johndoe --subuser=johndoe:swift --access=full
+
+
+User Enable/Suspend
+-------------------
+
+When you create a user, the user is enabled by default. However, you may suspend
+user privileges and re-enable them at a later time. To suspend a user, specify
+``user suspend`` and the user ID. ::
+
+ radosgw-admin user suspend --uid=johndoe
+
+To re-enable a suspended user, specify ``user enable`` and the user ID. ::
+
+ radosgw-admin user enable --uid=johndoe
+
+.. note:: Disabling the user disables the subuser.
+
+
+Remove a User
+-------------
+
+When you remove a user, the user and subuser are removed from the system.
+However, you may remove just the subuser if you wish. To remove a user (and
+subuser), specify ``user rm`` and the user ID. ::
+
+ radosgw-admin user rm --uid=johndoe
+
+To remove the subuser only, specify ``subuser rm`` and the subuser ID. ::
+
+ radosgw-admin subuser rm --subuser=johndoe:swift
+
+
+Options include:
+
+- **Purge Data:** The ``--purge-data`` option purges all data associated
+  with the UID.
+
+- **Purge Keys:** The ``--purge-keys`` option purges all keys associated
+  with the UID.
+
+
+Remove a Subuser
+----------------
+
+When you remove a subuser, you are removing access to the Swift interface.
+The user will remain in the system. To remove the subuser, specify
+``subuser rm`` and the subuser ID. ::
+
+ radosgw-admin subuser rm --subuser=johndoe:swift
+
+
+
+Options include:
+
+- **Purge Keys:** The ``--purge-keys`` option purges all keys associated
+  with the UID.
+
+
+Add / Remove a Key
+------------------------
+
+Both users and subusers require a key to access the S3 or Swift interface. To
+use S3, the user needs a key pair composed of an access key and a secret key.
+To use Swift, the user typically needs a secret key (password) and uses it
+together with the associated user ID. You may create a key and either specify
+or generate the access key and/or secret key. You may also remove a key.
+Options include:
+
+- ``--key-type=<type>`` specifies the key type. The options are ``s3`` and ``swift``.
+- ``--access-key=<key>`` manually specifies an S3 access key.
+- ``--secret-key=<key>`` manually specifies a S3 secret key or a Swift secret key.
+- ``--gen-access-key`` automatically generates a random S3 access key.
+- ``--gen-secret`` automatically generates a random S3 secret key or a random Swift secret key.
+
+An example of how to add a specified S3 key pair for a user. ::
+
+ radosgw-admin key create --uid=foo --key-type=s3 --access-key fooAccessKey --secret-key fooSecretKey
+
+.. code-block:: javascript
+
+ { "user_id": "foo",
+ "rados_uid": 0,
+ "display_name": "foo",
+ "email": "foo@example.com",
+ "suspended": 0,
+ "keys": [
+ { "user": "foo",
+ "access_key": "fooAccessKey",
+ "secret_key": "fooSecretKey"}],
+ }
+
+Note that you may create multiple S3 key pairs for a user.
+
+To attach a specified Swift secret key to a subuser. ::
+
+ radosgw-admin key create --subuser=foo:bar --key-type=swift --secret-key barSecret
+
+.. code-block:: javascript
+
+ { "user_id": "foo",
+ "rados_uid": 0,
+ "display_name": "foo",
+ "email": "foo@example.com",
+ "suspended": 0,
+ "subusers": [
+ { "id": "foo:bar",
+ "permissions": "full-control"}],
+ "swift_keys": [
+ { "user": "foo:bar",
+ "secret_key": "asfghjghghmgm"}]}
+
+Note that a subuser can have only one Swift secret key.
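+
+For example, this secret key can then be used with python-swiftclient against
+the gateway's Swift endpoint (a sketch; the auth URL is an assumption):
+
+.. code-block:: python
+
+   import swiftclient
+
+   # Authenticate as the subuser 'foo:bar' with the Swift secret key
+   # shown above; the auth URL is typically http://{fqdn}:{port}/auth/v1.0.
+   conn = swiftclient.Connection(
+       authurl=<Swift auth URL>,
+       user='foo:bar',
+       key='asfghjghghmgm')
+
+   conn.put_container('my-container')
+   print(conn.get_account())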
+
+Subusers can also be used with S3 APIs if the subuser is associated with an S3 key pair. ::
+
+ radosgw-admin key create --subuser=foo:bar --key-type=s3 --access-key barAccessKey --secret-key barSecretKey
+
+.. code-block:: javascript
+
+ { "user_id": "foo",
+ "rados_uid": 0,
+ "display_name": "foo",
+ "email": "foo@example.com",
+ "suspended": 0,
+ "subusers": [
+ { "id": "foo:bar",
+ "permissions": "full-control"}],
+ "keys": [
+ { "user": "foo:bar",
+ "access_key": "barAccessKey",
+ "secret_key": "barSecretKey"}],
+ }
+
+
+To remove an S3 key pair, specify the access key. ::
+
+ radosgw-admin key rm --uid=foo --key-type=s3 --access-key=fooAccessKey
+
+To remove the Swift secret key. ::
+
+ radosgw-admin key rm --subuser=foo:bar --key-type=swift
+
+
+Add / Remove Admin Capabilities
+-------------------------------
+
+The Ceph Storage Cluster provides an administrative API that enables users to
+execute administrative functions via the REST API. By default, users do NOT have
+access to this API. To enable a user to exercise administrative functionality,
+provide the user with administrative capabilities.
+
+To add administrative capabilities to a user, execute the following::
+
+ radosgw-admin caps add --uid={uid} --caps={caps}
+
+
+You can add read, write or all capabilities to users, buckets, metadata and
+usage (utilization). For example::
+
+ --caps="[users|buckets|metadata|usage|zone|amz-cache|info|bilog|mdlog|datalog|user-policy|oidc-provider|roles|ratelimit]=[*|read|write|read, write]"
+
+For example::
+
+ radosgw-admin caps add --uid=johndoe --caps="users=*;buckets=*"
+
+
+To remove administrative capabilities from a user, execute the following::
+
+ radosgw-admin caps rm --uid=johndoe --caps={caps}
+
+
+Quota Management
+================
+
+The Ceph Object Gateway enables you to set quotas on users and buckets owned by
+users. Quotas include the maximum number of objects in a bucket and the maximum
+storage size a bucket can hold.
+
+- **Bucket:** The ``--bucket`` option allows you to specify a quota for
+ buckets the user owns.
+
+- **Maximum Objects:** The ``--max-objects`` setting allows you to specify
+ the maximum number of objects. A negative value disables this setting.
+
+- **Maximum Size:** The ``--max-size`` option allows you to specify a quota
+ size in B/K/M/G/T, where B is the default. A negative value disables this setting.
+
+- **Quota Scope:** The ``--quota-scope`` option sets the scope for the quota.
+ The options are ``bucket`` and ``user``. Bucket quotas apply to buckets a
+ user owns. User quotas apply to a user.
+
+
+Set User Quota
+--------------
+
+Before you enable a quota, you must first set the quota parameters.
+For example::
+
+ radosgw-admin quota set --quota-scope=user --uid=<uid> [--max-objects=<num objects>] [--max-size=<max size>]
+
+For example::
+
+ radosgw-admin quota set --quota-scope=user --uid=johndoe --max-objects=1024 --max-size=1024B
+
+
+A negative value for num objects and / or max size means that the
+specific quota attribute check is disabled.
+
+
+Enable/Disable User Quota
+-------------------------
+
+Once you set a user quota, you may enable it. For example::
+
+ radosgw-admin quota enable --quota-scope=user --uid=<uid>
+
+You may disable an enabled user quota. For example::
+
+ radosgw-admin quota disable --quota-scope=user --uid=<uid>
+
+
+Set Bucket Quota
+----------------
+
+Bucket quotas apply to the buckets owned by the specified ``uid``. They are
+independent of the user. ::
+
+    radosgw-admin quota set --uid=<uid> --quota-scope=bucket [--max-objects=<num objects>] [--max-size=<max size>]
+
+A negative value for num objects and / or max size means that the
+specific quota attribute check is disabled.
+
+
+Enable/Disable Bucket Quota
+---------------------------
+
+Once you set a bucket quota, you may enable it. For example::
+
+ radosgw-admin quota enable --quota-scope=bucket --uid=<uid>
+
+You may disable an enabled bucket quota. For example::
+
+ radosgw-admin quota disable --quota-scope=bucket --uid=<uid>
+
+
+Get Quota Settings
+------------------
+
+You may access each user's quota settings via the user information
+API. To read user quota setting information with the CLI interface,
+execute the following::
+
+ radosgw-admin user info --uid=<uid>
+
+
+Update Quota Stats
+------------------
+
+Quota stats get updated asynchronously. You can update quota
+statistics for all users and all buckets manually to retrieve
+the latest quota stats. ::
+
+ radosgw-admin user stats --uid=<uid> --sync-stats
+
+.. _rgw_user_usage_stats:
+
+Get User Usage Stats
+--------------------
+
+To see how much of the quota a user has consumed, execute the following::
+
+ radosgw-admin user stats --uid=<uid>
+
+.. note:: You should execute ``radosgw-admin user stats`` with the
+ ``--sync-stats`` option to receive the latest data.
+
+Default Quotas
+--------------
+
+You can set default quotas in the config. These defaults are used when
+creating a new user and have no effect on existing users. If the
+relevant default quota is set in config, then that quota is set on the
+new user, and that quota is enabled. See ``rgw bucket default quota max objects``,
+``rgw bucket default quota max size``, ``rgw user default quota max objects``, and
+``rgw user default quota max size`` in `Ceph Object Gateway Config Reference`_.
+
+Quota Cache
+-----------
+
+Quota statistics are cached on each RGW instance. If there are multiple
+instances, then the cache can keep quotas from being perfectly enforced, as
+each instance will have a different view of quotas. The options that control
+this are ``rgw bucket quota ttl``, ``rgw user quota bucket sync interval`` and
+``rgw user quota sync interval``. The higher these values are, the more
+efficient quota operations are, but the more out-of-sync multiple instances
+will be. The lower these values are, the closer to perfect enforcement
+multiple instances will achieve. If all three are 0, then quota caching is
+effectively disabled, and multiple instances will have perfect quota
+enforcement. See `Ceph Object Gateway Config Reference`_
+
+Reading / Writing Global Quotas
+-------------------------------
+
+You can read and write global quota settings in the period configuration. To
+view the global quota settings::
+
+ radosgw-admin global quota get
+
+The global quota settings can be manipulated with the ``global quota``
+counterparts of the ``quota set``, ``quota enable``, and ``quota disable``
+commands. ::
+
+ radosgw-admin global quota set --quota-scope bucket --max-objects 1024
+ radosgw-admin global quota enable --quota-scope bucket
+
+.. note:: In a multisite configuration, where there is a realm and period
+ present, changes to the global quotas must be committed using ``period
+ update --commit``. If there is no period present, the rados gateway(s) must
+ be restarted for the changes to take effect.
+
+
+Rate Limit Management
+=====================
+
+The Ceph Object Gateway makes it possible to set rate limits on users and
+buckets. "Rate limit" includes the maximum number of read operations (read
+ops) and write operations (write ops) per minute and the number of bytes per
+minute that can be written or read per user or per bucket.
+
+Operations that use the ``GET`` method or the ``HEAD`` method in their REST
+requests are "read requests". All other requests are "write requests".
+
+Each object gateway tracks per-user metrics separately from bucket metrics.
+These metrics are not shared with other gateways. The configured limits should
+be divided by the number of active object gateways. For example, if "user A" is
+to be limited to 10 ops per minute and there are two object gateways in the
+cluster, then the limit on "user A" should be ``5`` (10 ops per minute / 2
+RGWs). If the requests are **not** balanced between RGWs, the rate limit might
+be underutilized. For example: if the ops limit is ``5`` and there are two
+RGWs, **but** the Load Balancer sends load to only one of those RGWs, the
+effective limit is 5 ops, because this limit is enforced per RGW. If the rate
+limit that has been set for the bucket has been reached but the rate limit that
+has been set for the user has not been reached, then the request is cancelled.
+The contrary holds as well: if the rate limit that has been set for the user
+has been reached but the rate limit that has been set for the bucket has not
+been reached, then the request is cancelled.
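+
+As a minimal illustration of dividing an aggregate limit across gateways
+(numbers taken from the example above):
+
+.. code-block:: python
+
+   # Desired aggregate limit for "user A", spread across active RGWs.
+   aggregate_ops_per_minute = 10
+   active_rgws = 2
+   per_rgw_limit = aggregate_ops_per_minute // active_rgws
+   print(per_rgw_limit)  # -> 5, the value to configure per RGW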
+
+The accounting of bandwidth happens only after a request has been accepted.
+This means that requests will proceed even if the bucket rate limit or user
+rate limit is reached during the execution of the request. The RGW keeps track
+of a "debt" consisting of bytes used in excess of the configured value; users
+or buckets that incur this kind of debt are prevented from sending more
+requests until the "debt" has been repaid. The maximum size of the "debt" is
+twice the max-read/write-bytes per minute. If "user A" is subject to a 1-byte
+read limit per minute and they attempt to GET an object that is 1 GB in size,
+then the ``GET`` action will fail. After "user A" has completed this 1 GB
+operation, RGW blocks the user's requests for up to two minutes. After this
+time has elapsed, "user A" will be able to send ``GET`` requests again.
+
+
+- **Bucket:** The ``--bucket`` option allows you to specify a rate limit for a
+ bucket.
+
+- **User:** The ``--uid`` option allows you to specify a rate limit for a
+ user.
+
+- **Maximum Read Ops:** The ``--max-read-ops`` setting allows you to specify
+ the maximum number of read ops per minute per RGW. A 0 value disables this setting (which means unlimited access).
+
+- **Maximum Read Bytes:** The ``--max-read-bytes`` setting allows you to specify
+ the maximum number of read bytes per minute per RGW. A 0 value disables this setting (which means unlimited access).
+
+- **Maximum Write Ops:** The ``--max-write-ops`` setting allows you to specify
+ the maximum number of write ops per minute per RGW. A 0 value disables this setting (which means unlimited access).
+
+- **Maximum Write Bytes:** The ``--max-write-bytes`` setting allows you to specify
+ the maximum number of write bytes per minute per RGW. A 0 value disables this setting (which means unlimited access).
+
+- **Rate Limit Scope:** The ``--ratelimit-scope`` option sets the scope for the rate limit.
+  The options are ``bucket``, ``user`` and ``anonymous``. The bucket rate limit
+  applies to buckets. The user rate limit applies to a user. The anonymous rate
+  limit applies to unauthenticated users. The anonymous scope is available only
+  for the global rate limit.
+
+
+Set User Rate Limit
+-------------------
+
+Before you enable a rate limit, you must first set the rate limit parameters.
+For example::
+
+ radosgw-admin ratelimit set --ratelimit-scope=user --uid=<uid> <[--max-read-ops=<num ops>] [--max-read-bytes=<num bytes>]
+ [--max-write-ops=<num ops>] [--max-write-bytes=<num bytes>]>
+
+For example::
+
+ radosgw-admin ratelimit set --ratelimit-scope=user --uid=johndoe --max-read-ops=1024 --max-write-bytes=10240
+
+
+A 0 value for num ops and / or num bytes means that the
+specific rate limit attribute check is disabled.
+
+Get User Rate Limit
+-------------------
+
+Get the currently configured rate limit parameters. For example::
+
+ radosgw-admin ratelimit get --ratelimit-scope=user --uid=<uid>
+
+For example::
+
+ radosgw-admin ratelimit get --ratelimit-scope=user --uid=johndoe
+
+
+A 0 value for num ops and / or num bytes means that the
+specific rate limit attribute check is disabled.
+
+
+Enable/Disable User Rate Limit
+------------------------------
+
+Once you set a user rate limit, you may enable it. For example::
+
+ radosgw-admin ratelimit enable --ratelimit-scope=user --uid=<uid>
+
+You may disable an enabled user rate limit. For example::
+
+ radosgw-admin ratelimit disable --ratelimit-scope=user --uid=johndoe
+
+
+Set Bucket Rate Limit
+---------------------
+
+Before you enable a rate limit, you must first set the rate limit parameters.
+For example::
+
+ radosgw-admin ratelimit set --ratelimit-scope=bucket --bucket=<bucket> <[--max-read-ops=<num ops>] [--max-read-bytes=<num bytes>]
+ [--max-write-ops=<num ops>] [--max-write-bytes=<num bytes>]>
+
+For example::
+
+ radosgw-admin ratelimit set --ratelimit-scope=bucket --bucket=mybucket --max-read-ops=1024 --max-write-bytes=10240
+
+
+A 0 value for num ops and / or num bytes means that the
+specific rate limit attribute check is disabled.
+
+Get Bucket Rate Limit
+---------------------
+
+Get the currently configured rate limit parameters. For example::
+
+    radosgw-admin ratelimit get --ratelimit-scope=bucket --bucket=<bucket>
+
+For example::
+
+ radosgw-admin ratelimit get --ratelimit-scope=bucket --bucket=mybucket
+
+
+A 0 value for num ops and / or num bytes means that the
+specific rate limit attribute check is disabled.
+
+
+Enable/Disable Bucket Rate Limit
+--------------------------------
+
+Once you set a bucket rate limit, you may enable it. For example::
+
+ radosgw-admin ratelimit enable --ratelimit-scope=bucket --bucket=<bucket>
+
+You may disable an enabled bucket rate limit. For example::
+
+    radosgw-admin ratelimit disable --ratelimit-scope=bucket --bucket=mybucket
+
+
+Reading / Writing Global Rate Limit Configuration
+-------------------------------------------------
+
+You can read and write global rate limit settings in the period configuration. To
+view the global rate limit settings::
+
+ radosgw-admin global ratelimit get
+
+The global rate limit settings can be manipulated with the ``global ratelimit``
+counterparts of the ``ratelimit set``, ``ratelimit enable``, and ``ratelimit disable``
+commands. Per-user and per-bucket rate limit configuration overrides the global configuration::
+
+ radosgw-admin global ratelimit set --ratelimit-scope bucket --max-read-ops=1024
+ radosgw-admin global ratelimit enable --ratelimit-scope bucket
+
+The global rate limit can be used to configure the rate limit scope for all authenticated users::
+
+ radosgw-admin global ratelimit set --ratelimit-scope user --max-read-ops=1024
+ radosgw-admin global ratelimit enable --ratelimit-scope user
+
+The global rate limit can be used to configure the rate limit scope for all unauthenticated users::
+
+ radosgw-admin global ratelimit set --ratelimit-scope=anonymous --max-read-ops=1024
+ radosgw-admin global ratelimit enable --ratelimit-scope=anonymous
+
+.. note:: In a multisite configuration, where there is a realm and period
+ present, changes to the global rate limit must be committed using ``period
+ update --commit``. If there is no period present, the rados gateway(s) must
+ be restarted for the changes to take effect.
+
+Usage
+=====
+
+The Ceph Object Gateway logs usage for each user. You can track
+user usage within date ranges too.
+
+- Add ``rgw enable usage log = true`` in the ``[client.rgw]`` section of ceph.conf and restart the radosgw service.
+
+Options include:
+
+- **Start Date:** The ``--start-date`` option allows you to filter usage
+ stats from a particular start date and an optional start time
+ (**format:** ``yyyy-mm-dd [HH:MM:SS]``).
+
+- **End Date:** The ``--end-date`` option allows you to filter usage up
+ to a particular date and an optional end time
+ (**format:** ``yyyy-mm-dd [HH:MM:SS]``).
+
+- **Log Entries:** The ``--show-log-entries`` option allows you to specify
+ whether or not to include log entries with the usage stats
+ (options: ``true`` | ``false``).
+
+.. note:: You may specify time with minutes and seconds, but it is stored
+ with 1 hour resolution.
+
+
+Show Usage
+----------
+
+To show usage statistics, specify ``usage show``. To show usage for a
+particular user, you must specify a user ID. You may also specify a start date,
+end date, and whether or not to show log entries. ::
+
+ radosgw-admin usage show --uid=johndoe --start-date=2012-03-01 --end-date=2012-04-01
+
+You may also show a summary of usage information for all users by omitting a user ID. ::
+
+ radosgw-admin usage show --show-log-entries=false
+
+
+Trim Usage
+----------
+
+With heavy use, usage logs can begin to take up storage space. You can trim
+usage logs for all users and for specific users. You may also specify date
+ranges for trim operations. ::
+
+ radosgw-admin usage trim --start-date=2010-01-01 --end-date=2010-12-31
+ radosgw-admin usage trim --uid=johndoe
+ radosgw-admin usage trim --uid=johndoe --end-date=2013-12-31
+
+
+.. _radosgw-admin: ../../man/8/radosgw-admin/
+.. _Pool Configuration: ../../rados/configuration/pool-pg-config-ref/
+.. _Ceph Object Gateway Config Reference: ../config-ref/
diff --git a/doc/radosgw/adminops.rst b/doc/radosgw/adminops.rst
new file mode 100644
index 000000000..0974b95c5
--- /dev/null
+++ b/doc/radosgw/adminops.rst
@@ -0,0 +1,2166 @@
+.. _radosgw admin ops:
+
+==================
+ Admin Operations
+==================
+
+An admin API request is made on a URI that starts with the configurable 'admin'
+resource entry point. Authorization for the admin API duplicates the S3 authorization
+mechanism. Some operations require that the user hold special administrative capabilities.
+The response entity type (XML or JSON) may be specified as the 'format' option in the
+request and defaults to JSON if not specified.
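+
+As an illustration, such a request can be signed with any S3-compatible
+signer. This sketch uses the third-party requests-aws4auth helper; the
+endpoint, entry point, region, and credentials are assumptions:
+
+.. code-block:: python
+
+   import requests
+   from requests_aws4auth import AWS4Auth
+
+   # S3-style SigV4 authorization against the 'admin' entry point; the
+   # key must belong to a user holding the relevant admin caps.
+   auth = AWS4Auth(<access_key>, <secret_key>, 'us-east-1', 's3')
+   resp = requests.get('http://{fqdn}/admin/info',
+                       params={'format': 'json'}, auth=auth)
+   print(resp.json())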
+
+Info
+====
+
+Get RGW cluster/endpoint information.
+
+:caps: info=read
+
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/info?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+None.
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains an ``info`` section.
+
+``info``
+
+:Description: A container for all returned information.
+:Type: Container
+
+``cluster_id``
+
+:Description: The (typically unique) identifier for the controlling
+ backing store for the RGW cluster. In the typical case,
+              this is the value returned from librados::rados::cluster_fsid().
+:Type: String
+:Parent: ``info``
+
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+None.
+
+
+Get Usage
+=========
+
+Request bandwidth usage information.
+
+Note: this feature is disabled by default; it can be enabled by setting ``rgw
+enable usage log = true`` in the appropriate section of ceph.conf. For changes
+in ceph.conf to take effect, a radosgw process restart is needed.
+
+:caps: usage=read
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/usage?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user for which the information is requested. If not specified, the request will apply to all users.
+:Type: String
+:Example: ``foo_user``
+:Required: No
+
+``start``
+
+:Description: Date and (optional) time that specifies the start time of the requested data.
+:Type: String
+:Example: ``2012-09-25 16:00:00``
+:Required: No
+
+``end``
+
+:Description: Date and (optional) time that specifies the end time of the requested data (non-inclusive).
+:Type: String
+:Example: ``2012-09-25 16:00:00``
+:Required: No
+
+
+``show-entries``
+
+:Description: Specifies whether data entries should be returned.
+:Type: Boolean
+:Example: True [True]
+:Required: No
+
+
+``show-summary``
+
+:Description: Specifies whether data summary should be returned.
+:Type: Boolean
+:Example: True [True]
+:Required: No
+
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the requested information.
+
+``usage``
+
+:Description: A container for the usage information.
+:Type: Container
+
+``entries``
+
+:Description: A container for the usage entries information.
+:Type: Container
+
+``user``
+
+:Description: A container for the user data information.
+:Type: Container
+
+``owner``
+
+:Description: The name of the user that owns the buckets.
+:Type: String
+
+``bucket``
+
+:Description: The bucket name.
+:Type: String
+
+``time``
+
+:Description: Time lower bound for which data is being specified (rounded to the beginning of the first relevant hour).
+:Type: String
+
+``epoch``
+
+:Description: The time specified in seconds since 1/1/1970.
+:Type: String
+
+``categories``
+
+:Description: A container for stats categories.
+:Type: Container
+
+``entry``
+
+:Description: A container for stats entry.
+:Type: Container
+
+``category``
+
+:Description: Name of request category for which the stats are provided.
+:Type: String
+
+``bytes_sent``
+
+:Description: Number of bytes sent by the RADOS Gateway.
+:Type: Integer
+
+``bytes_received``
+
+:Description: Number of bytes received by the RADOS Gateway.
+:Type: Integer
+
+``ops``
+
+:Description: Number of operations.
+:Type: Integer
+
+``successful_ops``
+
+:Description: Number of successful operations.
+:Type: Integer
+
+``summary``
+
+:Description: A container for stats summary.
+:Type: Container
+
+``total``
+
+:Description: A container for stats summary aggregated total.
+:Type: Container
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+TBD.
+
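+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request for one user's usage over a time window (parameter
+values are examples only)::
+
+    GET /{admin}/usage?uid=foo_user&start=2012-09-25%2016:00:00&show-summary=True&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+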
+Trim Usage
+==========
+
+Remove usage information. With no dates specified, removes all usage
+information.
+
+Note: this feature is disabled by default. It can be enabled by setting ``rgw
+enable usage log = true`` in the appropriate section of ceph.conf. For changes
+in ceph.conf to take effect, the radosgw process must be restarted.
+
+:caps: usage=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/usage?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user for which the information is requested. If not specified, the request applies to all users.
+:Type: String
+:Example: ``foo_user``
+:Required: No
+
+``start``
+
+:Description: Date and (optional) time that specifies the start time of the requested data.
+:Type: String
+:Example: ``2012-09-25 16:00:00``
+:Required: No
+
+``end``
+
+:Description: Date and (optional) time that specifies the end time of the requested data (non-inclusive).
+:Type: String
+:Example: ``2012-09-25 16:00:00``
+:Required: No
+
+
+``remove-all``
+
+:Description: Required when ``uid`` is not specified, in order to acknowledge multi-user data removal.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+TBD.
+
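+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request that trims one user's usage entries up to a given end
+time (values are examples only)::
+
+    DELETE /{admin}/usage?uid=foo_user&end=2012-09-25%2016:00:00&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+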
+Get User Info
+=============
+
+Get user information.
+
+:caps: users=read
+
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/user?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user for which the information is requested.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the user information.
+
+``user``
+
+:Description: A container for the user data information.
+:Type: Container
+
+``user_id``
+
+:Description: The user id.
+:Type: String
+:Parent: ``user``
+
+``display_name``
+
+:Description: Display name for the user.
+:Type: String
+:Parent: ``user``
+
+``suspended``
+
+:Description: True if the user is suspended.
+:Type: Boolean
+:Parent: ``user``
+
+``max_buckets``
+
+:Description: The maximum number of buckets to be owned by the user.
+:Type: Integer
+:Parent: ``user``
+
+``subusers``
+
+:Description: Subusers associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+``keys``
+
+:Description: S3 keys associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+``swift_keys``
+
+:Description: Swift keys associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+``caps``
+
+:Description: User capabilities.
+:Type: Container
+:Parent: ``user``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+None.
+
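+Example Request
+~~~~~~~~~~~~~~~
+
+For example::
+
+    GET /{admin}/user?uid=foo_user&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+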
+Create User
+===========
+
+Create a new user. By default, an S3 key pair will be created automatically
+and returned in the response. If only one of ``access-key`` or ``secret-key``
+is provided, the omitted key will be automatically generated. By default, a
+generated key is added to the keyring without replacing an existing key pair.
+If ``access-key`` is specified and refers to an existing key owned by the user
+then it will be modified.
+
+.. versionadded:: Luminous
+
+A ``tenant`` may either be specified as a part of uid or as an additional
+request param.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{admin}/user?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID to be created.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+A tenant name may also be specified as part of ``uid``, following the syntax
+``tenant$user``; refer to :ref:`Multitenancy <rgw-multitenancy>` for more details.
+
+``display-name``
+
+:Description: The display name of the user to be created.
+:Type: String
+:Example: ``foo user``
+:Required: Yes
+
+
+``email``
+
+:Description: The email address associated with the user.
+:Type: String
+:Example: ``foo@bar.com``
+:Required: No
+
+``key-type``
+
+:Description: Key type to be generated, options are: swift, s3 (default).
+:Type: String
+:Example: ``s3`` [``s3``]
+:Required: No
+
+``access-key``
+
+:Description: Specify access key.
+:Type: String
+:Example: ``ABCD0EF12GHIJ2K34LMN``
+:Required: No
+
+
+``secret-key``
+
+:Description: Specify secret key.
+:Type: String
+:Example: ``0AbCDEFg1h2i34JklM5nop6QrSTUV+WxyzaBC7D8``
+:Required: No
+
+``user-caps``
+
+:Description: User capabilities.
+:Type: String
+:Example: ``usage=read, write; users=read``
+:Required: No
+
+``generate-key``
+
+:Description: Generate a new key pair and add to the existing keyring.
+:Type: Boolean
+:Example: True [True]
+:Required: No
+
+``max-buckets``
+
+:Description: Specify the maximum number of buckets the user can own.
+:Type: Integer
+:Example: 500 [1000]
+:Required: No
+
+``suspended``
+
+:Description: Specify whether the user should be suspended.
+:Type: Boolean
+:Example: False [False]
+:Required: No
+
+.. versionadded:: Jewel
+
+``tenant``
+
+:Description: The tenant under which the user will be created.
+:Type: String
+:Example: ``tenant1``
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the user information.
+
+``user``
+
+:Description: A container for the user data information.
+:Type: Container
+
+``tenant``
+
+:Description: The tenant that the user is a part of.
+:Type: String
+:Parent: ``user``
+
+``user_id``
+
+:Description: The user id.
+:Type: String
+:Parent: ``user``
+
+``display_name``
+
+:Description: Display name for the user.
+:Type: String
+:Parent: ``user``
+
+``suspended``
+
+:Description: True if the user is suspended.
+:Type: Boolean
+:Parent: ``user``
+
+``max_buckets``
+
+:Description: The maximum number of buckets to be owned by the user.
+:Type: Integer
+:Parent: ``user``
+
+``subusers``
+
+:Description: Subusers associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+``keys``
+
+:Description: S3 keys associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+``swift_keys``
+
+:Description: Swift keys associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+``caps``
+
+:Description: User capabilities.
+:Type: Container
+:Parent: ``user``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``UserExists``
+
+:Description: Attempt to create existing user.
+:Code: 409 Conflict
+
+``InvalidAccessKey``
+
+:Description: Invalid access key specified.
+:Code: 400 Bad Request
+
+``InvalidKeyType``
+
+:Description: Invalid key type specified.
+:Code: 400 Bad Request
+
+``InvalidSecretKey``
+
+:Description: Invalid secret key specified.
+:Code: 400 Bad Request
+
+``KeyExists``
+
+:Description: Provided access key exists and belongs to another user.
+:Code: 409 Conflict
+
+``EmailExists``
+
+:Description: Provided email address exists.
+:Code: 409 Conflict
+
+``InvalidCapability``
+
+:Description: Attempt to grant invalid admin capability.
+:Code: 400 Bad Request
+
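+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request that creates a user with an auto-generated S3 key
+pair (values are examples only)::
+
+    PUT /{admin}/user?uid=foo_user&display-name=foo%20user&email=foo@bar.com&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}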
+
+Modify User
+===========
+
+Modify a user.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{admin}/user?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID to be modified.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+``display-name``
+
+:Description: The display name of the user to be modified.
+:Type: String
+:Example: ``foo user``
+:Required: No
+
+``email``
+
+:Description: The email address to be associated with the user.
+:Type: String
+:Example: ``foo@bar.com``
+:Required: No
+
+``generate-key``
+
+:Description: Generate a new key pair and add to the existing keyring.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+``access-key``
+
+:Description: Specify access key.
+:Type: String
+:Example: ``ABCD0EF12GHIJ2K34LMN``
+:Required: No
+
+``secret-key``
+
+:Description: Specify secret key.
+:Type: String
+:Example: ``0AbCDEFg1h2i34JklM5nop6QrSTUV+WxyzaBC7D8``
+:Required: No
+
+``key-type``
+
+:Description: Key type to be generated, options are: swift, s3 (default).
+:Type: String
+:Example: ``s3``
+:Required: No
+
+``max-buckets``
+
+:Description: Specify the maximum number of buckets the user can own.
+:Type: Integer
+:Example: 500 [1000]
+:Required: No
+
+``suspended``
+
+:Description: Specify whether the user should be suspended.
+:Type: Boolean
+:Example: False [False]
+:Required: No
+
+``op-mask``
+
+:Description: The op-mask of the user to be modified.
+:Type: String
+:Example: ``read, write, delete, *``
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the user information.
+
+``user``
+
+:Description: A container for the user data information.
+:Type: Container
+
+``user_id``
+
+:Description: The user id.
+:Type: String
+:Parent: ``user``
+
+``display_name``
+
+:Description: Display name for the user.
+:Type: String
+:Parent: ``user``
+
+
+``suspended``
+
+:Description: True if the user is suspended.
+:Type: Boolean
+:Parent: ``user``
+
+
+``max_buckets``
+
+:Description: The maximum number of buckets to be owned by the user.
+:Type: Integer
+:Parent: ``user``
+
+
+``subusers``
+
+:Description: Subusers associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+
+``keys``
+
+:Description: S3 keys associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+
+``swift_keys``
+
+:Description: Swift keys associated with this user account.
+:Type: Container
+:Parent: ``user``
+
+
+``caps``
+
+:Description: User capabilities.
+:Type: Container
+:Parent: ``user``
+
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``InvalidAccessKey``
+
+:Description: Invalid access key specified.
+:Code: 400 Bad Request
+
+``InvalidKeyType``
+
+:Description: Invalid key type specified.
+:Code: 400 Bad Request
+
+``InvalidSecretKey``
+
+:Description: Invalid secret key specified.
+:Code: 400 Bad Request
+
+``KeyExists``
+
+:Description: Provided access key exists and belongs to another user.
+:Code: 409 Conflict
+
+``EmailExists``
+
+:Description: Provided email address exists.
+:Code: 409 Conflict
+
+``InvalidCapability``
+
+:Description: Attempt to grant invalid admin capability.
+:Code: 400 Bad Request
+
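+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request that raises a user's bucket limit (values are
+examples only)::
+
+    POST /{admin}/user?uid=foo_user&max-buckets=2000&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+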
+Remove User
+===========
+
+Remove an existing user.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/user?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID to be removed.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+``purge-data``
+
+:Description: When specified the buckets and objects belonging
+ to the user will also be removed.
+:Type: Boolean
+:Example: True
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+None.
+
+Create Subuser
+==============
+
+Create a new subuser (primarily useful for clients using the Swift API).
+Note that in general for a subuser to be useful, it must be granted
+permissions by specifying ``access``. As with user creation, if
+``subuser`` is specified without ``secret``, then a secret key will
+be automatically generated.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{admin}/user?subuser&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID under which a subuser is to be created.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+
+``subuser``
+
+:Description: Specify the subuser ID to be created.
+:Type: String
+:Example: ``sub_foo``
+:Required: Yes
+
+``secret-key``
+
+:Description: Specify secret key.
+:Type: String
+:Example: ``0AbCDEFg1h2i34JklM5nop6QrSTUV+WxyzaBC7D8``
+:Required: No
+
+``key-type``
+
+:Description: Key type to be generated, options are: swift (default), s3.
+:Type: String
+:Example: ``swift`` [``swift``]
+:Required: No
+
+``access``
+
+:Description: Set access permissions for sub-user, should be one
+ of ``read, write, readwrite, full``.
+:Type: String
+:Example: ``read``
+:Required: No
+
+``generate-secret``
+
+:Description: Generate the secret key.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the subuser information.
+
+
+``subusers``
+
+:Description: Subusers associated with the user account.
+:Type: Container
+
+``id``
+
+:Description: Subuser id.
+:Type: String
+:Parent: ``subusers``
+
+``permissions``
+
+:Description: Subuser access to user account.
+:Type: String
+:Parent: ``subusers``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``SubuserExists``
+
+:Description: Specified subuser exists.
+:Code: 409 Conflict
+
+``InvalidKeyType``
+
+:Description: Invalid key type specified.
+:Code: 400 Bad Request
+
+``InvalidSecretKey``
+
+:Description: Invalid secret key specified.
+:Code: 400 Bad Request
+
+``InvalidAccess``
+
+:Description: Invalid subuser access specified.
+:Code: 400 Bad Request
+
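+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request that creates a Swift subuser with full access and an
+auto-generated secret (values are examples only)::
+
+    PUT /{admin}/user?subuser&uid=foo_user&subuser=sub_foo&access=full&generate-secret=True&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+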
+Modify Subuser
+==============
+
+Modify an existing subuser.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{admin}/user?subuser&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID under which the subuser is to be modified.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+``subuser``
+
+:Description: The subuser ID to be modified.
+:Type: String
+:Example: ``sub_foo``
+:Required: Yes
+
+``generate-secret``
+
+:Description: Generate a new secret key for the subuser,
+ replacing the existing key.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+``secret``
+
+:Description: Specify secret key.
+:Type: String
+:Example: ``0AbCDEFg1h2i34JklM5nop6QrSTUV+WxyzaBC7D8``
+:Required: No
+
+``key-type``
+
+:Description: Key type to be generated, options are: swift (default), s3.
+:Type: String
+:Example: ``swift`` [``swift``]
+:Required: No
+
+``access``
+
+:Description: Set access permissions for sub-user, should be one
+ of ``read, write, readwrite, full``.
+:Type: String
+:Example: ``read``
+:Required: No
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the subuser information.
+
+
+``subusers``
+
+:Description: Subusers associated with the user account.
+:Type: Container
+
+``id``
+
+:Description: Subuser id.
+:Type: String
+:Parent: ``subusers``
+
+``permissions``
+
+:Description: Subuser access to user account.
+:Type: String
+:Parent: ``subusers``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``InvalidKeyType``
+
+:Description: Invalid key type specified.
+:Code: 400 Bad Request
+
+``InvalidSecretKey``
+
+:Description: Invalid secret key specified.
+:Code: 400 Bad Request
+
+``InvalidAccess``
+
+:Description: Invalid subuser access specified.
+:Code: 400 Bad Request
+
+Remove Subuser
+==============
+
+Remove an existing subuser.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/user?subuser&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID under which the subuser is to be removed.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+
+``subuser``
+
+:Description: The subuser ID to be removed.
+:Type: String
+:Example: ``sub_foo``
+:Required: Yes
+
+``purge-keys``
+
+:Description: Remove keys belonging to the subuser.
+:Type: Boolean
+:Example: True [True]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+None.
+
+Create Key
+==========
+
+Create a new key. If a ``subuser`` is specified then, by default, the created keys
+will be of the swift type. If only one of ``access-key`` or ``secret-key`` is provided, the
+omitted key will be automatically generated; that is, if only ``secret-key`` is
+specified then ``access-key`` will be automatically generated. By default, a
+generated key is added to the keyring without replacing an existing key pair.
+If ``access-key`` is specified and refers to an existing key owned by the user
+then it will be modified. The response is a container listing all keys of the same
+type as the key created. Note that when creating a swift key, specifying the option
+``access-key`` will have no effect. Additionally, only one swift key may be held by
+each user or subuser.
+
+:caps: users=write
+
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{admin}/user?key&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID to receive the new key.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+``subuser``
+
+:Description: The subuser ID to receive the new key.
+:Type: String
+:Example: ``sub_foo``
+:Required: No
+
+``key-type``
+
+:Description: Key type to be generated, options are: swift, s3 (default).
+:Type: String
+:Example: ``s3`` [``s3``]
+:Required: No
+
+``access-key``
+
+:Description: Specify the access key.
+:Type: String
+:Example: ``AB01C2D3EF45G6H7IJ8K``
+:Required: No
+
+``secret-key``
+
+:Description: Specify the secret key.
+:Type: String
+:Example: ``0ab/CdeFGhij1klmnopqRSTUv1WxyZabcDEFgHij``
+:Required: No
+
+``generate-key``
+
+:Description: Generate a new key pair and add to the existing keyring.
+:Type: Boolean
+:Example: True [True]
+:Required: No
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+``keys``
+
+:Description: Keys of type created associated with this user account.
+:Type: Container
+
+``user``
+
+:Description: The user account associated with the key.
+:Type: String
+:Parent: ``keys``
+
+``access-key``
+
+:Description: The access key.
+:Type: String
+:Parent: ``keys``
+
+``secret-key``
+
+:Description: The secret key
+:Type: String
+:Parent: ``keys``
+
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``InvalidAccessKey``
+
+:Description: Invalid access key specified.
+:Code: 400 Bad Request
+
+``InvalidKeyType``
+
+:Description: Invalid key type specified.
+:Code: 400 Bad Request
+
+``InvalidSecretKey``
+
+:Description: Invalid secret key specified.
+:Code: 400 Bad Request
+
+``KeyExists``
+
+:Description: Provided access key exists and belongs to another user.
+:Code: 409 Conflict
+
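+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request that generates a fresh S3 key pair for a user (values
+are examples only)::
+
+    PUT /{admin}/user?key&uid=foo_user&key-type=s3&generate-key=True&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+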
+Remove Key
+==========
+
+Remove an existing key.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/user?key&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``access-key``
+
+:Description: The S3 access key belonging to the S3 key pair to remove.
+:Type: String
+:Example: ``AB01C2D3EF45G6H7IJ8K``
+:Required: Yes
+
+``uid``
+
+:Description: The user to remove the key from.
+:Type: String
+:Example: ``foo_user``
+:Required: No
+
+``subuser``
+
+:Description: The subuser to remove the key from.
+:Type: String
+:Example: ``sub_foo``
+:Required: No
+
+``key-type``
+
+:Description: Key type to be removed, options are: swift, s3.
+ NOTE: Required to remove swift key.
+:Type: String
+:Example: ``swift``
+:Required: No
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+None.
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None.
+
+Get Bucket Info
+===============
+
+Get information about a subset of the existing buckets. If ``uid`` is specified
+without ``bucket`` then all buckets belonging to the user will be returned. If
+``bucket`` alone is specified, information for that particular bucket will be
+retrieved.
+
+:caps: buckets=read
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/bucket?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket to return info on.
+:Type: String
+:Example: ``foo_bucket``
+:Required: No
+
+``uid``
+
+:Description: The user to retrieve bucket information for.
+:Type: String
+:Example: ``foo_user``
+:Required: No
+
+``stats``
+
+:Description: Return bucket statistics.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the request returns a ``buckets`` container containing
+the desired bucket information.
+
+``stats``
+
+:Description: Per bucket information.
+:Type: Container
+
+``buckets``
+
+:Description: Contains a list of one or more bucket containers.
+:Type: Container
+
+``bucket``
+
+:Description: Container for single bucket information.
+:Type: Container
+:Parent: ``buckets``
+
+``name``
+
+:Description: The name of the bucket.
+:Type: String
+:Parent: ``bucket``
+
+``pool``
+
+:Description: The pool the bucket is stored in.
+:Type: String
+:Parent: ``bucket``
+
+``id``
+
+:Description: The unique bucket id.
+:Type: String
+:Parent: ``bucket``
+
+``marker``
+
+:Description: Internal bucket tag.
+:Type: String
+:Parent: ``bucket``
+
+``owner``
+
+:Description: The user id of the bucket owner.
+:Type: String
+:Parent: ``bucket``
+
+``usage``
+
+:Description: Storage usage information.
+:Type: Container
+:Parent: ``bucket``
+
+``index``
+
+:Description: Status of bucket index.
+:Type: String
+:Parent: ``bucket``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``IndexRepairFailed``
+
+:Description: Bucket index repair failed.
+:Code: 409 Conflict
+
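+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request for one bucket's information, including statistics
+(values are examples only)::
+
+    GET /{admin}/bucket?bucket=foo_bucket&stats=True&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+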
+Check Bucket Index
+==================
+
+Check the index of an existing bucket. NOTE: to check multipart object
+accounting with ``check-objects``, ``fix`` must be set to True.
+
+:caps: buckets=write
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/bucket?index&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket to return info on.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``check-objects``
+
+:Description: Check multipart object accounting.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+``fix``
+
+:Description: Also fix the bucket index when checking.
+:Type: Boolean
+:Example: False [False]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+``index``
+
+:Description: Status of bucket index.
+:Type: String
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``IndexRepairFailed``
+
+:Description: Bucket index repair failed.
+:Code: 409 Conflict
+
+Remove Bucket
+=============
+
+Delete an existing bucket.
+
+:caps: buckets=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/bucket?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket to remove.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``purge-objects``
+
+:Description: Remove the bucket's objects before deletion.
+:Type: Boolean
+:Example: True [False]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``BucketNotEmpty``
+
+:Description: Attempted to delete non-empty bucket.
+:Code: 409 Conflict
+
+``ObjectRemovalFailed``
+
+:Description: Unable to remove objects.
+:Code: 409 Conflict
+
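+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request that purges a bucket's objects and then deletes the
+bucket (values are examples only)::
+
+    DELETE /{admin}/bucket?bucket=foo_bucket&purge-objects=True&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+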
+Unlink Bucket
+=============
+
+Unlink a bucket from a specified user. Primarily useful for changing
+bucket ownership.
+
+:caps: buckets=write
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{admin}/bucket?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket to unlink.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``uid``
+
+:Description: The user ID to unlink the bucket from.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``BucketUnlinkFailed``
+
+:Description: Unable to unlink bucket from specified user.
+:Code: 409 Conflict
+
+Link Bucket
+===========
+
+Link a bucket to a specified user, unlinking the bucket from
+any previous user.
+
+:caps: buckets=write
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{admin}/bucket?format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket name to link.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``bucket-id``
+
+:Description: The bucket id to link.
+:Type: String
+:Example: ``dev.6607669.420``
+:Required: No
+
+``uid``
+
+:Description: The user ID to link the bucket to.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: Container for single bucket information.
+:Type: Container
+
+``name``
+
+:Description: The name of the bucket.
+:Type: String
+:Parent: ``bucket``
+
+``pool``
+
+:Description: The pool the bucket is stored in.
+:Type: String
+:Parent: ``bucket``
+
+``id``
+
+:Description: The unique bucket id.
+:Type: String
+:Parent: ``bucket``
+
+``marker``
+
+:Description: Internal bucket tag.
+:Type: String
+:Parent: ``bucket``
+
+``owner``
+
+:Description: The user id of the bucket owner.
+:Type: String
+:Parent: ``bucket``
+
+``usage``
+
+:Description: Storage usage information.
+:Type: Container
+:Parent: ``bucket``
+
+``index``
+
+:Description: Status of bucket index.
+:Type: String
+:Parent: ``bucket``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``BucketUnlinkFailed``
+
+:Description: Unable to unlink bucket from specified user.
+:Code: 409 Conflict
+
+``BucketLinkFailed``
+
+:Description: Unable to link bucket to specified user.
+:Code: 409 Conflict
+
+Remove Object
+=============
+
+Remove an existing object. NOTE: Does not require owner to be non-suspended.
+
+:caps: buckets=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/bucket?object&format=json HTTP/1.1
+ Host: {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket containing the object to be removed.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``object``
+
+:Description: The object to remove.
+:Type: String
+:Example: ``foo.txt``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``NoSuchObject``
+
+:Description: Specified object does not exist.
+:Code: 404 Not Found
+
+``ObjectRemovalFailed``
+
+:Description: Unable to remove objects.
+:Code: 409 Conflict
+
+
+
+Get Bucket or Object Policy
+===========================
+
+Read the policy of an object or bucket.
+
+:caps: buckets=read
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/bucket?policy&format=json HTTP/1.1
+ Host: {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket to read the policy from.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``object``
+
+:Description: The object to read the policy from.
+:Type: String
+:Example: ``foo.txt``
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, returns the object or bucket policy.
+
+``policy``
+
+:Description: Access control policy.
+:Type: Container
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``IncompleteBody``
+
+:Description: Either bucket was not specified for a bucket policy request or bucket
+ and object were not specified for an object policy request.
+:Code: 400 Bad Request
+
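+Example Request
+~~~~~~~~~~~~~~~
+
+An illustrative request for an object policy (omit ``object`` for a bucket
+policy; values are examples only)::
+
+    GET /{admin}/bucket?policy&bucket=foo_bucket&object=foo.txt&format=json HTTP/1.1
+    Host: {fqdn}
+    Authorization: {your-authorization-token}
+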
+Add A User Capability
+=====================
+
+Add an administrative capability to a specified user.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{admin}/user?caps&format=json HTTP/1.1
+ Host: {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID to add an administrative capability to.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+``user-caps``
+
+:Description: The administrative capability to add to the user.
+:Type: String
+:Example: ``usage=read,write;user=write``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the user's capabilities.
+
+``user``
+
+:Description: A container for the user data information.
+:Type: Container
+
+``user_id``
+
+:Description: The user id.
+:Type: String
+:Parent: ``user``
+
+``caps``
+
+:Description: User capabilities.
+:Type: Container
+:Parent: ``user``
+
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``InvalidCapability``
+
+:Description: Attempt to grant invalid admin capability.
+:Code: 400 Bad Request
+
+Example Request
+~~~~~~~~~~~~~~~
+
+::
+
+ PUT /{admin}/user?caps&user-caps=usage=read,write;user=write&format=json HTTP/1.1
+ Host: {fqdn}
+ Content-Type: text/plain
+ Authorization: {your-authorization-token}
+
+
+
+Remove A User Capability
+========================
+
+Remove an administrative capability from a specified user.
+
+:caps: users=write
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/user?caps&format=json HTTP/1.1
+ Host: {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``uid``
+
+:Description: The user ID to remove an administrative capability from.
+:Type: String
+:Example: ``foo_user``
+:Required: Yes
+
+``user-caps``
+
+:Description: The administrative capabilities to remove from the user.
+:Type: String
+:Example: ``usage=read, write``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, the response contains the user's capabilities.
+
+``user``
+
+:Description: A container for the user data information.
+:Type: Container
+
+``user_id``
+
+:Description: The user id.
+:Type: String
+:Parent: ``user``
+
+``caps``
+
+:Description: User capabilities.
+:Type: Container
+:Parent: ``user``
+
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``InvalidCapability``
+
+:Description: Attempt to remove an invalid admin capability.
+:Code: 400 Bad Request
+
+``NoSuchCap``
+
+:Description: User does not possess specified capability.
+:Code: 404 Not Found
+
+
+Quotas
+======
+
+The Admin Operations API enables you to set quotas on users and on buckets owned
+by users. See `Quota Management`_ for additional details. Quotas include the
+maximum number of objects in a bucket and the maximum storage size in megabytes.
+
+To view quotas, the user must have a ``users=read`` capability. To set,
+modify or disable a quota, the user must have ``users=write`` capability.
+See the `Admin Guide`_ for details.
+
+Valid parameters for quotas include:
+
+- **Bucket:** The ``bucket`` option allows you to specify a quota for
+ buckets owned by a user.
+
+- **Maximum Objects:** The ``max-objects`` setting allows you to specify
+ the maximum number of objects. A negative value disables this setting.
+
+- **Maximum Size:** The ``max-size`` option allows you to specify a quota
+ for the maximum number of bytes. The ``max-size-kb`` option allows you
+ to specify it in KiB. A negative value disables this setting.
+
+- **Quota Type:** The ``quota-type`` option sets the scope for the quota.
+ The options are ``bucket`` and ``user``.
+
+- **Enable/Disable Quota:** The ``enabled`` option specifies whether the
+ quota should be enabled. The value should be either 'True' or 'False'.
+
+Get User Quota
+~~~~~~~~~~~~~~
+
+To get a quota, the user must have ``users`` capability set with ``read``
+permission. ::
+
+ GET /admin/user?quota&uid=<uid>&quota-type=user
+
+
+Set User Quota
+~~~~~~~~~~~~~~
+
+To set a quota, the user must have ``users`` capability set with ``write``
+permission. ::
+
+ PUT /admin/user?quota&uid=<uid>&quota-type=user
+
+
+The content must include a JSON representation of the quota settings
+as encoded in the corresponding read operation.
+
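+For example, assuming the field names returned by the corresponding read
+operation (``enabled``, ``max_size_kb``, ``max_objects``; verify against the
+output of the read operation on your release), a request body that enables a
+1 GiB quota with an unlimited object count might look like this::
+
+    PUT /admin/user?quota&uid=foo_user&quota-type=user HTTP/1.1
+    Host: {fqdn}
+    Content-Type: application/json
+
+    {"enabled": true, "max_size_kb": 1048576, "max_objects": -1}
+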
+
+Get Bucket Quota
+~~~~~~~~~~~~~~~~
+
+To get a quota, the user must have ``users`` capability set with ``read``
+permission. ::
+
+ GET /admin/user?quota&uid=<uid>&quota-type=bucket
+
+
+Set Bucket Quota
+~~~~~~~~~~~~~~~~
+
+To set a quota, the user must have ``users`` capability set with ``write``
+permission. ::
+
+ PUT /admin/user?quota&uid=<uid>&quota-type=bucket
+
+The content must include a JSON representation of the quota settings
+as encoded in the corresponding read operation.
+
+
+Set Quota for an Individual Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To set a quota, the user must have ``buckets`` capability set with ``write``
+permission. ::
+
+ PUT /admin/bucket?quota&uid=<uid>&bucket=<bucket-name>
+
+The content must include a JSON representation of the quota settings
+as mentioned in Set Bucket Quota section above.
+
+
+
+Rate Limit
+==========
+
+The Admin Operations API enables you to get and set rate limit configurations on
+users, on individual buckets, and globally. See `Rate Limit Management`_ for
+additional details. A rate limit specifies the maximum number of operations
+and/or bytes per minute, counted separately for reads and writes, per bucket
+and/or per user.
+
+To view rate limit, the user must have a ``ratelimit=read`` capability. To set,
+modify or disable a ratelimit, the user must have ``ratelimit=write`` capability.
+See the `Admin Guide`_ for details.
+
+Valid parameters for rate limits include:
+
+- **Bucket:** The ``bucket`` option allows you to specify a rate limit for
+ a bucket.
+
+- **User:** The ``uid`` option allows you to specify a rate limit for a user.
+
+- **Maximum Read Bytes:** The ``max-read-bytes`` setting allows you to specify
+ the maximum number of read bytes per minute. A 0 value disables this setting.
+
+- **Maximum Write Bytes:** The ``max-write-bytes`` setting allows you to specify
+ the maximum number of write bytes per minute. A 0 value disables this setting.
+
+- **Maximum Read Ops:** The ``max-read-ops`` setting allows you to specify
+ the maximum number of read ops per minute. A 0 value disables this setting.
+
+- **Maximum Write Ops:** The ``max-write-ops`` setting allows you to specify
+ the maximum number of write ops per minute. A 0 value disables this setting.
+
+- **Global:** The ``global`` option allows you to specify a global rate limit.
+ The value should be either 'True' or 'False'.
+
+- **Rate Limit Scope:** The ``ratelimit-scope`` option sets the scope for the rate limit.
+  The options are ``bucket``, ``user`` and ``anonymous``.
+  ``anonymous`` is only valid for setting the global configuration.
+
+- **Enable/Disable Rate Limit:** The ``enabled`` option specifies whether the
+ rate limit should be enabled. The value should be either 'True' or 'False'.
+
+Get User Rate Limit
+~~~~~~~~~~~~~~~~~~~
+
+To get a rate limit, the user must have ``ratelimit`` capability set with ``read``
+permission. ::
+
+ GET /{admin}/ratelimit?ratelimit-scope=user&uid=<uid>
+
+
+Set User Rate Limit
+~~~~~~~~~~~~~~~~~~~
+
+To set a rate limit, the user must have ``ratelimit`` capability set with ``write``
+permission. ::
+
+ POST /{admin}/ratelimit?ratelimit-scope=user&uid=<uid><[&max-read-bytes=<bytes>][&max-write-bytes=<bytes>][&max-read-ops=<ops>][&max-write-ops=<ops>][enabled=<True|False>]>
+
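+For example, to cap a user at 1024 read operations per minute while leaving
+other limits unchanged (values are illustrative)::
+
+    POST /{admin}/ratelimit?ratelimit-scope=user&uid=foo_user&max-read-ops=1024&enabled=True HTTP/1.1
+    Host: {fqdn}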
+
+
+Get Bucket Rate Limit
+~~~~~~~~~~~~~~~~~~~~~
+
+To get a rate limit, the user must have ``ratelimit`` capability set with ``read``
+permission. ::
+
+ GET /{admin}/ratelimit?bucket=<bucket>&ratelimit-scope=bucket
+
+
+
+Set Rate Limit for an Individual Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To set a rate limit, the user must have ``ratelimit`` capability set with ``write``
+permission. ::
+
+ POST /{admin}/ratelimit?bucket=<bucket-name>&ratelimit-scope=bucket<[&max-read-bytes=<bytes>][&max-write-bytes=<bytes>][&max-read-ops=<ops>][&max-write-ops=<ops>]>
+
+
+
+Get Global Rate Limit
+~~~~~~~~~~~~~~~~~~~~~
+
+To get a global rate limit, the user must have ``ratelimit`` capability set with ``read``
+permission. ::
+
+ GET /{admin}/ratelimit?global=<True|False>
+
+
+
+Set Global User Rate Limit
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To set a rate limit, the user must have ``ratelimit`` capability set with ``write``
+permission. ::
+
+ POST /{admin}/ratelimit?ratelimit-scope=user&global=<True|False><[&max-read-bytes=<bytes>][&max-write-bytes=<bytes>][&max-read-ops=<ops>][&max-write-ops=<ops>][enabled=<True|False>]>
+
+
+
+Set Global Rate Limit Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To set a rate limit, the user must have ``ratelimit`` capability set with ``write``
+permission. ::
+
+ POST /{admin}/ratelimit?ratelimit-scope=bucket&global=<True|False><[&max-read-bytes=<bytes>][&max-write-bytes=<bytes>][&max-read-ops=<ops>][&max-write-ops=<ops>]>
+
+
+
+Set Global Anonymous User Rate Limit
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To set a rate limit, the user must have ``ratelimit`` capability set with ``write``
+permission. ::
+
+ POST /{admin}/ratelimit?ratelimit-scope=anon&global=<True|False><[&max-read-bytes=<bytes>][&max-write-bytes=<bytes>][&max-read-ops=<ops>][&max-write-ops=<ops>][enabled=<True|False>]>
+
+
+
+Standard Error Responses
+========================
+
+``AccessDenied``
+
+:Description: Access denied.
+:Code: 403 Forbidden
+
+``InternalError``
+
+:Description: Internal server error.
+:Code: 500 Internal Server Error
+
+``NoSuchUser``
+
+:Description: User does not exist.
+:Code: 404 Not Found
+
+``NoSuchBucket``
+
+:Description: Bucket does not exist.
+:Code: 404 Not Found
+
+``NoSuchKey``
+
+:Description: No such access key.
+:Code: 404 Not Found
+
+
+
+
+Binding Libraries
+=================
+
+``Golang``
+
+ - `ceph/go-ceph`_
+ - `IrekFasikhov/go-rgwadmin`_
+ - `QuentinPerez/go-radosgw`_
+
+``Java``
+
+ - `twonote/radosgw-admin4j`_
+
+``PHP``
+
+ - `lbausch/php-ceph-radosgw-admin`_
+ - `myENA/php-rgw-api`_
+
+``Python``
+
+ - `UMIACS/rgwadmin`_
+ - `valerytschopp/python-radosgw-admin`_
+
+
+
+.. _Admin Guide: ../admin
+.. _Quota Management: ../admin#quota-management
+.. _IrekFasikhov/go-rgwadmin: https://github.com/IrekFasikhov/go-rgwadmin
+.. _QuentinPerez/go-radosgw: https://github.com/QuentinPerez/go-radosgw
+.. _ceph/go-ceph: https://github.com/ceph/go-ceph/
+.. _Rate Limit Management: ../admin#rate-limit-management
+.. _twonote/radosgw-admin4j: https://github.com/twonote/radosgw-admin4j
+.. _lbausch/php-ceph-radosgw-admin: https://github.com/lbausch/php-ceph-radosgw-admin
+.. _myENA/php-rgw-api: https://github.com/myENA/php-rgw-api
+.. _UMIACS/rgwadmin: https://github.com/UMIACS/rgwadmin
+.. _valerytschopp/python-radosgw-admin: https://github.com/valerytschopp/python-radosgw-admin
diff --git a/doc/radosgw/api.rst b/doc/radosgw/api.rst
new file mode 100644
index 000000000..cb31284e0
--- /dev/null
+++ b/doc/radosgw/api.rst
@@ -0,0 +1,16 @@
+.. _radosgw api:
+
+===============
+librgw (Python)
+===============
+
+.. highlight:: python
+
+The `rgw` python module provides file-like access to rgw.
+
+API Reference
+=============
+
+.. automodule:: rgw
+ :members: LibRGWFS, FileHandle
+
diff --git a/doc/radosgw/archive-sync-module.rst b/doc/radosgw/archive-sync-module.rst
new file mode 100644
index 000000000..b121ee6b1
--- /dev/null
+++ b/doc/radosgw/archive-sync-module.rst
@@ -0,0 +1,44 @@
+===================
+Archive Sync Module
+===================
+
+.. versionadded:: Nautilus
+
+This sync module leverages the versioning feature of the S3 objects in RGW to
+have an archive zone that captures the different versions of the S3 objects
+as they occur over time in the other zones.
+
+An archive zone allows you to keep a history of versions of S3 objects that can
+only be eliminated through the gateways associated with the archive zone.
+
+This functionality is useful in a configuration where several
+non-versioned zones replicate their data and metadata through their zone
+gateways (mirror configuration), providing high availability to the end users,
+while the archive zone captures all the data and metadata updates and
+consolidates them as versions of S3 objects.
+
+Including an archive zone in a multizone configuration allows you to have the
+flexibility of an S3 object history in only one zone while saving the space
+that the replicas of the versioned S3 objects would consume in the rest of the
+zones.
+
+
+
+Archive Sync Tier Type Configuration
+------------------------------------
+
+How to Configure
+~~~~~~~~~~~~~~~~
+
+See `Multisite Configuration`_ for multisite configuration instructions. The
+archive sync module requires the creation of a new zone. The zone tier type needs
+to be defined as ``archive``:
+
+::
+
+ # radosgw-admin zone create --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --endpoints={http://fqdn}[,{http://fqdn}]
+ --tier-type=archive
+
+.. _Multisite Configuration: ../multisite
diff --git a/doc/radosgw/barbican.rst b/doc/radosgw/barbican.rst
new file mode 100644
index 000000000..a90d063fb
--- /dev/null
+++ b/doc/radosgw/barbican.rst
@@ -0,0 +1,123 @@
+==============================
+OpenStack Barbican Integration
+==============================
+
+OpenStack `Barbican`_ can be used as a secure key management service for
+`Server-Side Encryption`_.
+
+.. image:: ../images/rgw-encryption-barbican.png
+
+#. `Configure Keystone`_
+#. `Create a Keystone user`_
+#. `Configure the Ceph Object Gateway`_
+#. `Create a key in Barbican`_
+
+Configure Keystone
+==================
+
+Barbican depends on Keystone for authorization and access control of its keys.
+
+See `OpenStack Keystone Integration`_.
+
+Create a Keystone user
+======================
+
+Create a new user that will be used by the Ceph Object Gateway to retrieve
+keys.
+
+For example::
+
+ user = rgwcrypt-user
+ pass = rgwcrypt-password
+ tenant = rgwcrypt
+
+See OpenStack documentation for `Manage projects, users, and roles`_.
+
+Create a key in Barbican
+========================
+
+See Barbican documentation for `How to Create a Secret`_. Requests to
+Barbican must include a valid Keystone token in the ``X-Auth-Token`` header.
+
+.. note:: Server-side encryption keys must be 256-bit long and base64 encoded.
+
+Example request::
+
+ POST /v1/secrets HTTP/1.1
+ Host: barbican.example.com:9311
+ Accept: */*
+ Content-Type: application/json
+ X-Auth-Token: 7f7d588dd29b44df983bc961a6b73a10
+ Content-Length: 299
+
+ {
+ "name": "my-key",
+ "expiration": "2016-12-28T19:14:44.180394",
+ "algorithm": "aes",
+ "bit_length": 256,
+ "mode": "cbc",
+ "payload": "6b+WOZ1T3cqZMxgThRcXAQBrS5mXKdDUphvpxptl9/4=",
+ "payload_content_type": "application/octet-stream",
+ "payload_content_encoding": "base64"
+ }
+
+Response::
+
+ {"secret_ref": "http://barbican.example.com:9311/v1/secrets/d1e7ef3b-f841-4b7c-90b2-b7d90ca2d723"}
+
+In the response, ``d1e7ef3b-f841-4b7c-90b2-b7d90ca2d723`` is the key id that
+can be used in any `SSE-KMS`_ request.
+
+This newly created key is not accessible by user ``rgwcrypt-user``. This
+privilege must be added with an ACL. See `How to Set/Replace ACL`_ for more
+details.
+
+Example request (assuming that the Keystone id of ``rgwcrypt-user`` is
+``906aa90bd8a946c89cdff80d0869460f``)::
+
+ PUT /v1/secrets/d1e7ef3b-f841-4b7c-90b2-b7d90ca2d723/acl HTTP/1.1
+ Host: barbican.example.com:9311
+ Accept: */*
+ Content-Type: application/json
+ X-Auth-Token: 7f7d588dd29b44df983bc961a6b73a10
+ Content-Length: 101
+
+ {
+ "read":{
+ "users":[ "906aa90bd8a946c89cdff80d0869460f" ],
+ "project-access": true
+ }
+ }
+
+Response::
+
+ {"acl_ref": "http://barbican.example.com:9311/v1/secrets/d1e7ef3b-f841-4b7c-90b2-b7d90ca2d723/acl"}
+
+Configure the Ceph Object Gateway
+=================================
+
+Edit the Ceph configuration file to enable Barbican as a KMS and add information
+about the Barbican server and Keystone user::
+
+ rgw crypt s3 kms backend = barbican
+ rgw barbican url = http://barbican.example.com:9311
+ rgw keystone barbican user = rgwcrypt-user
+ rgw keystone barbican password = rgwcrypt-password
+
+When using Keystone API version 2::
+
+ rgw keystone barbican tenant = rgwcrypt
+
+When using API version 3::
+
+ rgw keystone barbican project
+ rgw keystone barbican domain
+
+
+.. _Barbican: https://wiki.openstack.org/wiki/Barbican
+.. _Server-Side Encryption: ../encryption
+.. _OpenStack Keystone Integration: ../keystone
+.. _Manage projects, users, and roles: https://docs.openstack.org/admin-guide/cli-manage-projects-users-and-roles.html#create-a-user
+.. _How to Create a Secret: https://developer.openstack.org/api-guide/key-manager/secrets.html#how-to-create-a-secret
+.. _SSE-KMS: http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html
+.. _How to Set/Replace ACL: https://developer.openstack.org/api-guide/key-manager/acls.html#how-to-set-replace-acl
diff --git a/doc/radosgw/bucketpolicy.rst b/doc/radosgw/bucketpolicy.rst
new file mode 100644
index 000000000..99f40c5d7
--- /dev/null
+++ b/doc/radosgw/bucketpolicy.rst
@@ -0,0 +1,216 @@
+===============
+Bucket Policies
+===============
+
+.. versionadded:: Luminous
+
+The Ceph Object Gateway supports a subset of the Amazon S3 policy
+language applied to buckets.
+
+
+Creation and Removal
+====================
+
+Bucket policies are managed through standard S3 operations rather than
+radosgw-admin.
+
+For example, one may use s3cmd to set or delete a policy thus::
+
+ $ cat > examplepol
+ {
+ "Version": "2012-10-17",
+ "Statement": [{
+ "Effect": "Allow",
+ "Principal": {"AWS": ["arn:aws:iam::usfolks:user/fred:subuser"]},
+ "Action": "s3:PutObjectAcl",
+ "Resource": [
+ "arn:aws:s3:::happybucket/*"
+ ]
+ }]
+ }
+
+ $ s3cmd setpolicy examplepol s3://happybucket
+ $ s3cmd delpolicy s3://happybucket
+
+
+Limitations
+===========
+
+Currently, we support only the following actions:
+
+- s3:AbortMultipartUpload
+- s3:CreateBucket
+- s3:DeleteBucketPolicy
+- s3:DeleteBucket
+- s3:DeleteBucketWebsite
+- s3:DeleteObject
+- s3:DeleteObjectVersion
+- s3:DeleteReplicationConfiguration
+- s3:GetAccelerateConfiguration
+- s3:GetBucketAcl
+- s3:GetBucketCORS
+- s3:GetBucketLocation
+- s3:GetBucketLogging
+- s3:GetBucketNotification
+- s3:GetBucketPolicy
+- s3:GetBucketRequestPayment
+- s3:GetBucketTagging
+- s3:GetBucketVersioning
+- s3:GetBucketWebsite
+- s3:GetLifecycleConfiguration
+- s3:GetObjectAcl
+- s3:GetObject
+- s3:GetObjectTorrent
+- s3:GetObjectVersionAcl
+- s3:GetObjectVersion
+- s3:GetObjectVersionTorrent
+- s3:GetReplicationConfiguration
+- s3:IPAddress
+- s3:NotIpAddress
+- s3:ListAllMyBuckets
+- s3:ListBucketMultipartUploads
+- s3:ListBucket
+- s3:ListBucketVersions
+- s3:ListMultipartUploadParts
+- s3:PutAccelerateConfiguration
+- s3:PutBucketAcl
+- s3:PutBucketCORS
+- s3:PutBucketLogging
+- s3:PutBucketNotification
+- s3:PutBucketPolicy
+- s3:PutBucketRequestPayment
+- s3:PutBucketTagging
+- s3:PutBucketVersioning
+- s3:PutBucketWebsite
+- s3:PutLifecycleConfiguration
+- s3:PutObjectAcl
+- s3:PutObject
+- s3:PutObjectVersionAcl
+- s3:PutReplicationConfiguration
+- s3:RestoreObject
+
+We do not yet support setting policies on users, groups, or roles.
+
+We use the RGW ‘tenant’ identifier in place of the Amazon twelve-digit
+account ID. In the future we may allow you to assign an account ID to
+a tenant, but for now if you want to use policies between AWS S3 and
+RGW S3 you will have to use the Amazon account ID as the tenant ID when
+creating users.
+
+Under AWS, all tenants share a single namespace. RGW gives every
+tenant its own namespace of buckets. There may be an option to enable
+an AWS-like 'flat' bucket namespace in future versions. At present, to
+access a bucket belonging to another tenant, address it as
+"tenant:bucket" in the S3 request.
+
+In AWS, a bucket policy can grant access to another account, and that
+account owner can then grant access to individual users with user
+permissions. Since we do not yet support user, role, and group
+permissions, account owners will currently need to grant access
+directly to individual users, and granting an entire account access to
+a bucket grants access to all users in that account.
+
+Bucket policies do not yet support string interpolation.
+
+For all requests, the condition keys we support are:
+
+- aws:CurrentTime
+- aws:EpochTime
+- aws:PrincipalType
+- aws:Referer
+- aws:SecureTransport
+- aws:SourceIp
+- aws:UserAgent
+- aws:username
+
+We support certain s3 condition keys for bucket and object requests.
+
+.. versionadded:: Mimic
+
+Bucket Related Operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++-----------------------+----------------------+----------------+
+| Permission | Condition Keys | Comments |
++-----------------------+----------------------+----------------+
+| | s3:x-amz-acl | |
+| | s3:x-amz-grant-<perm>| |
+|s3:createBucket | where perm is one of | |
+| | read/write/read-acp | |
+| | write-acp/ | |
+| | full-control | |
++-----------------------+----------------------+----------------+
+| | s3:prefix | |
+| +----------------------+----------------+
+| s3:ListBucket & | s3:delimiter | |
+| +----------------------+----------------+
+| s3:ListBucketVersions | s3:max-keys | |
++-----------------------+----------------------+----------------+
+| s3:PutBucketAcl | s3:x-amz-acl | |
+| | s3:x-amz-grant-<perm>| |
++-----------------------+----------------------+----------------+
+
+.. _tag_policy:
+
+Object Related Operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++-----------------------------+-----------------------------------------------+-------------------+
+|Permission |Condition Keys | Comments |
+| | | |
++-----------------------------+-----------------------------------------------+-------------------+
+| |s3:x-amz-acl & s3:x-amz-grant-<perm> | |
+| | | |
+| +-----------------------------------------------+-------------------+
+| |s3:x-amz-copy-source | |
+| | | |
+| +-----------------------------------------------+-------------------+
+| |s3:x-amz-server-side-encryption | |
+| | | |
+| +-----------------------------------------------+-------------------+
+|s3:PutObject |s3:x-amz-server-side-encryption-aws-kms-key-id | |
+| | | |
+| +-----------------------------------------------+-------------------+
+| |s3:x-amz-metadata-directive |PUT & COPY to |
+| | |overwrite/preserve |
+| | |metadata in COPY |
+| | |requests |
+| +-----------------------------------------------+-------------------+
+| |s3:RequestObjectTag/<tag-key> | |
+| | | |
++-----------------------------+-----------------------------------------------+-------------------+
+|s3:PutObjectAcl |s3:x-amz-acl & s3-amz-grant-<perm> | |
+|s3:PutObjectVersionAcl | | |
+| +-----------------------------------------------+-------------------+
+| |s3:ExistingObjectTag/<tag-key> | |
+| | | |
++-----------------------------+-----------------------------------------------+-------------------+
+| |s3:RequestObjectTag/<tag-key> | |
+|s3:PutObjectTagging & +-----------------------------------------------+-------------------+
+|s3:PutObjectVersionTagging |s3:ExistingObjectTag/<tag-key> | |
+| | | |
++-----------------------------+-----------------------------------------------+-------------------+
+|s3:GetObject & |s3:ExistingObjectTag/<tag-key> | |
+|s3:GetObjectVersion | | |
++-----------------------------+-----------------------------------------------+-------------------+
+|s3:GetObjectAcl & |s3:ExistingObjectTag/<tag-key> | |
+|s3:GetObjectVersionAcl | | |
++-----------------------------+-----------------------------------------------+-------------------+
+|s3:GetObjectTagging & |s3:ExistingObjectTag/<tag-key> | |
+|s3:GetObjectVersionTagging | | |
++-----------------------------+-----------------------------------------------+-------------------+
+|s3:DeleteObjectTagging & |s3:ExistingObjectTag/<tag-key> | |
+|s3:DeleteObjectVersionTagging| | |
++-----------------------------+-----------------------------------------------+-------------------+
+
+
+More may be supported soon as we integrate with the recently rewritten
+Authentication/Authorization subsystem.
+
+Swift
+=====
+
+There is no way to set bucket policies under Swift, but bucket
+policies that have been set govern Swift as well as S3 operations.
+
+Swift credentials are matched against Principals specified in a policy
+in a way specific to whatever backend is being used.
diff --git a/doc/radosgw/cloud-sync-module.rst b/doc/radosgw/cloud-sync-module.rst
new file mode 100644
index 000000000..a601bd503
--- /dev/null
+++ b/doc/radosgw/cloud-sync-module.rst
@@ -0,0 +1,244 @@
+=========================
+Cloud Sync Module
+=========================
+
+.. versionadded:: Mimic
+
+This module syncs zone data to a remote cloud service. The sync is unidirectional; data is not synced back from the
+remote zone. The goal of this module is to enable syncing data to multiple cloud providers. The currently supported
+cloud providers are those that are compatible with AWS (S3).
+
+User credentials for the remote cloud object store service need to be configured. Since many cloud services impose limits
+on the number of buckets that each user can create, the mapping of source objects and buckets is configurable.
+It is possible to configure different targets to different buckets and bucket prefixes. Note that source ACLs will not
+be preserved. It is possible to map permissions of specific source users to specific destination users.
+
+Due to API limitations there is no way to preserve original object modification time and ETag. The cloud sync module
+stores these as metadata attributes on the destination objects.
+
+
+
+Cloud Sync Tier Type Configuration
+-------------------------------------
+
+Trivial Configuration:
+~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ {
+ "connection": {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ "host_style": <path | virtual>,
+ },
+ "acls": [ { "type": <id | email | uri>,
+ "source_id": <source_id>,
+ "dest_id": <dest_id> } ... ],
+ "target_path": <target_path>,
+ }
+
+
+Non Trivial Configuration:
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ {
+ "default": {
+ "connection": {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+            "host_style": <path | virtual>,
+ },
+ "acls": [
+ {
+ "type" : <id | email | uri>, # optional, default is id
+ "source_id": <id>,
+ "dest_id": <id>
+ } ... ]
+ "target_path": <path> # optional
+ },
+ "connections": [
+ {
+ "connection_id": <id>,
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+            "host_style": <path | virtual>,  # optional
+ } ... ],
+ "acl_profiles": [
+ {
+ "acls_id": <id>, # acl mappings
+ "acls": [ {
+ "type": <id | email | uri>,
+ "source_id": <id>,
+ "dest_id": <id>
+ } ... ]
+ }
+ ],
+ "profiles": [
+ {
+ "source_bucket": <source>,
+ "connection_id": <connection_id>,
+ "acls_id": <mappings_id>,
+ "target_path": <dest>, # optional
+ } ... ],
+ }
+
+
+.. Note:: Trivial configuration can coincide with the non-trivial one.
+
+
+* ``connection`` (container)
+
+Represents a connection to the remote cloud service. Contains ``connection_id``, ``access_key``,
+``secret``, ``endpoint``, and ``host_style``.
+
+* ``access_key`` (string)
+
+The remote cloud access key that will be used for a specific connection.
+
+* ``secret`` (string)
+
+The secret key for the remote cloud service.
+
+* ``endpoint`` (string)
+
+URL of remote cloud service endpoint.
+
+* ``host_style`` (path | virtual)
+
+Type of host style to be used when accessing remote cloud endpoint (default: ``path``).
+
+* ``acls`` (array)
+
+Contains a list of ``acl_mappings``.
+
+* ``acl_mapping`` (container)
+
+Each ``acl_mapping`` structure contains ``type``, ``source_id``, and ``dest_id``. These
+will define the ACL mutation to be done on each object. An ACL mutation makes it
+possible to convert a source user id to a destination id.
+
+* ``type`` (id | email | uri)
+
+ACL type: ``id`` defines user id, ``email`` defines user by email, and ``uri`` defines user by ``uri`` (group).
+
+* ``source_id`` (string)
+
+ID of user in the source zone.
+
+* ``dest_id`` (string)
+
+ID of user in the destination.
+
+* ``target_path`` (string)
+
+A string that defines how the target path is created. The target path specifies a prefix to which
+the source object name is appended. The target path configurable can include any of the following
+variables:
+
+- ``sid``: unique string that represents the sync instance ID
+- ``zonegroup``: the zonegroup name
+- ``zonegroup_id``: the zonegroup ID
+- ``zone``: the zone name
+- ``zone_id``: the zone id
+- ``bucket``: source bucket name
+- ``owner``: source bucket owner ID
+
+For example: ``target_path = rgwx-${zone}-${sid}/${owner}/${bucket}``
+
+
+* ``acl_profiles`` (array)
+
+An array of ``acl_profile``.
+
+* ``acl_profile`` (container)
+
+Each profile contains ``acls_id`` (string) that represents the profile, and ``acls`` array that
+holds a list of ``acl_mappings``.
+
+* ``profiles`` (array)
+
+A list of profiles. Each profile contains the following:
+
+- ``source_bucket``: either a bucket name, or a bucket prefix (if ends with ``*``) that defines the source bucket(s) for this profile
+- ``target_path``: as defined above
+- ``connection_id``: ID of the connection that will be used for this profile
+- ``acls_id``: ID of ACLs profile that will be used for this profile
+
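+For illustration, a filled-in non-trivial configuration might look like the
+following sketch; every name, ID, endpoint, and credential here is
+hypothetical:
+
+::
+
+ {
+ "connections": [
+ {
+ "connection_id": "conn-aws",
+ "access_key": "ACCESSKEY",
+ "secret": "SECRET",
+ "endpoint": "https://s3.amazonaws.com"
+ }
+ ],
+ "acl_profiles": [
+ {
+ "acls_id": "acls-default",
+ "acls": [ {
+ "type": "id",
+ "source_id": "local-user",
+ "dest_id": "remote-user"
+ } ]
+ }
+ ],
+ "profiles": [
+ {
+ "source_bucket": "logs*",
+ "connection_id": "conn-aws",
+ "acls_id": "acls-default",
+ "target_path": "rgwx-${zone}-${sid}/${owner}/${bucket}"
+ }
+ ]
+ }
+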
+
+S3 Specific Configurables:
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Currently cloud sync will only work with backends that are compatible with AWS S3. There are
+a few configurables that can be used to tweak its behavior when accessing these cloud services:
+
+::
+
+ {
+ "multipart_sync_threshold": {object_size},
+ "multipart_min_part_size": {part_size}
+ }
+
+
+* ``multipart_sync_threshold`` (integer)
+
+Objects this size or larger will be synced to the cloud using multipart upload.
+
+* ``multipart_min_part_size`` (integer)
+
+Minimum part size to use when syncing objects using multipart upload.
+
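+For example, using the ``--tier-config`` syntax described in the next section,
+both values could be set as follows (the 50 MB values shown are purely
+illustrative):
+
+::
+
+ # radosgw-admin zone modify --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --tier-config=multipart_sync_threshold=52428800,multipart_min_part_size=52428800
+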
+
+How to Configure
+~~~~~~~~~~~~~~~~
+
+See :ref:`multisite` for multisite configuration instructions. The cloud sync module requires the
+creation of a new zone whose tier type is defined as ``cloud``:
+
+::
+
+ # radosgw-admin zone create --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --endpoints={http://fqdn}[,{http://fqdn}]
+ --tier-type=cloud
+
+
+The tier configuration can then be set using the following command
+
+::
+
+ # radosgw-admin zone modify --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --tier-config={key}={val}[,{key}={val}]
+
+The ``key`` in the configuration specifies the config variable to be updated, and
+the ``val`` specifies its new value. Nested values can be accessed using a period. For example:
+
+::
+
+ # radosgw-admin zone modify --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --tier-config=connection.access_key={key},connection.secret={secret}
+
+
+Configuration array entries can be accessed by enclosing the index of the entry to be referenced
+in square brackets; a new array entry can be appended by using ``[]``. An index value of ``-1`` references
+the last entry in the array. At the moment it is not possible to create a new entry and reference it
+again in the same command.
+For example, creating a new profile for buckets starting with {prefix}:
+
+::
+
+ # radosgw-admin zone modify --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --tier-config=profiles[].source_bucket={prefix}'*'
+
+ # radosgw-admin zone modify --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --tier-config=profiles[-1].connection_id={conn_id},profiles[-1].acls_id={acls_id}
+
+
+An entry can be removed by using ``--tier-config-rm={key}``.
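+For example, a previously configured target path can be dropped as follows:
+
+::
+
+ # radosgw-admin zone modify --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --tier-config-rm=target_path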
diff --git a/doc/radosgw/cloud-transition.rst b/doc/radosgw/cloud-transition.rst
new file mode 100644
index 000000000..c00ad790b
--- /dev/null
+++ b/doc/radosgw/cloud-transition.rst
@@ -0,0 +1,368 @@
+================
+Cloud Transition
+================
+
+This feature enables data transition to a remote cloud service as part of `Lifecycle Configuration <https://docs.aws.amazon.com/AmazonS3/latest/dev/object-lifecycle-mgmt.html>`__ via :ref:`storage_classes`. The transition is unidirectional; data cannot be transitioned back from the remote zone. The goal of this feature is to enable data transition to multiple cloud providers. The currently supported cloud providers are those that are compatible with AWS (S3).
+
+A special storage class of tier type ``cloud-s3`` is used to configure the remote cloud S3 object store service to which the data is transitioned. These classes are defined in terms of zonegroup placement targets and, unlike regular storage classes, do not need a data pool.
+
+User credentials for the remote cloud object store service need to be configured. Note that source ACLs will not
+be preserved. It is possible to map permissions of specific source users to specific destination users.
+
+
+Cloud Storage Class Configuration
+---------------------------------
+
+::
+
+ {
+ "access_key": <access>,
+ "secret": <secret>,
+ "endpoint": <endpoint>,
+ "region": <region>,
+ "host_style": <path | virtual>,
+ "acls": [ { "type": <id | email | uri>,
+ "source_id": <source_id>,
+ "dest_id": <dest_id> } ... ],
+ "target_path": <target_path>,
+ "target_storage_class": <target-storage-class>,
+ "multipart_sync_threshold": {object_size},
+ "multipart_min_part_size": {part_size},
+ "retain_head_object": <true | false>
+ }
+
+
+Cloud Transition Specific Configurables:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* ``access_key`` (string)
+
+The remote cloud S3 access key that will be used for a specific connection.
+
+* ``secret`` (string)
+
+The secret key for the remote cloud S3 service.
+
+* ``endpoint`` (string)
+
+URL of remote cloud S3 service endpoint.
+
+* ``region`` (string)
+
+The remote cloud S3 service region name.
+
+* ``host_style`` (path | virtual)
+
+Type of host style to be used when accessing remote cloud S3 endpoint (default: ``path``).
+
+* ``acls`` (array)
+
+Contains a list of ``acl_mappings``.
+
+* ``acl_mapping`` (container)
+
+Each ``acl_mapping`` structure contains ``type``, ``source_id``, and ``dest_id``. These
+define the ACL mutation that will be applied to each object. An ACL mutation makes it
+possible to convert a source user ID to a destination ID.
+
+* ``type`` (id | email | uri)
+
+ACL type: ``id`` defines user id, ``email`` defines user by email, and ``uri`` defines user by ``uri`` (group).
+
+* ``source_id`` (string)
+
+ID of user in the source zone.
+
+* ``dest_id`` (string)
+
+ID of user in the destination.
+
+* ``target_path`` (string)
+
+A string that defines how the target path is created. The target path specifies a prefix to which
+the source ``bucket-name/object-name`` is appended. If not specified, the target path defaults to ``rgwx-${zonegroup}-${storage-class}-cloud-bucket``.
+
+For example: ``target_path = rgwx-archive-${zonegroup}/``
+
+* ``target_storage_class`` (string)
+
+A string that defines the target storage class to which the object transitions. If not specified, the object is transitioned to the ``STANDARD`` storage class.
+
+* ``retain_head_object`` (true | false)
+
+If true, the metadata of the object is retained after it is transitioned to the cloud. If false (the default), the object is deleted post transition.
+This option is ignored for current versioned objects. For more details, refer to the "Versioned Objects" section below.
+
+
+S3 Specific Configurables:
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Currently cloud transition will only work with backends that are compatible with AWS S3. There are
+a few configurables that can be used to tweak its behavior when accessing these cloud services:
+
+::
+
+ {
+ "multipart_sync_threshold": {object_size},
+ "multipart_min_part_size": {part_size}
+ }
+
+
+* ``multipart_sync_threshold`` (integer)
+
+Objects this size or larger will be transitioned to the cloud using multipart upload.
+
+* ``multipart_min_part_size`` (integer)
+
+Minimum part size to use when transitioning objects using multipart upload.
+
+
+How to Configure
+~~~~~~~~~~~~~~~~
+
+See :ref:`adding_a_storage_class` for how to configure a storage class for a zonegroup. Cloud transition requires the creation of a special storage class with its tier type defined as ``cloud-s3``.
+
+.. note:: If you have not done any previous `Multisite Configuration`_,
+ a ``default`` zone and zonegroup are created for you, and changes
+ to the zone/zonegroup will not take effect until the Ceph Object
+ Gateways are restarted. If you have created a realm for multisite,
+ the zone/zonegroup changes will take effect once the changes are
+ committed with ``radosgw-admin period update --commit``.
+
+::
+
+ # radosgw-admin zonegroup placement add --rgw-zonegroup={zone-group-name} \
+ --placement-id={placement-id} \
+ --storage-class={storage-class-name} \
+ --tier-type=cloud-s3
+
+For example:
+
+::
+
+ # radosgw-admin zonegroup placement add --rgw-zonegroup=default \
+ --placement-id=default-placement \
+ --storage-class=CLOUDTIER --tier-type=cloud-s3
+ [
+ {
+ "key": "default-placement",
+ "val": {
+ "name": "default-placement",
+ "tags": [],
+ "storage_classes": [
+ "CLOUDTIER",
+ "STANDARD"
+ ],
+ "tier_targets": [
+ {
+ "key": "CLOUDTIER",
+ "val": {
+ "tier_type": "cloud-s3",
+ "storage_class": "CLOUDTIER",
+ "retain_head_object": "false",
+ "s3": {
+ "endpoint": "",
+ "access_key": "",
+ "secret": "",
+ "host_style": "path",
+ "target_storage_class": "",
+ "target_path": "",
+ "acl_mappings": [],
+ "multipart_sync_threshold": 33554432,
+ "multipart_min_part_size": 33554432
+ }
+ }
+ }
+ ]
+ }
+ }
+ ]
+
+
+.. note:: Once a storage class has been created with ``--tier-type=cloud-s3``, it cannot later be modified to any other storage class type.
+
+The tier configuration can then be set using the following command
+
+::
+
+ # radosgw-admin zonegroup placement modify --rgw-zonegroup={zone-group-name} \
+ --placement-id={placement-id} \
+ --storage-class={storage-class-name} \
+ --tier-config={key}={val}[,{key}={val}]
+
+The ``key`` in the configuration specifies the config variable that needs to be updated, and
+the ``val`` specifies its new value.
+
+
+For example:
+
+::
+
+ # radosgw-admin zonegroup placement modify --rgw-zonegroup default \
+ --placement-id default-placement \
+ --storage-class CLOUDTIER \
+ --tier-config=endpoint=http://XX.XX.XX.XX:YY,\
+ access_key=<access_key>,secret=<secret>, \
+ multipart_sync_threshold=44432, \
+ multipart_min_part_size=44432, \
+ retain_head_object=true
+
+Nested values can be accessed using a period. For example:
+
+::
+
+ # radosgw-admin zonegroup placement modify --rgw-zonegroup={zone-group-name} \
+ --placement-id={placement-id} \
+ --storage-class={storage-class-name} \
+ --tier-config=acls.source_id=${source-id}, \
+ acls.dest_id=${dest-id}
+
+
+
+Configuration array entries can be accessed by enclosing the index of the entry to be referenced
+in square brackets; a new array entry can be appended by using ``[]``.
+For example, creating a new acl array entry:
+
+::
+
+ # radosgw-admin zonegroup placement modify --rgw-zonegroup={zone-group-name} \
+ --placement-id={placement-id} \
+ --storage-class={storage-class-name} \
+ --tier-config=acls[].source_id=${source-id}, \
+ acls[${source-id}].dest_id=${dest-id}, \
+ acls[${source-id}].type=email
+
+An entry can be removed by using ``--tier-config-rm={key}``.
+
+For example,
+
+::
+
+ # radosgw-admin zonegroup placement modify --rgw-zonegroup default \
+ --placement-id default-placement \
+ --storage-class CLOUDTIER \
+ --tier-config-rm=acls.source_id=testid
+
+ # radosgw-admin zonegroup placement modify --rgw-zonegroup default \
+ --placement-id default-placement \
+ --storage-class CLOUDTIER \
+ --tier-config-rm=target_path
+
+The storage class can be removed using the following command
+
+::
+
+ # radosgw-admin zonegroup placement rm --rgw-zonegroup={zone-group-name} \
+ --placement-id={placement-id} \
+ --storage-class={storage-class-name}
+
+For example,
+
+::
+
+ # radosgw-admin zonegroup placement rm --rgw-zonegroup default \
+ --placement-id default-placement \
+ --storage-class CLOUDTIER
+ [
+ {
+ "key": "default-placement",
+ "val": {
+ "name": "default-placement",
+ "tags": [],
+ "storage_classes": [
+ "STANDARD"
+ ]
+ }
+ }
+ ]
+
+Object modification & Limitations
+----------------------------------
+
+Once configured, the cloud storage class can be used like any other storage class in bucket lifecycle rules. For example:
+
+::
+
+ <Transition>
+ <StorageClass>CLOUDTIER</StorageClass>
+ ....
+ ....
+ </Transition>
+
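+For reference, a complete lifecycle configuration using this storage class
+might look like the following sketch; the rule ID, prefix, and day count are
+hypothetical:
+
+::
+
+ <LifecycleConfiguration>
+ <Rule>
+ <ID>archive-to-cloud</ID>
+ <Filter>
+ <Prefix>logs/</Prefix>
+ </Filter>
+ <Status>Enabled</Status>
+ <Transition>
+ <Days>30</Days>
+ <StorageClass>CLOUDTIER</StorageClass>
+ </Transition>
+ </Rule>
+ </LifecycleConfiguration>
+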
+
+Since the transition is unidirectional, when configuring S3 lifecycle rules the cloud storage class should be specified last among all the storage classes that the object transitions to. Subsequent rules (if any) do not apply after the transition to the cloud.
+
+Due to API limitations, there is no way to preserve the original object's modification time and ETag, but they are stored as metadata attributes on the destination objects, as shown below:
+
+::
+
+ x-amz-meta-rgwx-source: rgw
+ x-amz-meta-rgwx-source-etag: ed076287532e86365e841e92bfc50d8c
+ x-amz-meta-rgwx-source-key: lc.txt
+ x-amz-meta-rgwx-source-mtime: 1608546349.757100363
+ x-amz-meta-rgwx-versioned-epoch: 0
+
+In order to allow some cloud services to detect the source and to map the user-defined ``x-amz-meta-`` attributes, the following two additional attributes are added to the objects being transitioned:
+
+::
+
+ x-rgw-cloud : true/false
+ (set to "true", by default, if the object is being transitioned from RGW)
+
+ x-rgw-cloud-keep-attrs : true/false
+ (if set to the default value "true", the cloud service should map and store all the x-amz-meta-* attributes. If it cannot, the operation should fail.
+ If set to "false", the cloud service can ignore such attributes and just store the object data being sent.)
+
+
+By default, the source object is deleted post transition. It is possible to retain its metadata, with updated values (such as storage class and object size), by setting the config option ``retain_head_object`` to true. However, a GET on those objects will still fail with an ``InvalidObjectState`` error.
+
+For example,
+::
+
+ # s3cmd info s3://bucket/lc.txt
+ s3://bucket/lc.txt (object):
+ File size: 12
+ Last mod: Mon, 21 Dec 2020 10:25:56 GMT
+ MIME type: text/plain
+ Storage: CLOUDTIER
+ MD5 sum: ed076287532e86365e841e92bfc50d8c
+ SSE: none
+ Policy: none
+ CORS: none
+ ACL: M. Tester: FULL_CONTROL
+ x-amz-meta-s3cmd-attrs: atime:1608466266/ctime:1597606156/gid:0/gname:root/md5:ed076287532e86365e841e92bfc50d8c/mode:33188/mtime:1597605793/uid:0/uname:root
+
+ # s3cmd get s3://bucket/lc.txt lc_restore.txt
+ download: 's3://bucket/lc.txt' -> 'lc_restore.txt' [1 of 1]
+ ERROR: S3 error: 403 (InvalidObjectState)
+
+To avoid object name collisions across buckets, the source bucket name is prepended to the target object name. If the object is versioned, the object's version ID is appended to the end.
+
+The resulting object name has the following format:
+::
+
+ s3://<target_path>/<source_bucket_name>/<source_object_name>(-<source_object_version_id>)
+
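+For example, with a hypothetical ``target_path`` of ``rgwx-archive-default``, the object ``lc.txt`` in bucket ``bucket`` would be stored as ``s3://rgwx-archive-default/bucket/lc.txt``.
+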
+
+Versioned Objects
+~~~~~~~~~~~~~~~~~
+
+For versioned and locked objects, semantics similar to those of LifecycleExpiration are applied, as stated below.
+
+* If the object is current, then after transitioning to the cloud it is made noncurrent and a delete marker is created.
+
+* If the object is noncurrent and is locked, its transition is skipped.
+
+
+Future Work
+-----------
+
+* Send a presigned redirect to, or read through, objects transitioned to the cloud.
+
+* Support s3:RestoreObject operation on cloud transitioned objects.
+
+* Federation between RGW and Cloud services.
+
+* Support transition to other cloud providers (like Azure).
+
+.. _`Multisite Configuration`: ../multisite
diff --git a/doc/radosgw/compression.rst b/doc/radosgw/compression.rst
new file mode 100644
index 000000000..fba0681da
--- /dev/null
+++ b/doc/radosgw/compression.rst
@@ -0,0 +1,91 @@
+===========
+Compression
+===========
+
+.. versionadded:: Kraken
+
+The Ceph Object Gateway supports server-side compression of uploaded objects,
+using any of Ceph's existing compression plugins.
+
+.. note:: The Reef release added a :ref:`feature_compress_encrypted` zonegroup
+ feature to enable compression with `Server-Side Encryption`_.
+
+
+Configuration
+=============
+
+Compression can be enabled on a storage class in the Zone's placement target
+by providing the ``--compression=<type>`` option to the command
+``radosgw-admin zone placement modify``.
+
+The compression ``type`` refers to the name of the compression plugin to use
+when writing new object data. Each compressed object remembers which plugin
+was used, so changing this setting does not hinder the ability to decompress
+existing objects, nor does it force existing objects to be recompressed.
+
+This compression setting applies to all new objects uploaded to buckets using
+this placement target. Compression can be disabled by setting the ``type`` to
+an empty string or ``none``.
+
+For example::
+
+ $ radosgw-admin zone placement modify \
+ --rgw-zone default \
+ --placement-id default-placement \
+ --storage-class STANDARD \
+ --compression zlib
+ {
+ ...
+ "placement_pools": [
+ {
+ "key": "default-placement",
+ "val": {
+ "index_pool": "default.rgw.buckets.index",
+ "storage_classes": {
+ "STANDARD": {
+ "data_pool": "default.rgw.buckets.data",
+ "compression_type": "zlib"
+ }
+ },
+ "data_extra_pool": "default.rgw.buckets.non-ec",
+ "index_type": 0,
+ }
+ }
+ ],
+ ...
+ }
+
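+Compression can likewise be disabled again for the same storage class by
+setting the type back to ``none``, for example::
+
+ $ radosgw-admin zone placement modify \
+ --rgw-zone default \
+ --placement-id default-placement \
+ --storage-class STANDARD \
+ --compression none
+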
+.. note:: A ``default`` zone is created for you if you have not done any
+ previous `Multisite Configuration`_.
+
+
+Statistics
+==========
+
+While all existing commands and APIs continue to report object and bucket
+sizes based on their uncompressed data, compression statistics for a given
+bucket are included in its ``bucket stats``::
+
+ $ radosgw-admin bucket stats --bucket=<name>
+ {
+ ...
+ "usage": {
+ "rgw.main": {
+ "size": 1075028,
+ "size_actual": 1331200,
+ "size_utilized": 592035,
+ "size_kb": 1050,
+ "size_kb_actual": 1300,
+ "size_kb_utilized": 579,
+ "num_objects": 104
+ }
+ },
+ ...
+ }
+
+The ``size_utilized`` and ``size_kb_utilized`` fields represent the total
+size of compressed data, in bytes and kilobytes respectively.
+
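+In the example above, the bucket holds 1075028 bytes of object data that
+occupy only 592035 bytes after compression: a compression ratio of roughly
+1.8:1, saving about 45% of the space.
+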
+
+.. _`Server-Side Encryption`: ../encryption
+.. _`Multisite Configuration`: ../multisite
diff --git a/doc/radosgw/config-ref.rst b/doc/radosgw/config-ref.rst
new file mode 100644
index 000000000..916ff4ff5
--- /dev/null
+++ b/doc/radosgw/config-ref.rst
@@ -0,0 +1,301 @@
+======================================
+ Ceph Object Gateway Config Reference
+======================================
+
+The following settings may be added to the Ceph configuration file (i.e., usually
+``ceph.conf``) under the ``[client.radosgw.{instance-name}]`` section. The
+settings may contain default values. If you do not specify each setting in the
+Ceph configuration file, the default value will be set automatically.
+
+Configuration variables set under the ``[client.radosgw.{instance-name}]``
+section will not apply to ``rgw`` or ``radosgw-admin`` commands without an instance
+name specified in the command. Thus, variables meant to apply to all RGW
+instances or to all ``radosgw-admin`` commands can be put into the ``[global]`` or the
+``[client]`` section to avoid specifying ``instance-name``.
+
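+For illustration, a minimal per-instance section might look like the following
+sketch; the instance name and values are hypothetical::
+
+ [client.radosgw.gateway1]
+ rgw_frontends = beast port=8080
+ rgw_dns_name = rgw.example.com
+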
+.. confval:: rgw_frontends
+.. confval:: rgw_data
+.. confval:: rgw_enable_apis
+.. confval:: rgw_cache_enabled
+.. confval:: rgw_cache_lru_size
+.. confval:: rgw_dns_name
+.. confval:: rgw_script_uri
+.. confval:: rgw_request_uri
+.. confval:: rgw_print_continue
+.. confval:: rgw_remote_addr_param
+.. confval:: rgw_op_thread_timeout
+.. confval:: rgw_op_thread_suicide_timeout
+.. confval:: rgw_thread_pool_size
+.. confval:: rgw_num_control_oids
+.. confval:: rgw_init_timeout
+.. confval:: rgw_mime_types_file
+.. confval:: rgw_s3_success_create_obj_status
+.. confval:: rgw_resolve_cname
+.. confval:: rgw_obj_stripe_size
+.. confval:: rgw_extended_http_attrs
+.. confval:: rgw_exit_timeout_secs
+.. confval:: rgw_get_obj_window_size
+.. confval:: rgw_get_obj_max_req_size
+.. confval:: rgw_multipart_min_part_size
+.. confval:: rgw_relaxed_s3_bucket_names
+.. confval:: rgw_list_buckets_max_chunk
+.. confval:: rgw_override_bucket_index_max_shards
+.. confval:: rgw_curl_wait_timeout_ms
+.. confval:: rgw_copy_obj_progress
+.. confval:: rgw_copy_obj_progress_every_bytes
+.. confval:: rgw_max_copy_obj_concurrent_io
+.. confval:: rgw_admin_entry
+.. confval:: rgw_content_length_compat
+.. confval:: rgw_bucket_quota_ttl
+.. confval:: rgw_user_quota_bucket_sync_interval
+.. confval:: rgw_user_quota_sync_interval
+.. confval:: rgw_bucket_default_quota_max_objects
+.. confval:: rgw_bucket_default_quota_max_size
+.. confval:: rgw_user_default_quota_max_objects
+.. confval:: rgw_user_default_quota_max_size
+.. confval:: rgw_verify_ssl
+.. confval:: rgw_max_chunk_size
+
+Lifecycle Settings
+==================
+
+Bucket lifecycle configuration can be used to manage your objects so they are stored
+effectively throughout their lifetime. In past releases, lifecycle processing was rate-limited
+because it was single threaded. Since the Nautilus release, the
+Ceph Object Gateway allows for parallel processing of bucket lifecycles across
+additional Ceph Object Gateway instances, and replaces the in-order
+index shard enumeration with a randomly ordered sequence.
+
+There are two options in particular to look at when looking to increase the
+aggressiveness of lifecycle processing:
+
+.. confval:: rgw_lc_max_worker
+.. confval:: rgw_lc_max_wp_worker
+
+These values can be tuned based upon your specific workload to further increase the
+aggressiveness of lifecycle processing. For a workload with a larger number of buckets (thousands),
+consider increasing the :confval:`rgw_lc_max_worker` value from its default of 3, whereas for a
+workload with a smaller number of buckets but a higher number of objects (hundreds of thousands)
+per bucket, consider increasing :confval:`rgw_lc_max_wp_worker` from its default of 3.
+
+.. note:: When looking to tune either of these specific values, please validate the
+ current cluster performance and Ceph Object Gateway utilization before increasing them.
+
+Garbage Collection Settings
+===========================
+
+The Ceph Object Gateway allocates storage for new objects immediately.
+
+The Ceph Object Gateway purges the storage space used for deleted and overwritten
+objects in the Ceph Storage cluster some time after the gateway deletes the
+objects from the bucket index. The process of purging the deleted object data
+from the Ceph Storage cluster is known as Garbage Collection or GC.
+
+To view the queue of objects awaiting garbage collection, execute the following
+
+.. prompt:: bash $
+
+ radosgw-admin gc list
+
+.. note:: Specify ``--include-all`` to list all entries, including unexpired
+ Garbage Collection objects.
+
+Garbage collection is a background activity that may
+execute continuously or during times of low loads, depending upon how the
+administrator configures the Ceph Object Gateway. By default, the Ceph Object
+Gateway conducts GC operations continuously. Since GC operations are a normal
+part of Ceph Object Gateway operations, especially with object delete
+operations, objects eligible for garbage collection exist most of the time.
+
+Some workloads may temporarily or permanently outpace the rate of garbage
+collection activity. This is especially true of delete-heavy workloads, where
+many objects get stored for a short period of time and then deleted. For these
+types of workloads, administrators can increase the priority of garbage
+collection operations relative to other operations with the following
+configuration parameters.
+
+.. confval:: rgw_gc_max_objs
+.. confval:: rgw_gc_obj_min_wait
+.. confval:: rgw_gc_processor_max_time
+.. confval:: rgw_gc_processor_period
+.. confval:: rgw_gc_max_concurrent_io
+
+:Tuning Garbage Collection for Delete Heavy Workloads:
+
+As an initial step towards tuning Ceph garbage collection to be more
+aggressive, the following options are suggested to be increased from their
+default configuration values::
+
+ rgw_gc_max_concurrent_io = 20
+ rgw_gc_max_trim_chunk = 64
+
+.. note:: Modifying these values requires a restart of the RGW service.
+
+Once these values have been increased from their defaults, monitor the cluster's performance during garbage collection to verify that the increased values cause no adverse performance issues.
+
+Multisite Settings
+==================
+
+.. versionadded:: Jewel
+
+You may include the following settings in your Ceph configuration
+file under each ``[client.radosgw.{instance-name}]`` instance.
+
+.. confval:: rgw_zone
+.. confval:: rgw_zonegroup
+.. confval:: rgw_realm
+.. confval:: rgw_run_sync_thread
+.. confval:: rgw_data_log_window
+.. confval:: rgw_data_log_changes_size
+.. confval:: rgw_data_log_obj_prefix
+.. confval:: rgw_data_log_num_shards
+.. confval:: rgw_md_log_max_shards
+.. confval:: rgw_data_sync_poll_interval
+.. confval:: rgw_meta_sync_poll_interval
+.. confval:: rgw_bucket_sync_spawn_window
+.. confval:: rgw_data_sync_spawn_window
+.. confval:: rgw_meta_sync_spawn_window
+
+.. important:: The values of :confval:`rgw_data_log_num_shards` and
+ :confval:`rgw_md_log_max_shards` should not be changed after sync has
+ started.
+
+S3 Settings
+===========
+
+.. confval:: rgw_s3_auth_use_ldap
+
+Swift Settings
+==============
+
+.. confval:: rgw_enforce_swift_acls
+.. confval:: rgw_swift_tenant_name
+.. confval:: rgw_swift_token_expiration
+.. confval:: rgw_swift_url
+.. confval:: rgw_swift_url_prefix
+.. confval:: rgw_swift_auth_url
+.. confval:: rgw_swift_auth_entry
+.. confval:: rgw_swift_account_in_url
+.. confval:: rgw_swift_versioning_enabled
+.. confval:: rgw_trust_forwarded_https
+
+Logging Settings
+================
+
+.. confval:: rgw_log_nonexistent_bucket
+.. confval:: rgw_log_object_name
+.. confval:: rgw_log_object_name_utc
+.. confval:: rgw_usage_max_shards
+.. confval:: rgw_usage_max_user_shards
+.. confval:: rgw_enable_ops_log
+.. confval:: rgw_enable_usage_log
+.. confval:: rgw_ops_log_rados
+.. confval:: rgw_ops_log_socket_path
+.. confval:: rgw_ops_log_data_backlog
+.. confval:: rgw_usage_log_flush_threshold
+.. confval:: rgw_usage_log_tick_interval
+.. confval:: rgw_log_http_headers
+
+Keystone Settings
+=================
+
+.. confval:: rgw_keystone_url
+.. confval:: rgw_keystone_api_version
+.. confval:: rgw_keystone_admin_domain
+.. confval:: rgw_keystone_admin_project
+.. confval:: rgw_keystone_admin_token
+.. confval:: rgw_keystone_admin_token_path
+.. confval:: rgw_keystone_admin_tenant
+.. confval:: rgw_keystone_admin_user
+.. confval:: rgw_keystone_admin_password
+.. confval:: rgw_keystone_admin_password_path
+.. confval:: rgw_keystone_accepted_roles
+.. confval:: rgw_keystone_token_cache_size
+.. confval:: rgw_keystone_verify_ssl
+.. confval:: rgw_keystone_service_token_enabled
+.. confval:: rgw_keystone_service_token_accepted_roles
+.. confval:: rgw_keystone_expired_token_cache_expiration
+
+Server-side encryption Settings
+===============================
+
+.. confval:: rgw_crypt_s3_kms_backend
+
+Barbican Settings
+=================
+
+.. confval:: rgw_barbican_url
+.. confval:: rgw_keystone_barbican_user
+.. confval:: rgw_keystone_barbican_password
+.. confval:: rgw_keystone_barbican_tenant
+.. confval:: rgw_keystone_barbican_project
+.. confval:: rgw_keystone_barbican_domain
+
+HashiCorp Vault Settings
+========================
+
+.. confval:: rgw_crypt_vault_auth
+.. confval:: rgw_crypt_vault_token_file
+.. confval:: rgw_crypt_vault_addr
+.. confval:: rgw_crypt_vault_prefix
+.. confval:: rgw_crypt_vault_secret_engine
+.. confval:: rgw_crypt_vault_namespace
+
+SSE-S3 Settings
+===============
+
+.. confval:: rgw_crypt_sse_s3_backend
+.. confval:: rgw_crypt_sse_s3_vault_secret_engine
+.. confval:: rgw_crypt_sse_s3_key_template
+.. confval:: rgw_crypt_sse_s3_vault_auth
+.. confval:: rgw_crypt_sse_s3_vault_token_file
+.. confval:: rgw_crypt_sse_s3_vault_addr
+.. confval:: rgw_crypt_sse_s3_vault_prefix
+.. confval:: rgw_crypt_sse_s3_vault_namespace
+.. confval:: rgw_crypt_sse_s3_vault_verify_ssl
+.. confval:: rgw_crypt_sse_s3_vault_ssl_cacert
+.. confval:: rgw_crypt_sse_s3_vault_ssl_clientcert
+.. confval:: rgw_crypt_sse_s3_vault_ssl_clientkey
+
+
+QoS settings
+------------
+
+.. versionadded:: Nautilus
+
+The ``civetweb`` frontend has a threading model that uses a thread per
+connection, and hence is automatically throttled by the :confval:`rgw_thread_pool_size`
+configurable when it comes to accepting connections. The newer ``beast`` frontend is
+not restricted by the thread pool size when it comes to accepting new
+connections, so a scheduler abstraction was introduced in the Nautilus release
+to support future methods of scheduling requests.
+
+Currently the scheduler defaults to a throttler that throttles the active
+connections to a configured limit (see the example after the list below). QoS
+based on mClock is currently in an *experimental* phase and is not yet
+recommended for production. The current implementation of the *dmclock_client*
+op queue divides RGW ops into admin, auth (Swift auth, STS), metadata, and data requests.
+
+
+.. confval:: rgw_max_concurrent_requests
+.. confval:: rgw_scheduler_type
+.. confval:: rgw_dmclock_auth_res
+.. confval:: rgw_dmclock_auth_wgt
+.. confval:: rgw_dmclock_auth_lim
+.. confval:: rgw_dmclock_admin_res
+.. confval:: rgw_dmclock_admin_wgt
+.. confval:: rgw_dmclock_admin_lim
+.. confval:: rgw_dmclock_data_res
+.. confval:: rgw_dmclock_data_wgt
+.. confval:: rgw_dmclock_data_lim
+.. confval:: rgw_dmclock_metadata_res
+.. confval:: rgw_dmclock_metadata_wgt
+.. confval:: rgw_dmclock_metadata_lim
+
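+For example, assuming the cluster uses the centralized configuration database,
+the default throttler's limit could be raised at runtime as follows (the value
+shown is illustrative)::
+
+ ceph config set client.rgw rgw_max_concurrent_requests 2048
+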
+.. _Architecture: ../../architecture#data-striping
+.. _Pool Configuration: ../../rados/configuration/pool-pg-config-ref/
+.. _Cluster Pools: ../../rados/operations/pools
+.. _Rados cluster handles: ../../rados/api/librados-intro/#step-2-configuring-a-cluster-handle
+.. _Barbican: ../barbican
+.. _Encryption: ../encryption
+.. _HTTP Frontends: ../frontends
diff --git a/doc/radosgw/d3n_datacache.rst b/doc/radosgw/d3n_datacache.rst
new file mode 100644
index 000000000..12d2850a5
--- /dev/null
+++ b/doc/radosgw/d3n_datacache.rst
@@ -0,0 +1,116 @@
+==================
+D3N RGW Data Cache
+==================
+
+.. contents::
+
+Datacenter-Data-Delivery Network (D3N) uses high-speed storage such as NVMe flash or DRAM to cache
+datasets on the access side.
+Such caching allows big data jobs to use the compute and fast storage resources available on each
+Rados Gateway node at the edge.
+
+Many datacenters include low-cost, centralized storage repositories, called data lakes,
+to store and share terabyte and petabyte-scale datasets.
+By necessity most distributed big-data analytic clusters such as Hadoop and Spark must
+depend on accessing a centrally located data lake that is relatively far away.
+Even with a well-designed datacenter network, cluster-to-data lake bandwidth is typically much less
+than the bandwidth of a solid-state storage located at an edge node.
+
+| D3N improves the performance of big-data jobs running in analysis clusters by speeding up recurring reads from the data lake.
+| The Rados Gateways act as cache servers for the back-end object store (OSDs), storing data locally for reuse.
+
+Architecture
+============
+
+D3N improves the performance of big-data jobs by speeding up repeatedly accessed dataset reads from the data lake.
+Cache servers are located in the datacenter on the access side of potential network and storage bottlenecks.
+D3N's two-layer logical cache forms a traditional caching hierarchy :sup:`*`
+in which caches nearer the client have the lowest access latency and overhead,
+while caches at higher levels in the hierarchy are slower (requiring multiple hops to access).
+The layer 1 cache server nearest to the client handles object requests by breaking them into blocks,
+returning any blocks that are cached locally, and forwarding missed requests to the block's home location
+(as determined by consistent hashing) in the next layer.
+Cache misses are forwarded to successive logical caching layers until a miss at the top layer is resolved
+by a request to the data lake (Rados).
+
+:sup:`*` Currently only the layer 1 cache has been upstreamed.
+
+See `MOC D3N (Datacenter-scale Data Delivery Network)`_ and `Red Hat Research D3N Cache for Data Centers`_.
+
+Implementation
+==============
+
+- The D3N cache supports both the `S3` and `Swift` object storage interfaces.
+- D3N currently caches only tail objects, because they are immutable (by default, these are the parts of objects larger than 4MB).
+ (The NGINX-based `RGW Data cache and CDN`_ supports caching of all object sizes.)
+
+
+Requirements
+------------
+
+- An SSD (/dev/nvme, /dev/pmem, /dev/shm) or similar block storage device, formatted
+ (filesystems other than XFS have not been tested) and mounted.
+ It will be used as the cache backing store.
+ (Depending on device performance, multiple RGWs may share a single device, but each requires
+ a discrete directory on the device's filesystem.)
+
+Limitations
+-----------
+
+- D3N will not cache objects compressed by `Rados Gateway Compression`_ (OSD level compression is supported).
+- D3N will not cache objects encrypted by `Rados Gateway Encryption`_.
+- D3N will be disabled if the ``rgw_max_chunk_size`` config variable value differs from the ``rgw_obj_stripe_size`` config variable value.
+
+
+D3N Environment Setup
+=====================
+
+Running
+-------
+
+To enable D3N on existing RGWs, the following configuration entries are required
+in each Rados Gateway's ceph.conf client section, for example for ``[client.rgw.8000]``::
+
+ [client.rgw.8000]
+ rgw_d3n_l1_local_datacache_enabled = true
+ rgw_d3n_l1_datacache_persistent_path = "/mnt/nvme0/rgw_datacache/client.rgw.8000/"
+ rgw_d3n_l1_datacache_size = 10737418240
+
+The above example assumes that the cache backing-store solid state device
+is mounted at `/mnt/nvme0` and has `10 GB` of free space available for the cache.
+
+The persistent path directory has to be created before starting the Gateway.
+(``mkdir -p /mnt/nvme0/rgw_datacache/client.rgw.8000/``)
+
+If another Gateway is co-located on the same machine, configure its persistent path to a discrete directory.
+For example, in the case of ``[client.rgw.8001]``, configure
+``rgw_d3n_l1_datacache_persistent_path = "/mnt/nvme0/rgw_datacache/client.rgw.8001/"``
+in the ``[client.rgw.8001]`` ceph.conf client section, as in the sketch below.
+
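+Putting this together, the co-located Gateway's section might look like the
+following sketch::
+
+ [client.rgw.8001]
+ rgw_d3n_l1_local_datacache_enabled = true
+ rgw_d3n_l1_datacache_persistent_path = "/mnt/nvme0/rgw_datacache/client.rgw.8001/"
+ rgw_d3n_l1_datacache_size = 10737418240
+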
+In a configuration with multiple co-located Gateways, consider assigning clients with different workloads
+to each Gateway, without a load balancer, in order to avoid duplication of cached data.
+
+.. note:: Each time the Rados Gateway is restarted, the content of the cache directory is purged.
+
+Logs
+----
+
+- D3N-related log lines in `radosgw.*.log` contain the string ``d3n`` (case insensitive).
+- Low-level D3N logs can be enabled via the ``debug_rgw_datacache`` subsystem (up to ``debug_rgw_datacache=30``).
+
+
+Config Reference
+================
+
+The following D3N-related settings can be added to the Ceph configuration file
+(i.e., usually ``ceph.conf``) under the ``[client.rgw.{instance-name}]`` section.
+
+.. confval:: rgw_d3n_l1_local_datacache_enabled
+.. confval:: rgw_d3n_l1_datacache_persistent_path
+.. confval:: rgw_d3n_l1_datacache_size
+.. confval:: rgw_d3n_l1_eviction_policy
+
+
+.. _MOC D3N (Datacenter-scale Data Delivery Network): https://massopen.cloud/research-and-development/cloud-research/d3n/
+.. _Red Hat Research D3N Cache for Data Centers: https://research.redhat.com/blog/research_project/d3n-multilayer-cache/
+.. _Rados Gateway Compression: ../compression/
+.. _Rados Gateway Encryption: ../encryption/
+.. _RGW Data cache and CDN: ../rgw-cache/
diff --git a/doc/radosgw/dynamicresharding.rst b/doc/radosgw/dynamicresharding.rst
new file mode 100644
index 000000000..b8bd68d9e
--- /dev/null
+++ b/doc/radosgw/dynamicresharding.rst
@@ -0,0 +1,238 @@
+.. _rgw_dynamic_bucket_index_resharding:
+
+===================================
+RGW Dynamic Bucket Index Resharding
+===================================
+
+.. versionadded:: Luminous
+
+A large bucket index can lead to performance problems, which can
+be addressed by sharding bucket indexes.
+Until Luminous, changing the number of bucket shards (resharding)
+needed to be done offline, with RGW services disabled.
+Since the Luminous release Ceph has supported online bucket resharding.
+
+Each bucket index shard can handle its entries efficiently up until
+reaching a certain threshold. If this threshold is
+exceeded the system can suffer from performance issues. The dynamic
+resharding feature detects this situation and automatically increases
+the number of shards used by a bucket's index, resulting in a
+reduction of the number of entries in each shard. This
+process is transparent to the user. Writes to the target bucket
+are briefly blocked (but reads are not) during the resharding process.
+
+By default dynamic bucket index resharding can only increase the
+number of bucket index shards to 1999, although this upper-bound is a
+configuration parameter (see Configuration below). When
+possible, the process chooses a prime number of shards in order to
+spread the number of entries across the bucket index
+shards more evenly.
+
+Detection of resharding opportunities runs as a background process
+that periodically
+scans all buckets. A bucket that requires resharding is added to
+a queue. A thread runs in the background and processes the queued
+resharding tasks, one at a time and in order.
+
+Multisite
+=========
+
+With Ceph releases prior to Reef, the Ceph Object Gateway (RGW) does not support
+dynamic resharding in a
+multisite environment. For information on dynamic resharding, see
+:ref:`Resharding <feature_resharding>` in the RGW multisite documentation.
+
+Configuration
+=============
+
+Enable/Disable dynamic bucket index resharding:
+
+- ``rgw_dynamic_resharding``: true/false, default: true
+
+Configuration options that control the resharding process:
+
+- ``rgw_max_objs_per_shard``: maximum number of objects per bucket index shard before resharding is triggered, default: 100000
+
+- ``rgw_max_dynamic_shards``: maximum number of bucket index shards that dynamic resharding can increase to, default: 1999
+
+- ``rgw_reshard_bucket_lock_duration``: duration, in seconds, that writes to the bucket are locked during resharding, default: 360 (i.e., 6 minutes)
+
+- ``rgw_reshard_thread_interval``: maximum time, in seconds, between rounds of resharding queue processing, default: 600 seconds (i.e., 10 minutes)
+
+- ``rgw_reshard_num_logs``: number of shards for the resharding queue, default: 16
+
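+As a sketch, assuming the cluster uses the centralized configuration database,
+these options can be adjusted with ``ceph config`` (the threshold shown is
+illustrative)::
+
+ ceph config set client.rgw rgw_max_objs_per_shard 50000
+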
+Admin commands
+==============
+
+Add a bucket to the resharding queue
+------------------------------------
+
+::
+
+ # radosgw-admin reshard add --bucket <bucket_name> --num-shards <new number of shards>
+
+List resharding queue
+---------------------
+
+::
+
+ # radosgw-admin reshard list
+
+Process tasks on the resharding queue
+-------------------------------------
+
+::
+
+ # radosgw-admin reshard process
+
+Bucket resharding status
+------------------------
+
+::
+
+ # radosgw-admin reshard status --bucket <bucket_name>
+
+The output is a JSON array containing, for each shard, an object with three fields: ``reshard_status``, ``new_bucket_instance_id``, and ``num_shards``.
+
+For example, the output at each dynamic resharding stage is shown below:
+
+``1. Before resharding occurred:``
+::
+
+ [
+ {
+ "reshard_status": "not-resharding",
+ "new_bucket_instance_id": "",
+ "num_shards": -1
+ }
+ ]
+
+``2. During resharding:``
+::
+
+ [
+ {
+ "reshard_status": "in-progress",
+ "new_bucket_instance_id": "1179f470-2ebf-4630-8ec3-c9922da887fd.8652.1",
+ "num_shards": 2
+ },
+ {
+ "reshard_status": "in-progress",
+ "new_bucket_instance_id": "1179f470-2ebf-4630-8ec3-c9922da887fd.8652.1",
+ "num_shards": 2
+ }
+ ]
+
+``3. After resharding completed:``
+::
+
+ [
+ {
+ "reshard_status": "not-resharding",
+ "new_bucket_instance_id": "",
+ "num_shards": -1
+ },
+ {
+ "reshard_status": "not-resharding",
+ "new_bucket_instance_id": "",
+ "num_shards": -1
+ }
+ ]
+
+
+Cancel pending bucket resharding
+--------------------------------
+
+.. note:: Bucket resharding operations cannot be cancelled while they are executing.
+
+::
+
+ # radosgw-admin reshard cancel --bucket <bucket_name>
+
+Manual immediate bucket resharding
+----------------------------------
+
+::
+
+ # radosgw-admin bucket reshard --bucket <bucket_name> --num-shards <new number of shards>
+
+When choosing a number of shards, the administrator must anticipate each
+bucket's peak number of objects. Ideally one should aim for no
+more than 100000 entries per shard at any given time.
+
+Additionally, bucket index shard counts that are prime numbers are more effective
+in evenly distributing bucket index entries.
+For example, 7001 bucket index shards is better than 7000
+since the former is prime. A variety of web sites have lists of prime
+numbers; search for "list of prime numbers" with your favorite
+search engine to locate some web sites.
+
+Troubleshooting
+===============
+
+Clusters prior to Luminous 12.2.11 and Mimic 13.2.5 left behind stale bucket
+instance entries, which were not automatically cleaned up. This issue also affected
+lifecycle policies, which were no longer applied to resharded buckets. Both of
+these issues could be worked around by running ``radosgw-admin`` commands.
+
+Stale instance management
+-------------------------
+
+List the stale instances in a cluster that are ready to be cleaned up.
+
+::
+
+ # radosgw-admin reshard stale-instances list
+
+Clean up the stale instances in a cluster. Note: cleanup of these
+instances should only be done on a single-site cluster.
+
+::
+
+ # radosgw-admin reshard stale-instances rm
+
+
+Lifecycle fixes
+---------------
+
+For clusters with resharded instances, it is highly likely that the old
+lifecycle process would have flagged and deleted lifecycle processing when the
+bucket instance changed during a reshard. While this is fixed for buckets
+deployed on newer Ceph releases (from Mimic 13.2.6 and Luminous 12.2.12),
+older buckets that had lifecycle policies and that have undergone
+resharding must be fixed manually.
+
+The command to do so is:
+
+::
+
+ # radosgw-admin lc reshard fix --bucket {bucketname}
+
+
+If the ``--bucket`` argument is not provided, this
+command will try to fix lifecycle policies for all the buckets in the cluster.
+
+Object Expirer fixes
+--------------------
+
+Objects subject to Swift object expiration on older clusters may have
+been dropped from the log pool and never deleted after the bucket was
+resharded. This would happen if their expiration time was before the
+cluster was upgraded, but if their expiration was after the upgrade
+the objects would be correctly handled. To manage these expire-stale
+objects, ``radosgw-admin`` provides two subcommands.
+
+Listing:
+
+::
+
+ # radosgw-admin objects expire-stale list --bucket {bucketname}
+
+Displays a list of object names and expiration times in JSON format.
+
+Deleting:
+
+::
+
+ # radosgw-admin objects expire-stale rm --bucket {bucketname}
+
+
+Initiates deletion of such objects, displaying a list of object names, expiration times, and deletion status in JSON format.
diff --git a/doc/radosgw/elastic-sync-module.rst b/doc/radosgw/elastic-sync-module.rst
new file mode 100644
index 000000000..60c806e87
--- /dev/null
+++ b/doc/radosgw/elastic-sync-module.rst
@@ -0,0 +1,181 @@
+=========================
+ElasticSearch Sync Module
+=========================
+
+.. versionadded:: Kraken
+
+.. note::
+ As of 31 May 2020, only Elasticsearch 6 and lower are supported. ElasticSearch 7 is not supported.
+
+This sync module writes the metadata from other zones to `ElasticSearch`_. As of
+Luminous, this is a JSON document of the data fields that we currently store in ElasticSearch:
+
+::
+
+ {
+ "_index" : "rgw-gold-ee5863d6",
+ "_type" : "object",
+ "_id" : "34137443-8592-48d9-8ca7-160255d52ade.34137.1:object1:null",
+ "_score" : 1.0,
+ "_source" : {
+ "bucket" : "testbucket123",
+ "name" : "object1",
+ "instance" : "null",
+ "versioned_epoch" : 0,
+ "owner" : {
+ "id" : "user1",
+ "display_name" : "user1"
+ },
+ "permissions" : [
+ "user1"
+ ],
+ "meta" : {
+ "size" : 712354,
+ "mtime" : "2017-05-04T12:54:16.462Z",
+ "etag" : "7ac66c0f148de9519b8bd264312c4d64"
+ }
+ }
+ }
+
+
+
+ElasticSearch tier type configurables
+-------------------------------------
+
+* ``endpoint``
+
+Specifies the Elasticsearch server endpoint to access.
+
+* ``num_shards`` (integer)
+
+The number of shards that Elasticsearch will be configured with on
+data sync initialization. Note that this cannot be changed after initialization.
+Any change here requires a rebuild of the Elasticsearch index and a reinitialization
+of the data sync process.
+
+* ``num_replicas`` (integer)
+
+The number of the replicas that Elasticsearch will be configured with
+on data sync initialization.
+
+* ``explicit_custom_meta`` (true | false)
+
+Specifies whether all user custom metadata will be indexed, or whether
+the user will need to configure (at the bucket level) which custom
+metadata entries should be indexed. This is false by default.
+
+* ``index_buckets_list`` (comma separated list of strings)
+
+If empty, all buckets will be indexed. Otherwise, only buckets
+specified here will be indexed. It is possible to provide bucket
+prefixes (e.g., foo\*), or bucket suffixes (e.g., \*bar).
+
+* ``approved_owners_list`` (comma separated list of strings)
+
+If empty, buckets of all owners will be indexed (subject to other
+restrictions), otherwise, only buckets owned by specified owners will
+be indexed. Suffixes and prefixes can also be provided.
+
+* ``override_index_path`` (string)
+
+If not empty, this string will be used as the Elasticsearch index
+path. Otherwise the index path will be determined and generated on
+sync initialization.
+
+
+End user metadata queries
+-------------------------
+
+.. versionadded:: Luminous
+
+Since the ElasticSearch cluster now stores object metadata, it is important that
+the ElasticSearch endpoint not be exposed to the public, and that it be accessible
+only to the cluster administrators. Exposing metadata queries to the end user
+poses a problem, since we want users to be able to query only their own metadata
+and not that of any other users; this would require the ElasticSearch cluster to
+authenticate users in a way similar to what RGW does, which poses a problem.
+
+As of Luminous, RGW in the metadata master zone can service end user
+requests. This avoids exposing the elasticsearch endpoint publicly, and
+also solves the authentication and authorization problem, since RGW itself can
+authenticate the end user requests. For this purpose RGW introduces a new query
+in the bucket APIs that can service elasticsearch requests. All these requests
+must be sent to the metadata master zone.
+
+Syntax
+~~~~~~
+
+Get an elasticsearch query
+``````````````````````````
+
+::
+
+ GET /{bucket}?query={query-expr}
+
+Request parameters:
+
+- ``max-keys``: maximum number of entries to return
+- ``marker``: pagination marker
+
+``expression := [(]<arg> <op> <value> [)][<and|or> ...]``
+
+The ``op`` is one of the following: ``<``, ``<=``, ``==``, ``>=``, ``>``
+
+For example ::
+
+ GET /?query=name==foo
+
+This will return all the indexed keys that the user has read permission to
+and that are named 'foo'.
+
+The output will be a list of keys in XML that is similar to the S3
+list buckets response.
+
+Configure custom metadata fields
+````````````````````````````````
+
+Define which custom metadata entries should be indexed (under the
+specified bucket), and what the types of these keys are. If explicit
+custom metadata indexing is configured, this is needed so that rgw
+will index the specified custom metadata values. Otherwise it is
+needed in cases where the indexed metadata keys are of a type other
+than string.
+
+::
+
+ POST /{bucket}?mdsearch
+ x-amz-meta-search: <key [; type]> [, ...]
+
+Multiple metadata fields must be comma separated; a type can be forced for a
+field with a ``;``. The currently allowed types are string (the default), integer, and
+date.
+
+For example, to index the custom object metadata x-amz-meta-year as an integer,
+x-amz-meta-release-date as a date, and x-amz-meta-title as a string, you would do the following:
+
+::
+
+ POST /mybooks?mdsearch
+ x-amz-meta-search: x-amz-meta-year;int, x-amz-meta-release-date;date, x-amz-meta-title;string
+
+
+Delete custom metadata configuration
+````````````````````````````````````
+
+Delete custom metadata bucket configuration.
+
+::
+
+ DELETE /<bucket>?mdsearch
+
+Get custom metadata configuration
+`````````````````````````````````
+
+Retrieve custom metadata bucket configuration.
+
+::
+
+ GET /<bucket>?mdsearch
+
+
+.. _`Elasticsearch`: https://github.com/elastic/elasticsearch
diff --git a/doc/radosgw/encryption.rst b/doc/radosgw/encryption.rst
new file mode 100644
index 000000000..e30fe1468
--- /dev/null
+++ b/doc/radosgw/encryption.rst
@@ -0,0 +1,96 @@
+==========
+Encryption
+==========
+
+.. versionadded:: Luminous
+
+The Ceph Object Gateway supports server-side encryption of uploaded objects,
+with 3 options for the management of encryption keys. Server-side encryption
+means that the data is sent over HTTP in its unencrypted form, and the Ceph
+Object Gateway stores that data in the Ceph Storage Cluster in encrypted form.
+
+.. note:: Requests for server-side encryption must be sent over a secure HTTPS
+ connection to avoid sending secrets in plaintext. If a proxy is used
+ for SSL termination, ``rgw trust forwarded https`` must be enabled
+ before forwarded requests will be trusted as secure.
+
+.. note:: Server-side encryption keys must be 256 bits long and base64-encoded.
+
+Customer-Provided Keys
+======================
+
+In this mode, the client passes an encryption key along with each request to
+read or write encrypted data. It is the client's responsibility to manage those
+keys and remember which key was used to encrypt each object.
+
+This is implemented in S3 according to the `Amazon SSE-C`_ specification.
+
+As all key management is handled by the client, no special Ceph configuration
+is needed to support this encryption mode.
+
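+For illustration, an SSE-C upload with the AWS CLI might look like the
+following sketch; the endpoint, bucket, and key file are hypothetical, and the
+key file must contain the raw 256-bit key::
+
+ aws --endpoint-url http://rgw.example.com:8000 s3 cp \
+ file.txt s3://mybucket/file.txt \
+ --sse-c AES256 --sse-c-key fileb://sse-c.key
+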
+Key Management Service
+======================
+
+In this mode, an administrator stores keys in a secure key management service.
+These keys are then
+retrieved on demand by the Ceph Object Gateway to serve requests to encrypt
+or decrypt data.
+
+This is implemented in S3 according to the `Amazon SSE-KMS`_ specification.
+
+In principle, any key management service could be used here. Currently
+integration with `Barbican`_, `Vault`_, and `KMIP`_ are implemented.
+
+See `OpenStack Barbican Integration`_, `HashiCorp Vault Integration`_,
+and `KMIP Integration`_.
+
+SSE-S3
+======
+
+This mode makes key management invisible to the user. The keys are still stored
+in Vault, but they are automatically created and deleted by Ceph and
+retrieved as required to serve requests to encrypt
+or decrypt data.
+
+This is implemented in S3 according to the `Amazon SSE-S3`_ specification.
+
+In principle, any key management service could be used here. Currently
+only integration with `Vault`_ is implemented.
+
+See `HashiCorp Vault Integration`_.
+
+Bucket Encryption APIs
+======================
+
+The Bucket Encryption APIs support server-side encryption with Amazon
+S3-managed keys (SSE-S3) or AWS KMS customer master keys (SSE-KMS).
+
+See `PutBucketEncryption`_, `GetBucketEncryption`_, `DeleteBucketEncryption`_
+
+Automatic Encryption (for testing only)
+=======================================
+
+A ``rgw crypt default encryption key`` can be set in ceph.conf to force the
+encryption of all objects that do not otherwise specify an encryption mode.
+
+The configuration expects a base64-encoded 256 bit key. For example::
+
+ rgw crypt default encryption key = 4YSmvJtBv0aZ7geVgAsdpRnLBEwWSWlMIGnRS8a9TSA=
+
+.. important:: This mode is for diagnostic purposes only! The ceph configuration
+ file is not a secure method for storing encryption keys. Keys that are
+ accidentally exposed in this way should be considered compromised.
+
+
+.. _Amazon SSE-C: https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html
+.. _Amazon SSE-KMS: http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html
+.. _Amazon SSE-S3: https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingServerSideEncryption.html
+.. _Barbican: https://wiki.openstack.org/wiki/Barbican
+.. _Vault: https://www.vaultproject.io/docs/
+.. _KMIP: http://www.oasis-open.org/committees/kmip/
+.. _PutBucketEncryption: https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutBucketEncryption.html
+.. _GetBucketEncryption: https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetBucketEncryption.html
+.. _DeleteBucketEncryption: https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteBucketEncryption.html
+.. _OpenStack Barbican Integration: ../barbican
+.. _HashiCorp Vault Integration: ../vault
+.. _KMIP Integration: ../kmip
diff --git a/doc/radosgw/frontends.rst b/doc/radosgw/frontends.rst
new file mode 100644
index 000000000..45b29cb6f
--- /dev/null
+++ b/doc/radosgw/frontends.rst
@@ -0,0 +1,163 @@
+.. _rgw_frontends:
+
+==============
+HTTP Frontends
+==============
+
+.. contents::
+
+The Ceph Object Gateway supports two embedded HTTP frontend libraries
+that can be configured with ``rgw_frontends``. See `Config Reference`_
+for details about the syntax.
+
+Beast
+=====
+
+.. versionadded:: Mimic
+
+The ``beast`` frontend uses the Boost.Beast library for HTTP parsing
+and the Boost.Asio library for asynchronous network I/O.
+
+Options
+-------
+
+``port`` and ``ssl_port``
+
+:Description: Sets the IPv4 and IPv6 listening port number. Can be specified multiple
+ times as in ``port=80 port=8000``.
+:Type: Integer
+:Default: ``80``
+
+
+``endpoint`` and ``ssl_endpoint``
+
+:Description: Sets the listening address in the form ``address[:port]``, where
+ the address is an IPv4 address string in dotted decimal form, or
+ an IPv6 address in hexadecimal notation surrounded by square
+ brackets. An endpoint given as an IPv6 address will listen on IPv6 only. The
+ optional port defaults to 80 for ``endpoint`` and 443 for
+ ``ssl_endpoint``. Can be specified multiple times as in
+ ``endpoint=[::1] endpoint=192.168.0.100:8000``.
+
+:Type: String
+:Default: None
+
+
+``ssl_certificate``
+
+:Description: Path to the SSL certificate file used for SSL-enabled endpoints.
+ If path is prefixed with ``config://``, the certificate will be
+ pulled from the ceph monitor ``config-key`` database.
+
+:Type: String
+:Default: None
+
+
+``ssl_private_key``
+
+:Description: Optional path to the private key file used for SSL-enabled
+ endpoints. If one is not given, the ``ssl_certificate`` file
+ is used as the private key.
+ If path is prefixed with ``config://``, the certificate will be
+ pulled from the ceph monitor ``config-key`` database.
+
+:Type: String
+:Default: None
+
+``ssl_options``
+
+:Description: Optional colon separated list of ssl context options:
+
+ ``default_workarounds`` Implement various bug workarounds.
+
+ ``no_compression`` Disable compression.
+
+ ``no_sslv2`` Disable SSL v2.
+
+ ``no_sslv3`` Disable SSL v3.
+
+ ``no_tlsv1`` Disable TLS v1.
+
+ ``no_tlsv1_1`` Disable TLS v1.1.
+
+ ``no_tlsv1_2`` Disable TLS v1.2.
+
+ ``single_dh_use`` Always create a new key when using tmp_dh parameters.
+
+:Type: String
+:Default: ``no_sslv2:no_sslv3:no_tlsv1:no_tlsv1_1``
+
+``ssl_ciphers``
+
+:Description: Optional list of one or more cipher strings separated by colons.
+ The format of the string is described in openssl's ciphers(1)
+ manual.
+
+:Type: String
+:Default: None
+
+``tcp_nodelay``
+
+:Description: If set, this socket option will disable Nagle's algorithm on
+ the connection, which means that packets will be sent as soon
+ as possible instead of waiting for a full buffer or timeout to occur.
+
+ ``1`` Disable Nagle's algorithm for all sockets.
+
+ ``0`` Keep the default: Nagle's algorithm enabled.
+
+:Type: Integer (0 or 1)
+:Default: 0
+
+``max_connection_backlog``
+
+:Description: Optional value to define the maximum size for the queue of
+ connections waiting to be accepted. If not configured, the value
+ from ``boost::asio::socket_base::max_connections`` will be used.
+
+:Type: Integer
+:Default: None
+
+``request_timeout_ms``
+
+:Description: The amount of time in milliseconds that Beast will wait
+ for more incoming data or outgoing data before giving up.
+ Setting this value to 0 will disable timeout.
+
+:Type: Integer
+:Default: ``65000``
+
+``rgw_thread_pool_size``
+
+:Description: Sets the number of threads spawned by Beast to handle
+ incoming HTTP connections. This effectively limits the number
+ of concurrent connections that the frontend can service.
+
+:Type: Integer
+:Default: ``512``
+
+``max_header_size``
+
+:Description: The maximum number of header bytes available for a single request.
+
+:Type: Integer
+:Default: ``16384``
+:Maximum: ``65536``
+
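+As an illustrative sketch combining several of the options above (the ports,
+paths, and instance name are hypothetical)::
+
+ [client.rgw.gateway1]
+ rgw_frontends = beast endpoint=0.0.0.0:8000 ssl_endpoint=0.0.0.0:8443 ssl_certificate=/etc/ceph/rgw.crt ssl_private_key=/etc/ceph/rgw.key request_timeout_ms=30000
+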
+
+Generic Options
+===============
+
+Some frontend options are generic and supported by all frontends:
+
+``prefix``
+
+:Description: A prefix string that is inserted into the URI of all
+ requests. For example, a swift-only frontend could supply
+ a URI prefix of ``/swift``.
+
+:Type: String
+:Default: None
+
+
+.. _Config Reference: ../config-ref
diff --git a/doc/radosgw/index.rst b/doc/radosgw/index.rst
new file mode 100644
index 000000000..704436202
--- /dev/null
+++ b/doc/radosgw/index.rst
@@ -0,0 +1,87 @@
+.. _object-gateway:
+
+=====================
+ Ceph Object Gateway
+=====================
+
+:term:`Ceph Object Gateway` is an object storage interface built on top of
+``librados``. It provides a RESTful gateway between applications and Ceph
+Storage Clusters. :term:`Ceph Object Storage` supports two interfaces:
+
+#. **S3-compatible:** Provides object storage functionality with an interface
+ that is compatible with a large subset of the Amazon S3 RESTful API.
+
+#. **Swift-compatible:** Provides object storage functionality with an interface
+ that is compatible with a large subset of the OpenStack Swift API.
+
+Ceph Object Storage uses the Ceph Object Gateway daemon (``radosgw``), an HTTP
+server designed for interacting with a Ceph Storage Cluster. The Ceph Object
+Gateway provides interfaces that are compatible with both Amazon S3 and
+OpenStack Swift, and it has its own user management. Ceph Object Gateway can
+store data in the same Ceph Storage Cluster in which data from Ceph File System
+clients and Ceph Block Device clients is stored. The S3 API and the Swift API
+share a common namespace, which makes it possible to write data to a Ceph
+Storage Cluster with one API and then retrieve that data with the other API.
+
+.. ditaa::
+
+ +------------------------+ +------------------------+
+ | S3 compatible API | | Swift compatible API |
+ +------------------------+-+------------------------+
+ | radosgw |
+ +---------------------------------------------------+
+ | librados |
+ +------------------------+-+------------------------+
+ | OSDs | | Monitors |
+ +------------------------+ +------------------------+
+
+.. note:: Ceph Object Storage does **NOT** use the Ceph Metadata Server.
+
+
+.. toctree::
+ :maxdepth: 1
+
+ HTTP Frontends <frontends>
+ Multisite Configuration <multisite>
+ Pool Placement and Storage Classes <placement>
+ Multisite Sync Policy Configuration <multisite-sync-policy>
+ Configuring Pools <pools>
+ Config Reference <config-ref>
+ Admin Guide <admin>
+ S3 API <s3>
+ Data caching and CDN <rgw-cache>
+ Swift API <swift>
+ Admin Ops API <adminops>
+ Python binding <api>
+ Export over NFS <nfs>
+ OpenStack Keystone Integration <keystone>
+ OpenStack Barbican Integration <barbican>
+ HashiCorp Vault Integration <vault>
+ KMIP Integration <kmip>
+ Open Policy Agent Integration <opa>
+ Multi-tenancy <multitenancy>
+ Compression <compression>
+ LDAP Authentication <ldap-auth>
+ Server-Side Encryption <encryption>
+ Bucket Policy <bucketpolicy>
+ Dynamic bucket index resharding <dynamicresharding>
+ Multi factor authentication <mfa>
+ Sync Modules <sync-modules>
+ Bucket Notifications <notifications>
+ Data Layout in RADOS <layout>
+ STS <STS>
+ STS Lite <STSLite>
+ Keycloak <keycloak>
+ Session Tags <session-tags>
+ Role <role>
+ Orphan List and Associated Tooling <orphans>
+ OpenID Connect Provider <oidc>
+ troubleshooting
+ Manpage radosgw <../../man/8/radosgw>
+ Manpage radosgw-admin <../../man/8/radosgw-admin>
+ QAT Acceleration for Encryption and Compression <qat-accel>
+ S3-select <s3select>
+ Lua Scripting <lua-scripting>
+ D3N Data Cache <d3n_datacache>
+ Cloud Transition <cloud-transition>
+
diff --git a/doc/radosgw/keycloak.rst b/doc/radosgw/keycloak.rst
new file mode 100644
index 000000000..ec285a62f
--- /dev/null
+++ b/doc/radosgw/keycloak.rst
@@ -0,0 +1,138 @@
+.. _radosgw_keycloak:
+
+=================================
+Integrating Keycloak with RadosGW
+=================================
+
+If Keycloak is set up as an OpenID Connect Identity Provider, it can be used by
+mobile apps and web apps to authenticate their users. By using the web token
+returned by the authentication process, a mobile app or web app can call
+AssumeRoleWithWebIdentity, receive a set of temporary S3 credentials, and use
+those credentials to make S3 calls.
+
+Setting up Keycloak
+===================
+
+Documentation for installing and operating Keycloak can be found here:
+https://www.keycloak.org/guides.
+
+Configuring Keycloak to talk to RGW
+===================================
+
+To configure Keycloak to talk to RGW, add the following configurables::
+
+ [client.radosgw.gateway]
+ rgw sts key = {sts key for encrypting/decrypting the session token}
+ rgw s3 auth use sts = true
+
+Fetching a web token with Keycloak
+==================================
+
+Several examples of apps authenticating with Keycloak can be found here:
+https://github.com/keycloak/keycloak-quickstarts/blob/latest/docs/getting-started.md.
+
+Consider, for example, the app-profile-jee-jsp app (in the link
+above). To fetch the access token (web token) for such an application using the
+'client_credentials' grant type, use the client ID and client secret as
+follows::
+
+ KC_REALM=demo
+ KC_CLIENT=<client id>
+ KC_CLIENT_SECRET=<client secret>
+ KC_SERVER=<host>:8080
+ KC_CONTEXT=auth
+
+ # Request Tokens for credentials
+ KC_RESPONSE=$( \
+ curl -k -v -X POST \
+ -H "Content-Type: application/x-www-form-urlencoded" \
+ -d "scope=openid" \
+ -d "grant_type=client_credentials" \
+ -d "client_id=$KC_CLIENT" \
+ -d "client_secret=$KC_CLIENT_SECRET" \
+ "http://$KC_SERVER/$KC_CONTEXT/realms/$KC_REALM/protocol/openid-connect/token" \
+ | jq .
+ )
+
+ KC_ACCESS_TOKEN=$(echo $KC_RESPONSE | jq -r .access_token)
+
+It is also possible to fetch an access token for a particular user with the
+grant type 'password'. To fetch such an access token, use client id, client
+secret, username, and password as follows::
+
+ KC_REALM=demo
+ KC_USERNAME=<username>
+ KC_PASSWORD=<userpassword>
+ KC_CLIENT=<client id>
+ KC_CLIENT_SECRET=<client secret>
+ KC_SERVER=<host>:8080
+ KC_CONTEXT=auth
+
+ # Request Tokens for credentials
+ KC_RESPONSE=$( \
+ curl -k -v -X POST \
+ -H "Content-Type: application/x-www-form-urlencoded" \
+ -d "scope=openid" \
+ -d "grant_type=password" \
+ -d "client_id=$KC_CLIENT" \
+ -d "client_secret=$KC_CLIENT_SECRET" \
+ -d "username=$KC_USERNAME" \
+ -d "password=$KC_PASSWORD" \
+ "http://$KC_SERVER/$KC_CONTEXT/realms/$KC_REALM/protocol/openid-connect/token" \
+ | jq .
+ )
+
+ KC_ACCESS_TOKEN=$(echo $KC_RESPONSE | jq -r .access_token)
+
+``KC_ACCESS_TOKEN`` can be used to invoke ``AssumeRoleWithWebIdentity``: see
+:doc:`STS`.
+
+Adding tags to a user in Keycloak
+=================================
+
+To create a user in Keycloak and add tags to it as its attributes, follow these
+steps:
+
+#. Add a user:
+
+ .. image:: ../images/keycloak-adduser.png
+ :align: center
+
+#. Add user details:
+
+ .. image:: ../images/keycloak-userdetails.png
+ :align: center
+
+#. Add user credentials:
+
+ .. image:: ../images/keycloak-usercredentials.png
+ :align: center
+
+#. Add tags to the 'attributes' tab of the user:
+
+ .. image:: ../images/keycloak-usertags.png
+ :align: center
+
+#. Add a protocol mapper that maps the user attribute to a client:
+
+ .. image:: ../images/keycloak-userclientmapper.png
+ :align: center
+
+After these steps have been completed, the tag 'Department' will appear in the
+JWT (web token), under the 'https://aws.amazon.com/tags' namespace.
+
+Tags can be verified by performing token introspection on a JWT. To introspect
+a token, use ``client id`` and ``client secret`` as follows::
+
+ KC_REALM=demo
+ KC_CLIENT=<client id>
+ KC_CLIENT_SECRET=<client secret>
+ KC_SERVER=<host>:8080
+ KC_CONTEXT=auth
+
+ curl -k -v \
+ -X POST \
+ -u "$KC_CLIENT:$KC_CLIENT_SECRET" \
+ -d "token=$KC_ACCESS_TOKEN" \
+ "http://$KC_SERVER/$KC_CONTEXT/realms/$KC_REALM/protocol/openid-connect/token/introspect" \
+ | jq .
diff --git a/doc/radosgw/keystone.rst b/doc/radosgw/keystone.rst
new file mode 100644
index 000000000..20edc3d24
--- /dev/null
+++ b/doc/radosgw/keystone.rst
@@ -0,0 +1,179 @@
+=====================================
+ Integrating with OpenStack Keystone
+=====================================
+
+It is possible to integrate the Ceph Object Gateway with Keystone, the OpenStack
+identity service. This sets up the gateway to accept Keystone as the users'
+authority. A user that Keystone authorizes to access the gateway will also be
+created automatically on the Ceph Object Gateway (if it did not exist
+beforehand). A token that Keystone validates will be considered valid by the
+gateway.
+
+The following configuration options are available for Keystone integration::
+
+ [client.radosgw.gateway]
+ rgw keystone api version = {keystone api version}
+ rgw keystone url = {keystone server url:keystone server admin port}
+ rgw keystone admin token = {keystone admin token}
+ rgw keystone admin token path = {path to keystone admin token} # preferred
+ rgw keystone accepted roles = {accepted user roles}
+ rgw keystone token cache size = {number of tokens to cache}
+ rgw keystone implicit tenants = {true for private tenant for each new user}
+
+It is also possible to configure a Keystone service tenant, user, and password
+for Keystone (for the v2.0 version of the OpenStack Identity API), similar to
+the way OpenStack services tend to be configured. This avoids the need to set
+the shared secret ``rgw keystone admin token`` in the configuration file, which
+should be disabled in production environments. The service tenant credentials
+should have admin privileges; for more details refer to the `OpenStack Keystone
+documentation`_, which explains the process in detail. The requisite
+configuration options are::
+
+ rgw keystone admin user = {keystone service tenant user name}
+ rgw keystone admin password = {keystone service tenant user password}
+ rgw keystone admin password = {keystone service tenant user password path} # preferred
+ rgw keystone admin tenant = {keystone service tenant name}
+
+
+A Ceph Object Gateway user is mapped into a Keystone ``tenant``. A Keystone user
+has different roles assigned to it on possibly more than a single tenant. When
+the Ceph Object Gateway gets the ticket, it looks at the tenant, and the user
+roles that are assigned to that ticket, and accepts/rejects the request
+according to the ``rgw keystone accepted roles`` configurable.
+
+For the v3 version of the OpenStack Identity API you should replace
+``rgw keystone admin tenant`` with::
+
+ rgw keystone admin domain = {keystone admin domain name}
+ rgw keystone admin project = {keystone admin project name}
+
+For compatibility with previous versions of Ceph, it is also
+possible to set ``rgw keystone implicit tenants`` to either
+``s3`` or ``swift``. This has the effect of splitting
+the identity space such that the indicated protocol will
+only use implicit tenants, and the other protocol will
+never use implicit tenants. Some older versions of Ceph
+only supported implicit tenants with Swift.
+
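+For example, to use implicit tenants only for the Swift protocol (a sketch
+using one of the values named above)::
+
+    rgw keystone implicit tenants = swift
+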
+Ocata (and later)
+-----------------
+
+Keystone itself needs to be configured to point to the Ceph Object Gateway as an
+object-storage endpoint::
+
+ openstack service create --name=swift \
+ --description="Swift Service" \
+ object-store
+ +-------------+----------------------------------+
+ | Field | Value |
+ +-------------+----------------------------------+
+ | description | Swift Service |
+ | enabled | True |
+ | id | 37c4c0e79571404cb4644201a4a6e5ee |
+ | name | swift |
+ | type | object-store |
+ +-------------+----------------------------------+
+
+ openstack endpoint create --region RegionOne \
+ --publicurl "http://radosgw.example.com:8080/swift/v1" \
+ --adminurl "http://radosgw.example.com:8080/swift/v1" \
+ --internalurl "http://radosgw.example.com:8080/swift/v1" \
+ swift
+ +--------------+------------------------------------------+
+ | Field | Value |
+ +--------------+------------------------------------------+
+ | adminurl | http://radosgw.example.com:8080/swift/v1 |
+ | id | e4249d2b60e44743a67b5e5b38c18dd3 |
+ | internalurl | http://radosgw.example.com:8080/swift/v1 |
+ | publicurl | http://radosgw.example.com:8080/swift/v1 |
+ | region | RegionOne |
+ | service_id | 37c4c0e79571404cb4644201a4a6e5ee |
+ | service_name | swift |
+ | service_type | object-store |
+ +--------------+------------------------------------------+
+
+ openstack endpoint show object-store
+ +--------------+------------------------------------------+
+ | Field | Value |
+ +--------------+------------------------------------------+
+ | adminurl | http://radosgw.example.com:8080/swift/v1 |
+ | enabled | True |
+ | id | e4249d2b60e44743a67b5e5b38c18dd3 |
+ | internalurl | http://radosgw.example.com:8080/swift/v1 |
+ | publicurl | http://radosgw.example.com:8080/swift/v1 |
+ | region | RegionOne |
+ | service_id | 37c4c0e79571404cb4644201a4a6e5ee |
+ | service_name | swift |
+ | service_type | object-store |
+ +--------------+------------------------------------------+
+
+.. note:: If your radosgw ``ceph.conf`` sets the configuration option
+ ``rgw swift account in url = true``, your ``object-store``
+ endpoint URLs must be set to include the suffix
+ ``/v1/AUTH_%(tenant_id)s`` (instead of just ``/v1``).
+
+The Keystone URL is the Keystone admin RESTful API URL. The admin token is the
+token that is configured internally in Keystone for admin requests.
+
+OpenStack Keystone may be terminated with a self-signed SSL certificate. In
+order for radosgw to interact with Keystone in such a case, you can install
+Keystone's SSL certificate on the node running radosgw. Alternatively, radosgw
+can be made to not verify the SSL certificate at all (similar to OpenStack
+clients with a ``--insecure`` switch) by setting the value of the
+configurable ``rgw keystone verify ssl`` to false.
+
+
+.. _OpenStack Keystone documentation: http://docs.openstack.org/developer/keystone/configuringservices.html#setting-up-projects-users-and-roles
+
+Cross Project(Tenant) Access
+----------------------------
+
+In order to let a project (earlier called a 'tenant') access buckets belonging to a different project, the following config option needs to be enabled::
+
+ rgw swift account in url = true
+
+The Keystone object-store endpoint must accordingly be configured to include the ``AUTH_%(project_id)s`` suffix::
+
+ openstack endpoint create --region RegionOne \
+ --publicurl "http://radosgw.example.com:8080/swift/v1/AUTH_%(project_id)s" \
+ --adminurl "http://radosgw.example.com:8080/swift/v1/AUTH_%(project_id)s" \
+ --internalurl "http://radosgw.example.com:8080/swift/v1/AUTH_%(project_id)s" \
+ swift
+ +--------------+--------------------------------------------------------------+
+ | Field        | Value                                                        |
+ +--------------+--------------------------------------------------------------+
+ | adminurl     | http://radosgw.example.com:8080/swift/v1/AUTH_%(project_id)s |
+ | id           | e4249d2b60e44743a67b5e5b38c18dd3                             |
+ | internalurl  | http://radosgw.example.com:8080/swift/v1/AUTH_%(project_id)s |
+ | publicurl    | http://radosgw.example.com:8080/swift/v1/AUTH_%(project_id)s |
+ | region       | RegionOne                                                    |
+ | service_id   | 37c4c0e79571404cb4644201a4a6e5ee                             |
+ | service_name | swift                                                        |
+ | service_type | object-store                                                 |
+ +--------------+--------------------------------------------------------------+
+
+Keystone integration with the S3 API
+------------------------------------
+
+It is possible to use Keystone for authentication even when using the
+S3 API (with AWS-like access and secret keys), if the ``rgw s3 auth
+use keystone`` option is set. For details, see
+:doc:`s3/authentication`.
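+
+A minimal sketch of enabling that option::
+
+    [client.radosgw.gateway]
+    rgw s3 auth use keystone = true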
+
+Service token support
+---------------------
+
+Service tokens can be enabled in the RadosGW Keystone integration to allow
+expired tokens when they are coupled with a valid service token in the
+request.
+
+Enable the support with ``rgw keystone service token enabled`` and use the
+``rgw keystone service token accepted roles`` option to specify which roles
+are considered service roles.
+
+The ``rgw keystone expired token cache expiration`` option can be used to tune
+the cache expiration for an expired token allowed with a service token. Note
+that this must be lower than the ``[token]/allow_expired_window`` option in
+the Keystone configuration.
+
+Enabling this will cause an expired token given in the ``X-Auth-Token`` header
+to be allowed if it is coupled with an ``X-Service-Token`` header that contains
+a valid token with the accepted roles. This can allow long-running processes
+that use a user token in ``X-Auth-Token`` to function beyond the expiration of
+the token.
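+
+A minimal sketch of the options named above (the accepted role and cache
+expiration values are illustrative assumptions)::
+
+    rgw keystone service token enabled = true
+    rgw keystone service token accepted roles = service
+    rgw keystone expired token cache expiration = 1000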
diff --git a/doc/radosgw/kmip.rst b/doc/radosgw/kmip.rst
new file mode 100644
index 000000000..988897121
--- /dev/null
+++ b/doc/radosgw/kmip.rst
@@ -0,0 +1,219 @@
+================
+KMIP Integration
+================
+
+`KMIP`_ can be used as a secure key management service for
+`Server-Side Encryption`_ (SSE-KMS).
+
+.. ditaa::
+
+ +---------+ +---------+ +------+ +-------+
+ | Client | | RadosGW | | KMIP | | OSD |
+ +---------+ +---------+ +------+ +-------+
+ | create secret | | |
+ | key for key ID | | |
+ |-----------------+---------------->| |
+ | | | |
+ | upload object | | |
+ | with key ID | | |
+ |---------------->| request secret | |
+ | | key for key ID | |
+ | |---------------->| |
+ | |<----------------| |
+ | | return secret | |
+ | | key | |
+ | | | |
+ | | encrypt object | |
+ | | with secret key | |
+ | |--------------+ | |
+ | | | | |
+ | |<-------------+ | |
+ | | | |
+ | | store encrypted | |
+ | | object | |
+ | |------------------------------>|
+
+#. `Setting KMIP Access for Ceph`_
+#. `Creating Keys in KMIP`_
+#. `Configure the Ceph Object Gateway`_
+#. `Upload object`_
+
+Before you can use KMIP with Ceph, you will need to do three things. You will
+need to associate Ceph with client information in KMIP, and configure Ceph to
+use that client information. You will also need to create one or more keys in
+KMIP.
+
+Setting KMIP Access for Ceph
+============================
+
+Setting up Ceph in KMIP is very dependent on the mechanism(s) supported
+by your implementation of KMIP. Two implementations are described
+here:
+
+1. `IBM Security Guardium Key Lifecycle Manager (SKLM)`__. This is a well
+ supported commercial product.
+
+__ SKLM_
+
+2. PyKMIP_. This is a small python project, suitable for experimental
+ and testing use only.
+
+Using IBM SKLM
+--------------
+
+IBM SKLM__ supports client authentication using certificates.
+Certificates may either be self-signed certificates created, for
+instance, using openssl, or they may be created using SKLM. Ceph
+should then be configured (see below) to use KMIP, and an attempt
+made to use it. This attempt will fail, but it will leave an
+"untrusted client device certificate" in SKLM. This can then be
+upgraded to a registered client using the web interface to complete
+the registration process.
+
+__ SKLM_
+
+Find untrusted clients under ``Advanced Configuration``,
+``Client Device Communication Certificates``. Select
+``Modify SSL/KMIP Certificates for Clients``, then toggle the flag
+``allow the server to trust this certificate and communicate...``.
+
+Using PyKMIP
+------------
+
+PyKMIP_ has no special registration process; it simply trusts the
+certificate. However, the certificate has to be issued by a
+certificate authority that is trusted by PyKMIP. PyKMIP also prefers
+that the certificate contain an extension for "extended key usage".
+However, that requirement can be defeated by specifying
+``enable_tls_client_auth=False`` in the server configuration.
+
+Creating Keys in KMIP
+=====================
+
+Some KMIP implementations come with a web interface or other
+administrative tools to create and manage keys. Refer to their
+documentation if you wish to use them. The KMIP protocol can also
+be used to create and manage keys. PyKMIP comes with a python client
+library that can be used this way.
+
+In preparation for using the PyKMIP client, you'll need to have a valid
+KMIP client key and certificate, such as the ones you created for Ceph.
+
+Next, download and install it::
+
+ virtualenv $HOME/my-kmip-env
+ source $HOME/my-kmip-env/bin/activate
+ pip install pykmip
+
+Then you'll need to prepare a configuration file
+for the client, something like this::
+
+ cat <<EOF >$HOME/my-kmip-configuration
+ [client]
+ host={hostname}
+ port=5696
+ certfile={clientcert}
+ keyfile={clientkey}
+ ca_certs={clientca}
+ ssl_version=PROTOCOL_TLSv1_2
+ EOF
+
+You will need to replace {hostname} with the name of your KMIP host, and
+replace {clientcert}, {clientkey}, and {clientca} with pathnames to
+suitable PEM-encoded certificates, such as the ones you created for
+Ceph to use.
+
+Now, you can run this python script directly from
+the shell::
+
+ python
+ from kmip.pie import client
+ from kmip import enums
+ import ssl
+ import os
+ import sys
+ import json
+ c = client.ProxyKmipClient(config_file=os.environ['HOME']+"/my-kmip-configuration")
+
+ while True:
+ l=sys.stdin.readline()
+ keyname=l.strip()
+ if keyname == "": break
+ with c:
+ key_id = c.create(
+ enums.CryptographicAlgorithm.AES,
+ 256,
+ operation_policy_name='default',
+ name=keyname,
+ cryptographic_usage_mask=[
+ enums.CryptographicUsageMask.ENCRYPT,
+ enums.CryptographicUsageMask.DECRYPT
+ ]
+ )
+ c.activate(key_id)
+ attrs = c.get_attributes(uid=key_id)
+ r = {}
+ for a in attrs[1]:
+ r[str(a.attribute_name)] = str(a.attribute_value)
+ print (json.dumps(r))
+
+If this is all entered at the shell prompt, python will
+prompt with ">>>" then "..." until the script is read in,
+after which it will read and process names with no prompt
+ until a blank line or end of file (^D) is given to it, or
+an error occurs. Of course you can turn this into a regular
+python script if you prefer.
+
+Configure the Ceph Object Gateway
+=================================
+
+Edit the Ceph configuration file to enable KMIP as a KMS backend for
+server-side encryption::
+
+ rgw crypt s3 kms backend = kmip
+ rgw crypt kmip ca path = /etc/ceph/kmiproot.crt
+ rgw crypt kmip client cert = /etc/ceph/kmip-client.crt
+ rgw crypt kmip client key = /etc/ceph/private/kmip-client.key
+ rgw crypt kmip kms key template = pykmip-$keyid
+
+You may need to change the paths above to match where
+you actually want to store KMIP certificate data.
+
+The KMIP key template describes how Ceph will modify
+the name given to it before looking it up in KMIP.
+The default is just "$keyid". If you don't want Ceph
+to see all your KMIP keys, you can use this to limit
+Ceph to just the designated subset of your KMIP key
+namespace.
+
+Upload object
+=============
+
+When uploading an object to the Gateway, provide the SSE key ID in the request.
+As an example, using the AWS command-line client::
+
+ aws --endpoint=http://radosgw:8000 s3 cp plaintext.txt \
+ s3://mybucket/encrypted.txt --sse=aws:kms --sse-kms-key-id mybucketkey
+
+The Object Gateway will fetch the key from KMIP, encrypt the object, and store
+it in the bucket. Any request to download the object will make the Gateway
+automatically retrieve the corresponding key from KMIP and decrypt the object.
+
+Note that the secret will be fetched from KMIP using a name constructed
+from the key template, replacing ``$keyid`` with the key provided.
+
+With the Ceph configuration given above,
+radosgw would fetch the secret from::
+
+ pykmip-mybucketkey
+
+.. _Server-Side Encryption: ../encryption
+.. _KMIP: http://www.oasis-open.org/committees/kmip/
+.. _SKLM: https://www.ibm.com/products/ibm-security-key-lifecycle-manager
+.. _PyKMIP: https://pykmip.readthedocs.io/en/latest/
diff --git a/doc/radosgw/layout.rst b/doc/radosgw/layout.rst
new file mode 100644
index 000000000..723adf827
--- /dev/null
+++ b/doc/radosgw/layout.rst
@@ -0,0 +1,208 @@
+===========================
+ Rados Gateway Data Layout
+===========================
+
+Although the source code is the ultimate guide, this document helps
+new developers to get up to speed with the implementation details.
+
+Introduction
+------------
+
+Swift offers something called a *container*, which we use interchangeably with
+the term *bucket*, so we say that RGW's buckets implement Swift containers.
+
+This document does not consider how RGW operates on these structures,
+e.g. the use of encode() and decode() methods for serialization and so on.
+
+Conceptual View
+---------------
+
+Although RADOS only knows about pools and objects with their xattrs and
+omap[1], conceptually RGW organizes its data into three different kinds:
+metadata, bucket index, and data.
+
+Metadata
+^^^^^^^^
+
+We have three 'sections' of metadata: 'user', 'bucket', and 'bucket.instance'.
+You can use the following commands to introspect metadata entries: ::
+
+ $ radosgw-admin metadata list
+ $ radosgw-admin metadata list bucket
+ $ radosgw-admin metadata list bucket.instance
+ $ radosgw-admin metadata list user
+
+ $ radosgw-admin metadata get bucket:<bucket>
+ $ radosgw-admin metadata get bucket.instance:<bucket>:<bucket_id>
+ $ radosgw-admin metadata get user:<user> # get or set
+
+Some variables are used in the above commands:
+
+- user: Holds user information
+- bucket: Holds a mapping between bucket name and bucket instance id
+- bucket.instance: Holds bucket instance information[2]
+
+Every metadata entry is kept on a single RADOS object. See below for implementation details.
+
+Note that the metadata is not indexed. When listing a metadata section we do a
+RADOS ``pgls`` operation on the containing pool.
+
+Bucket Index
+^^^^^^^^^^^^
+
+It's a different kind of metadata, and kept separately. The bucket index holds
+a key-value map in RADOS objects. By default it is a single RADOS object per
+bucket, but it is possible since Hammer to shard that map over multiple RADOS
+objects. The map itself is kept in omap, associated with each RADOS object.
+The key of each omap entry is the name of the object, and the value holds some
+basic metadata of that object -- metadata that shows up when listing the bucket.
+Also, each omap holds a header, and we keep some bucket accounting metadata
+in that header (number of objects, total size, etc.).
+
+Note that we also hold other information in the bucket index, and it's kept in
+other key namespaces. We can hold the bucket index log there, and for versioned
+objects there is more information that we keep on other keys.
+
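+For example, the omap of an index object can be inspected directly with the
+``rados`` tool (a sketch; the pool name and marker are illustrative)::
+
+    rados -p default.rgw.buckets.index listomapkeys .dir.default.7593.4
+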
+Data
+^^^^
+
+Object data is kept in one or more RADOS objects for each RGW object.
+
+Object Lookup Path
+------------------
+
+When accessing objects, REST APIs come to RGW with three parameters:
+account information (access key in S3 or account name in Swift),
+bucket or container name, and object name (or key). At present, RGW only
+uses account information to find out the user ID and for access control.
+Only the bucket name and object key are used to address the object in a pool.
+
+The user ID in RGW is a string, typically the actual user name from the user
+credentials and not a hashed or mapped identifier.
+
+When accessing a user's data, the user record is loaded from an object
+"<user_id>" in pool "default.rgw.meta" with namespace "users.uid".
+
+Bucket names are represented in the pool "default.rgw.meta" with namespace
+"root". The bucket record is loaded in order to obtain the so-called marker,
+which serves as a bucket ID.
+
+The object is located in pool "default.rgw.buckets.data".
+The object name is "<marker>_<key>",
+for example "default.7593.4_image.png", where the marker is "default.7593.4"
+and the key is "image.png". Since these concatenated names are not parsed and
+are only passed down to RADOS, the choice of the separator is not important and
+causes no ambiguity. For the same reason, slashes are permitted in object
+names (keys).
+
+It is also possible to create multiple data pools and make it so that
+different users' buckets will be created in different RADOS pools by default,
+thus providing the necessary scaling. The layout and naming of these pools
+is controlled by a 'policy' setting.[3]
+
+An RGW object may consist of several RADOS objects, the first of which
+is the head that contains the metadata, such as manifest, ACLs, content type,
+ETag, and user-defined metadata. The metadata is stored in xattrs.
+The head may also contain up to :confval:`rgw_max_chunk_size` of object data, for efficiency
+and atomicity. The manifest describes how each object is laid out in RADOS
+objects.
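+
+For example, the head object's manifest and attributes can be dumped with (a
+sketch; bucket and object names are illustrative)::
+
+    radosgw-admin object stat --bucket=mybucket --object=image.png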
+
+Bucket and Object Listing
+-------------------------
+
+Buckets that belong to a given user are listed in an omap of an object named
+"<user_id>.buckets" (for example, "foo.buckets") in pool "default.rgw.meta"
+with namespace "users.uid".
+These objects are accessed when listing buckets, when updating bucket
+contents, and updating and retrieving bucket statistics (e.g. for quota).
+
+See the user-visible, encoded class 'cls_user_bucket_entry' and its
+nested class 'cls_user_bucket' for the values of these omap entries.
+
+These listings are kept consistent with buckets in pool ".rgw".
+
+Objects that belong to a given bucket are listed in a bucket index,
+as discussed in sub-section 'Bucket Index' above. The default naming
+for index objects is ".dir.<marker>" in pool "default.rgw.buckets.index".
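+
+For example, a user's bucket list omap can be inspected in the same way (a
+sketch; the user name is illustrative)::
+
+    rados -p default.rgw.meta -N users.uid listomapkeys foo.buckets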
+
+Footnotes
+---------
+
+[1] Omap is a key-value store, associated with an object, in a way similar
+to how Extended Attributes associate with a POSIX file. An object's omap
+is not physically located in the object's storage, but its precise
+implementation is invisible and immaterial to RADOS Gateway.
+In Hammer, LevelDB is used to store omap data within each OSD; later releases
+default to RocksDB but can be configured to use LevelDB.
+
+[2] Before the Dumpling release, the 'bucket.instance' metadata did not
+exist and the 'bucket' metadata contained its information. It is possible
+to encounter such buckets in old installations.
+
+[3] Pool names changed with the Infernalis release.
+If you are looking at an older setup, some details may be different. In
+particular there was a different pool for each of the namespaces that are
+now being used inside the ``default.rgw.meta`` pool.
+
+Appendix: Compendium
+--------------------
+
+Known pools:
+
+.rgw.root
+ Unspecified region, zone, and global information records, one per object.
+
+<zone>.rgw.control
+ notify.<N>
+
+<zone>.rgw.meta
+ Multiple namespaces with different kinds of metadata:
+
+ namespace: root
+ <bucket>
+ .bucket.meta.<bucket>:<marker> # see put_bucket_instance_info()
+
+ The tenant is used to disambiguate buckets, but not bucket instances.
+ Example::
+
+ .bucket.meta.prodtx:test%25star:default.84099.6
+ .bucket.meta.testcont:default.4126.1
+ .bucket.meta.prodtx:testcont:default.84099.4
+ prodtx/testcont
+ prodtx/test%25star
+ testcont
+
+ namespace: users.uid
+ Contains _both_ per-user information (RGWUserInfo) in "<user>" objects
+ and per-user lists of buckets in omaps of "<user>.buckets" objects.
+ The "<user>" may contain the tenant if non-empty, for example::
+
+ prodtx$prodt
+ test2.buckets
+ prodtx$prodt.buckets
+ test2
+
+ namespace: users.email
+ Unimportant
+
+ namespace: users.keys
+ 47UA98JSTJZ9YAN3OS3O
+
+ This allows ``radosgw`` to look up users by their access keys during authentication.
+
+ namespace: users.swift
+ test:tester
+
+<zone>.rgw.buckets.index
+ Objects are named ".dir.<marker>", each contains a bucket index.
+ If the index is sharded, each shard appends the shard index after
+ the marker.
+
+<zone>.rgw.buckets.data
+ default.7593.4__shadow_.488urDFerTYXavx4yAd-Op8mxehnvTI_1
+ <marker>_<key>
+
+An example of a marker would be "default.16004.1" or "default.7593.4".
+The current format is "<zone>.<instance_id>.<bucket_id>". But once
+generated, a marker is not parsed again, so its format may change
+freely in the future.
diff --git a/doc/radosgw/ldap-auth.rst b/doc/radosgw/ldap-auth.rst
new file mode 100644
index 000000000..486d0c623
--- /dev/null
+++ b/doc/radosgw/ldap-auth.rst
@@ -0,0 +1,167 @@
+===================
+LDAP Authentication
+===================
+
+.. versionadded:: Jewel
+
+You can delegate the Ceph Object Gateway authentication to an LDAP server.
+
+How it works
+============
+
+The Ceph Object Gateway extracts the user's LDAP credentials from a token. A
+search filter is constructed with the user name. The Ceph Object Gateway uses
+the configured service account to search the directory for a matching entry. If
+an entry is found, the Ceph Object Gateway attempts to bind to the found
+distinguished name with the password from the token. If the credentials are
+valid, the bind will succeed, the Ceph Object Gateway will grant access, and a
+radosgw user will be created with the provided username.
+
+You can limit the allowed users by setting the base for the search to a
+specific organizational unit or by specifying a custom search filter, for
+example requiring specific group membership, custom object classes, or
+attributes.
+
+The LDAP credentials must be available on the server to perform the LDAP
+authentication. Make sure to set the ``rgw`` log level low enough to hide the
+base-64-encoded credentials / access tokens.
+
+Requirements
+============
+
+- **LDAP or Active Directory:** A running LDAP instance accessible by the Ceph
+ Object Gateway
+- **Service account:** LDAP credentials to be used by the Ceph Object Gateway
+ with search permissions
+- **User account:** At least one user account in the LDAP directory
+- **Do not overlap LDAP and local users:** You should not use the same user
+ names for local users and for users being authenticated by using LDAP. The
+ Ceph Object Gateway cannot distinguish them and it treats them as the same
+ user.
+
+Sanity checks
+=============
+
+Use the ``ldapsearch`` utility to verify the service account or the LDAP connection:
+
+::
+
+ # ldapsearch -x -D "uid=ceph,ou=system,dc=example,dc=com" -W \
+ -H ldaps://example.com -b "ou=users,dc=example,dc=com" 'uid=*' dn
+
+.. note:: Make sure to use the same LDAP parameters as in the Ceph configuration
+ file to eliminate possible problems.
+
+Configuring the Ceph Object Gateway to use LDAP authentication
+==============================================================
+
+The following parameters in the Ceph configuration file are related to LDAP
+authentication (a combined example follows the list):
+
+- ``rgw_s3_auth_use_ldap``: Set this to ``true`` to enable S3 authentication with LDAP
+- ``rgw_ldap_uri``: Specifies the LDAP server to use. Make sure to use the
+ ``ldaps://<fqdn>:<port>`` parameter to not transmit clear text credentials
+ over the wire.
+- ``rgw_ldap_binddn``: The Distinguished Name (DN) of the service account used
+ by the Ceph Object Gateway
+- ``rgw_ldap_secret``: Path to file containing credentials for ``rgw_ldap_binddn``
+- ``rgw_ldap_searchdn``: Specifies the base in the directory information tree
+ for searching users. This might be your users' organizational unit or some
+ more specific Organizational Unit (OU).
+- ``rgw_ldap_dnattr``: The attribute being used in the constructed search
+ filter to match a username. Depending on your Directory Information Tree
+ (DIT) this would probably be ``uid`` or ``cn``. The generated filter string
+ will be, e.g., ``cn=some_username``.
+- ``rgw_ldap_searchfilter``: If not specified, the Ceph Object Gateway
+ automatically constructs the search filter with the ``rgw_ldap_dnattr``
+ setting. Use this parameter to narrow the list of allowed users in very
+ flexible ways. Consult the *Using a custom search filter to limit user
+ access* section for details.
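+
+A combined sketch of these settings in ``ceph.conf`` (all values are
+illustrative assumptions)::
+
+    [client.radosgw.gateway]
+    rgw_s3_auth_use_ldap = true
+    rgw_ldap_uri = ldaps://ldap.example.com:636
+    rgw_ldap_binddn = "uid=ceph,ou=system,dc=example,dc=com"
+    rgw_ldap_secret = /etc/ceph/ldap-bindpw
+    rgw_ldap_searchdn = "ou=users,dc=example,dc=com"
+    rgw_ldap_dnattr = uid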
+
+Using a custom search filter to limit user access
+=================================================
+
+There are two ways to use the ``rgw_ldap_searchfilter`` parameter:
+
+Specifying a partial filter to further limit the constructed search filter
+--------------------------------------------------------------------------
+
+An example for a partial filter:
+
+::
+
+ "objectclass=inetorgperson"
+
+The Ceph Object Gateway will generate the search filter as usual with the
+user name from the token and the value of ``rgw_ldap_dnattr``. The constructed
+filter is then combined with the partial filter from the ``rgw_ldap_searchfilter``
+setting. Depending on the user name and the settings, the final search filter
+might become:
+
+::
+
+ "(&(uid=hari)(objectclass=inetorgperson))"
+
+So user ``hari`` will only be granted access if he is found in the LDAP
+directory, has an object class of ``inetorgperson``, and specified a valid
+password.
+
+Specifying a complete filter
+----------------------------
+
+A complete filter must contain a ``@USERNAME@`` token, which will be substituted
+with the user name during the authentication attempt. The ``rgw_ldap_dnattr``
+parameter is not used in this case. For example, to limit valid users
+to a specific group, use the following filter:
+
+::
+
+ "(&(uid=@USERNAME@)(memberOf=cn=ceph-users,ou=groups,dc=mycompany,dc=com))"
+
+.. note:: Using the ``memberOf`` attribute in LDAP searches requires server-side
+ support from your specific LDAP server implementation.
+
+Generating an access token for LDAP authentication
+==================================================
+
+The ``radosgw-token`` utility generates the access token based on the LDAP
+user name and password. It outputs a base-64-encoded string, which is the
+access token.
+
+::
+
+ # export RGW_ACCESS_KEY_ID="<username>"
+ # export RGW_SECRET_ACCESS_KEY="<password>"
+ # radosgw-token --encode
+
+.. important:: The access token is a base-64 encoded JSON struct and contains
+ the LDAP credentials in clear text.
+
+Alternatively, users can also generate the token manually by base-64-encoding
+this JSON snippet, if they do not have the ``radosgw-token`` tool installed.
+
+::
+
+ {
+ "RGW_TOKEN": {
+ "version": 1,
+ "type": "ldap",
+ "id": "your_username",
+ "key": "your_clear_text_password_here"
+ }
+ }
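+
+For example, assuming the snippet above is saved as ``token.json`` (a
+hypothetical file name), it can be encoded with coreutils::
+
+    base64 -w0 token.json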
+
+Using the access token
+======================
+
+Use your favorite S3 client and specify the token as the access key in your
+client or environment variables.
+
+::
+
+ # export AWS_ACCESS_KEY_ID=<base64-encoded token generated by radosgw-token>
+ # export AWS_SECRET_ACCESS_KEY="" # define this with an empty string, otherwise tools might complain about missing env variables.
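+ # aws --endpoint=http://radosgw:8000 s3 ls # a sketch: list buckets; the endpoint is an illustrative assumption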
+
+.. important:: The access token is a base-64 encoded JSON struct and contains
+ the LDAP credentials in clear text. DO NOT share it unless
+ you want to share your clear text password!
diff --git a/doc/radosgw/lua-scripting.rst b/doc/radosgw/lua-scripting.rst
new file mode 100644
index 000000000..c85f72a6e
--- /dev/null
+++ b/doc/radosgw/lua-scripting.rst
@@ -0,0 +1,570 @@
+=============
+Lua Scripting
+=============
+
+.. versionadded:: Pacific
+
+.. contents::
+
+This feature allows users to assign execution context to Lua scripts. The supported contexts are:
+
+ - ``prerequest`` which will execute a script before each operation is performed
+ - ``postrequest`` which will execute after each operation is performed
+ - ``background`` which will execute within a specified time interval
+ - ``getdata`` which will execute on objects' data when objects are downloaded
+ - ``putdata`` which will execute on objects' data when objects are uploaded
+
+A request (pre or post) or data (get or put) context script may be constrained to operations belonging to a specific tenant's users.
+The request context script can also access fields in the request and modify certain fields, as well as the `Global RGW Table`_.
+The data context script can access the content of the object as well as the request fields and the `Global RGW Table`_.
+All Lua language features can be used in all contexts.
+
+By default, all Lua standard libraries are available in the script. However, in order to allow other Lua modules to be used in the script, we support adding packages to an allowlist:
+
+ - All packages in the allowlist are re-installed using the luarocks package manager on radosgw restart. Therefore a restart is needed for the addition or removal of packages to take effect
+ - To add a package that contains C source code that needs to be compiled, use the ``--allow-compilation`` flag. In this case a C compiler needs to be available on the host
+ - Lua packages are installed in, and used from, a directory local to the radosgw. This means that Lua packages in the allowlist are separated from any Lua packages available on the host.
+ By default, this directory is ``/tmp/luarocks/<entity name>``. Its prefix part (``/tmp/luarocks/``) can be set to a different location via the ``rgw_luarocks_location`` configuration parameter.
+ Note that this parameter should not be set to one of the default locations where luarocks installs packages (e.g. ``$HOME/.luarocks``, ``/usr/lib64/lua``, ``/usr/share/lua``).
+
+
+.. toctree::
+ :maxdepth: 1
+
+
+Script Management via CLI
+-------------------------
+
+To upload a script:
+
+
+::
+
+ # radosgw-admin script put --infile={lua-file-path} --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]
+
+
+* When uploading a script with the ``background`` context, a tenant name should not be specified.
+* When uploading a script into a cluster deployed with cephadm, use the following command:
+
+::
+
+ # cephadm shell radosgw-admin script put --infile=/rootfs/{lua-file-path} --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]
+
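+For example, a minimal end-to-end sketch (the script file name and its content
+are illustrative assumptions; ``RGWDebugLog`` is described below)::
+
+    # echo 'RGWDebugLog("hello from lua")' > /tmp/hello.lua
+    # radosgw-admin script put --infile=/tmp/hello.lua --context=prerequest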
+
+To print the content of the script to standard output:
+
+::
+
+ # radosgw-admin script get --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]
+
+
+To remove the script:
+
+::
+
+ # radosgw-admin script rm --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]
+
+
+Package Management via CLI
+--------------------------
+
+To add a package to the allowlist:
+
+::
+
+ # radosgw-admin script-package add --package={package name} [--allow-compilation]
+
+
+To add a specific version of a package to the allowlist:
+
+::
+
+ # radosgw-admin script-package add --package='{package name} {package version}' [--allow-compilation]
+
+
+* When adding a different version of a package which already exists in the list, the newly
+ added version will override the existing one.
+
+* When adding a package without a version specified, the latest version of the package
+ will be added.
+
+
+To remove a package from the allowlist:
+
+::
+
+ # radosgw-admin script-package rm --package={package name}
+
+
+To remove a specific version of a package from the allowlist:
+
+::
+
+ # radosgw-admin script-package rm --package='{package name} {package version}'
+
+
+* When removing a package without a version specified, any existing versions of the
+ package will be removed.
+
+
+To print the list of packages in the allowlist:
+
+::
+
+ # radosgw-admin script-package list
+
+
+Context Free Functions
+----------------------
+Debug Log
+~~~~~~~~~
+The ``RGWDebugLog()`` function accepts a string and prints it to the debug log with priority 20.
+Each log message is prefixed with ``Lua INFO:``. This function has no return value.
+
+Request Fields
+-----------------
+
+.. warning:: This feature is experimental. Fields may be removed or renamed in the future.
+
+.. note::
+
+ - Although Lua is a case-sensitive language, field names provided by the radosgw are case-insensitive. Function names remain case-sensitive.
+ - Fields marked "optional" can have a nil value.
+ - Fields marked as "iterable" can be used by the pairs() function and with the # length operator.
+ - All table fields can be used with the bracket operator ``[]``.
+ - ``time`` fields are strings with the following format: ``%Y-%m-%d %H:%M:%S``.
+
+
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| Field | Type | Description | Iterable | Writeable | Optional |
++====================================================+==========+==============================================================+==========+===========+==========+
+| ``Request.RGWOp`` | string | radosgw operation | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.DecodedURI`` | string | decoded URI | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ContentLength`` | integer | size of the request | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.GenericAttributes`` | table | string to string generic attributes map | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Response`` | table | response to the request | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Response.HTTPStatusCode`` | integer | HTTP status code | no | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Response.HTTPStatus`` | string | HTTP status text | no | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Response.RGWCode`` | integer | radosgw error code | no | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Response.Message`` | string | response message | no | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.SwiftAccountName`` | string | swift account name | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket`` | table | info on the bucket | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Tenant`` | string | tenant of the bucket | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Name`` | string | bucket name (writeable only in ``prerequest`` context) | no | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Marker`` | string | bucket marker (initial id) | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Id`` | string | bucket id | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Count`` | integer | number of objects in the bucket | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Size`` | integer | total size of objects in the bucket | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.ZoneGroupId`` | string | zone group of the bucket | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.CreationTime`` | time | creation time of the bucket | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.MTime`` | time | modification time of the bucket | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Quota`` | table | bucket quota | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Quota.MaxSize`` | integer | bucket quota max size | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Quota.MaxObjects`` | integer | bucket quota max number of objects | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Quota.Enabled``                  | boolean  | bucket quota is enabled                                      | no       | no        | no       |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.Quota.Rounded`` | boolean | bucket quota is rounded to 4K | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.PlacementRule`` | table | bucket placement rule | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.PlacementRule.Name`` | string | bucket placement rule name | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.PlacementRule.StorageClass`` | string | bucket placement rule storage class | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.User`` | table | bucket owner | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.User.Tenant`` | string | bucket owner tenant | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Bucket.User.Id`` | string | bucket owner id | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Object`` | table | info on the object | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Object.Name`` | string | object name | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Object.Instance`` | string | object version | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Object.Id`` | string | object id | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Object.Size`` | integer | object size | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Object.MTime`` | time | object mtime | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.CopyFrom`` | table | information on copy operation | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.CopyFrom.Tenant`` | string | tenant of the object copied from | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.CopyFrom.Bucket`` | string | bucket of the object copied from | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.CopyFrom.Object`` | table | object copied from. See: ``Request.Object`` | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ObjectOwner`` | table | object owner | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ObjectOwner.DisplayName`` | string | object owner display name | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ObjectOwner.User`` | table | object user. See: ``Request.Bucket.User`` | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ZoneGroup.Name`` | string | name of zone group | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ZoneGroup.Endpoint`` | string | endpoint of zone group | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl`` | table | user ACL | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Owner`` | table | user ACL owner. See: ``Request.ObjectOwner`` | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants`` | table | user ACL map of string to grant | yes | no | no |
+| | | note: grants without an Id are not presented when iterated | | | |
+| | | and only one of them can be accessed via brackets | | | |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"]`` | table | user ACL grant | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"].Type`` | integer | user ACL grant type | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"].User`` | table | user ACL grant user | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"].User.Tenant`` | table | user ACL grant user tenant | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"].User.Id`` | table | user ACL grant user id | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"].GroupType`` | integer | user ACL grant group type | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserAcl.Grants["<name>"].Referer`` | string | user ACL grant referer | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.BucketAcl`` | table | bucket ACL. See: ``Request.UserAcl`` | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.ObjectAcl`` | table | object ACL. See: ``Request.UserAcl`` | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Environment`` | table | string to string environment map | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Policy`` | table | policy | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Policy.Text`` | string | policy text | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Policy.Id`` | string | policy Id | no | no | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Policy.Statements`` | table | list of string statements | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserPolicies`` | table | list of user policies | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.UserPolicies[<index>]`` | table | user policy. See: ``Request.Policy`` | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.RGWId`` | string | radosgw host id: ``<host>-<zone>-<zonegroup>`` | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP`` | table | HTTP header | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.Parameters`` | table | string to string parameter map | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.Resources`` | table | string to string resource map | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.Metadata`` | table | string to string metadata map | yes | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.StorageClass`` | string | storage class | no | yes | yes |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.Host`` | string | host name | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.Method`` | string | HTTP method | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.URI`` | string | URI | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.QueryString`` | string | HTTP query string | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.HTTP.Domain`` | string | domain name | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Time`` | time | request time | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Dialect`` | string | "S3" or "Swift" | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Id`` | string | request Id | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.TransactionId`` | string | transaction Id | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Tags`` | table | object tags map | yes | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.User`` | table | user that triggered the request | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.User.Tenant`` | string | triggering user tenant | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.User.Id`` | string | triggering user id | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Trace`` | table | info on trace | no | no | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+| ``Request.Trace.Enable`` | boolean | tracing is enabled | no | yes | no |
++----------------------------------------------------+----------+--------------------------------------------------------------+----------+-----------+----------+
+
+Request Functions
+--------------------
+Operations Log
+~~~~~~~~~~~~~~
+The ``Request.Log()`` function writes the request to the operations log. This function takes no parameters. It returns 0 on success and an error code on failure.
+
+Tracing
+~~~~~~~
+Tracing functions can be used only in the ``postrequest`` context.
+
+- ``Request.Trace.SetAttribute(<key>, <value>)`` - sets the attribute for the request's trace.
+ The function takes two arguments: the first is the ``key``, which should be a string, and the second is the ``value``, which can either be a string or a number (integer or double).
+ You may then locate specific traces by using this attribute.
+
+- ``Request.Trace.AddEvent(<name>, <attributes>)`` - adds an event to the first span of the request's trace.
+ An event is defined by an event name, an event time, and zero or more event attributes.
+ The function accepts one or two arguments: a string containing the event ``name`` should be the first argument, followed by the event ``attributes``, which may be omitted for events without attributes.
+ An event's attributes must be a table of strings.
+
+Background Context
+--------------------
+The ``background`` context may be used for purposes such as analytics, monitoring, or caching data for use by executions in other contexts (see the sketch below).
+
+- The default interval between background script executions is 5 seconds.
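+
+For example, a ``background`` script may report state that request-context scripts maintain in the global ``RGW`` table (described below). A minimal sketch, in which the ``"puts"`` key is purely illustrative:
+
+.. code-block:: lua
+
+ -- runs periodically (every 5 seconds by default)
+ local count = RGW["puts"] or 0
+ RGWDebugLog("PUT requests observed so far: " .. tostring(count))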
+
+Data Context
+--------------------
+Both the ``getdata`` and ``putdata`` contexts have the following fields:
+
+- ``Data``, which is read-only and iterable (byte by byte). If an object is uploaded or retrieved in multiple chunks, the ``Data`` field holds the data of one chunk at a time, as shown in the sketch below this list.
+- ``Offset``, which holds the offset of the chunk within the entire object.
+- The ``Request`` fields and the background ``RGW`` table are also available in these contexts.
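+
+A minimal ``putdata`` sketch that logs each chunk as it is written (a fuller sample that computes per-chunk entropy appears in the code samples below):
+
+.. code-block:: lua
+
+ RGWDebugLog("chunk at offset " .. tostring(Offset) .. " holds " .. #Data .. " bytes")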
+
+Global RGW Table
+--------------------
+The ``RGW`` Lua table is accessible from all contexts. Data written to it
+during one execution may be read and used during later executions, whether from the same context or a different one.
+
+- Each RGW instance has its own private and ephemeral ``RGW`` Lua table that is lost when the daemon restarts. Note that ``background`` context scripts will run on every instance.
+- The maximum number of entries in the table is 100,000. Each entry has a string key and a value with a combined length of no more than 1 KB.
+ A Lua script will abort with an error if the number of entries or an entry's size exceeds these limits.
+- The ``RGW`` Lua table uses string indices and can store values of type string, integer, double, and boolean.
+
+Increment/Decrement Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Since entries in the ``RGW`` table may be accessed from multiple places at the same time, we need a way
+to atomically increment and decrement numeric values in it. For that, the following functions should be used (see the sketch below the list):
+
+- ``RGW.increment(<key>, [value])`` increments the value of ``key`` by ``value`` if a value is provided, or by 1 otherwise
+- ``RGW.decrement(<key>, [value])`` decrements the value of ``key`` by ``value`` if a value is provided, or by 1 otherwise
+- if the value of ``key`` is not numeric, the execution of the script will fail
+- if we try to increment or decrement by a non-numeric value, the execution of the script will fail
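+
+A minimal sketch that counts object uploads from a ``prerequest`` script; the ``"puts"`` key is purely illustrative and matches the background sketch above:
+
+.. code-block:: lua
+
+ if Request.RGWOp == "put_obj" then
+   if RGW["puts"] == nil then
+     RGW["puts"] = 0   -- illustrative init; the check-then-set itself is not atomic
+   end
+   RGW.increment("puts")
+ end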
+
+
+Lua Code Samples
+----------------
+- Print information on source and destination objects in case of copy:
+
+.. code-block:: lua
+
+ function print_object(object)
+ RGWDebugLog(" Name: " .. object.Name)
+ RGWDebugLog(" Instance: " .. object.Instance)
+ RGWDebugLog(" Id: " .. object.Id)
+ RGWDebugLog(" Size: " .. object.Size)
+ RGWDebugLog(" MTime: " .. object.MTime)
+ end
+
+ if Request.CopyFrom and Request.Object and Request.CopyFrom.Object then
+ RGWDebugLog("copy from object:")
+ print_object(Request.CopyFrom.Object)
+ RGWDebugLog("to object:")
+ print_object(Request.Object)
+ end
+
+- Print ACLs via a "generic function":
+
+.. code-block:: lua
+
+ function print_owner(owner)
+ RGWDebugLog("Owner:")
+ RGWDebugLog(" Display Name: " .. owner.DisplayName)
+ RGWDebugLog(" Id: " .. owner.User.Id)
+ RGWDebugLog(" Tenant: " .. owner.User.Tenant)
+ end
+
+ function print_acl(acl_type)
+ index = acl_type .. "ACL"
+ acl = Request[index]
+ if acl then
+ RGWDebugLog(acl_type .. "ACL Owner")
+ print_owner(acl.Owner)
+ RGWDebugLog(" there are " .. #acl.Grants .. " grant for owner")
+ for k,v in pairs(acl.Grants) do
+ RGWDebugLog(" Grant Key: " .. k)
+ RGWDebugLog(" Grant Type: " .. v.Type)
+ RGWDebugLog(" Grant Group Type: " .. v.GroupType)
+ RGWDebugLog(" Grant Referer: " .. v.Referer)
+ RGWDebugLog(" Grant User Tenant: " .. v.User.Tenant)
+ RGWDebugLog(" Grant User Id: " .. v.User.Id)
+ end
+ else
+ RGWDebugLog("no " .. acl_type .. " ACL in request: " .. Request.Id)
+ end
+ end
+
+ print_acl("User")
+ print_acl("Bucket")
+ print_acl("Object")
+
+- Use of operations log only in case of errors:
+
+.. code-block:: lua
+
+ if Request.Response.HTTPStatusCode ~= 200 then
+ RGWDebugLog("request is bad, use ops log")
+ rc = Request.Log()
+ RGWDebugLog("ops log return code: " .. rc)
+ end
+
+- Set values into the error message:
+
+.. code-block:: lua
+
+ if Request.Response.HTTPStatusCode == 500 then
+ Request.Response.Message = "<Message> something bad happened :-( </Message>"
+ end
+
+- Add metadata to objects beyond what was originally sent by the client:
+
+In the ``prerequest`` context we should add:
+
+.. code-block:: lua
+
+ if Request.RGWOp == 'put_obj' then
+ Request.HTTP.Metadata["x-amz-meta-mydata"] = "my value"
+ end
+
+In the ``postrequest`` context we look at the metadata:
+
+.. code-block:: lua
+
+ RGWDebugLog("number of metadata entries is: " .. #Request.HTTP.Metadata)
+ for k, v in pairs(Request.HTTP.Metadata) do
+ RGWDebugLog("key=" .. k .. ", " .. "value=" .. v)
+ end
+
+- Use modules to create a Unix-socket-based, JSON-encoded "access log":
+
+First we should add the following packages to the allowlist:
+
+::
+
+ # radosgw-admin script-package add --package=luajson
+ # radosgw-admin script-package add --package=luasocket --allow-compilation
+
+
+Then restart the radosgw and upload the following script to the ``postrequest`` context:
+
+.. code-block:: lua
+
+ if Request.RGWOp == "get_obj" then
+ local json = require("json")
+ local socket = require("socket")
+ local unix = require("socket.unix")
+ local s = assert(unix())
+ E = {}
+
+ msg = {bucket = (Request.Bucket or (Request.CopyFrom or E).Bucket).Name,
+ time = Request.Time,
+ operation = Request.RGWOp,
+ http_status = Request.Response.HTTPStatusCode,
+ error_code = Request.Response.HTTPStatus,
+ object_size = Request.Object.Size,
+ trans_id = Request.TransactionId}
+
+ assert(s:connect("/tmp/socket"))
+ assert(s:send(json.encode(msg).."\n"))
+ assert(s:close())
+ end
+
+
+- Trace only requests of a specific bucket
+
+Tracing is disabled by default, so we should enable tracing for this specific bucket:
+
+.. code-block:: lua
+
+ if Request.Bucket.Name == "my-bucket" then
+ Request.Trace.Enable = true
+ end
+
+
+If `tracing is enabled <https://docs.ceph.com/en/latest/jaegertracing/#how-to-enable-tracing-in-ceph/>`_ on the RGW, the value of ``Request.Trace.Enable`` is true, so we should disable tracing for all other requests that do not match the bucket name.
+In the ``prerequest`` context:
+
+.. code-block:: lua
+
+ if Request.Bucket.Name ~= "my-bucket" then
+ Request.Trace.Enable = false
+ end
+
+Note that changing ``Request.Trace.Enable`` does not change the tracer's state; it disables or enables tracing for the current request only.
+
+
+- Add information to request traces
+
+In the ``postrequest`` context, we can add attributes and events to the request's trace.
+
+.. code-block:: lua
+
+ Request.Trace.AddEvent("lua script execution started")
+
+ Request.Trace.SetAttribute("HTTPStatusCode", Request.Response.HTTPStatusCode)
+
+ event_attrs = {}
+ for k,v in pairs(Request.GenericAttributes) do
+ event_attrs[k] = v
+ end
+
+ Request.Trace.AddEvent("second event", event_attrs)
+
+- The entropy value of an object can be used to detect whether the object is encrypted.
+ The following script calculates the entropy and size of uploaded objects and prints them to the debug log.
+
+In the ``putdata`` context, add the following script:
+
+.. code-block:: lua
+
+ function object_entropy()
+ local byte_hist = {}
+ local byte_hist_size = 256
+ for i = 1,byte_hist_size do
+ byte_hist[i] = 0
+ end
+ local total = 0
+
+ for i, c in pairs(Data) do
+ local byte = c:byte() + 1
+ byte_hist[byte] = byte_hist[byte] + 1
+ total = total + 1
+ end
+
+ entropy = 0
+
+ for _, count in ipairs(byte_hist) do
+ if count ~= 0 then
+ local p = 1.0 * count / total
+ entropy = entropy - (p * math.log(p)/math.log(byte_hist_size))
+ end
+ end
+
+ return entropy
+ end
+
+ local full_name = Request.Bucket.Name.."\\"..Request.Object.Name
+ RGWDebugLog("entropy of chunk of: " .. full_name .. " at offset:" .. tostring(Offset) .. " is: " .. tostring(object_entropy()))
+ RGWDebugLog("payload size of chunk of: " .. full_name .. " is: " .. #Data)
+
diff --git a/doc/radosgw/mfa.rst b/doc/radosgw/mfa.rst
new file mode 100644
index 000000000..416f23af1
--- /dev/null
+++ b/doc/radosgw/mfa.rst
@@ -0,0 +1,102 @@
+.. _rgw_mfa:
+
+==========================================
+RGW Support for Multifactor Authentication
+==========================================
+
+.. versionadded:: Mimic
+
+The S3 multifactor authentication (MFA) feature allows
+users to require the use of a one-time password when removing
+objects from certain buckets. The buckets must be configured
+with versioning and MFA enabled, which can be done through
+the S3 API.
+
+Time-based one-time password (TOTP) tokens can be assigned to a user
+through ``radosgw-admin``. Each token has a secret seed and an
+assigned serial ID. Tokens can be added to a user, listed,
+removed, and re-synchronized.
+
+Multisite
+=========
+
+While the MFA IDs are set on the user's metadata, the
+actual MFA one-time password configuration resides in the local zone's
+OSDs. Therefore, in a multi-site environment it is advisable to use
+different tokens for different zones.
+
+
+Terminology
+=============
+
+- ``TOTP``: Time-based One-Time Password
+
+- ``token serial``: a string that represents the ID of a TOTP token
+
+- ``token seed``: the secret seed that is used to calculate the TOTP
+
+- ``totp seconds``: the time resolution that is used for TOTP generation
+
+- ``totp window``: the number of TOTP tokens that are checked before and after the current token when validating a pin
+
+- ``totp pin``: the valid value of a TOTP token at a certain time
+
+
+Admin commands
+==============
+
+Create a new MFA TOTP token
+------------------------------------
+
+::
+
+ # radosgw-admin mfa create --uid=<user-id> \
+ --totp-serial=<serial> \
+ --totp-seed=<seed> \
+ [ --totp-seed-type=<hex|base32> ] \
+ [ --totp-seconds=<num-seconds> ] \
+ [ --totp-window=<twindow> ]
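+
+For example, with an illustrative user ID and a hypothetical base32 seed:
+
+::
+
+ # radosgw-admin mfa create --uid=johndoe --totp-serial=MFAtest \
+ --totp-seed=23456723 --totp-seed-type=base32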
+
+List MFA TOTP tokens
+---------------------
+
+::
+
+ # radosgw-admin mfa list --uid=<user-id>
+
+
+Show MFA TOTP token
+------------------------------------
+
+::
+
+ # radosgw-admin mfa get --uid=<user-id> --totp-serial=<serial>
+
+
+Delete MFA TOTP token
+------------------------
+
+::
+
+ # radosgw-admin mfa remove --uid=<user-id> --totp-serial=<serial>
+
+
+Check MFA TOTP token
+--------------------------------
+
+Test a TOTP token pin. This is needed to validate that the TOTP functions correctly. ::
+
+ # radosgw-admin mfa check --uid=<user-id> --totp-serial=<serial> \
+ --totp-pin=<pin>
+
+
+Re-sync MFA TOTP token
+--------------------------------
+
+Re-synchronize a TOTP token (for example, in case of time skew). This requires
+feeding two consecutive pins: the previous pin and the current pin. ::
+
+ # radosgw-admin mfa resync --uid=<user-id> --totp-serial=<serial> \
+ --totp-pin=<prev-pin> --totp-pin=<current-pin>
+
+
diff --git a/doc/radosgw/multisite-sync-policy.rst b/doc/radosgw/multisite-sync-policy.rst
new file mode 100644
index 000000000..7a6e7105c
--- /dev/null
+++ b/doc/radosgw/multisite-sync-policy.rst
@@ -0,0 +1,716 @@
+.. _radosgw-multisite-sync-policy:
+
+=====================
+Multisite Sync Policy
+=====================
+
+.. versionadded:: Octopus
+
+Multisite bucket-granularity sync policy provides fine-grained control of data movement between buckets in different zones. It extends the zone sync mechanism. Previously, buckets were treated symmetrically: each (data) zone held a mirror of a bucket that was expected to be the same as in all the other zones. With the bucket-granularity sync policy it is possible for buckets to diverge, and a bucket can pull data from other buckets (ones that don't share its name or its ID) in different zones. The sync process therefore no longer assumes that the bucket sync source and the bucket sync destination refer to the same bucket.
+
+The sync policy supersedes the old zonegroup coarse configuration (``sync_from*``). The sync policy can be configured at the zonegroup level (and if it is configured it replaces the old-style configuration), but it can also be configured at the bucket level.
+
+A sync policy defines multiple groups, each of which can contain lists of data-flow configurations as well as lists of pipe configurations. A data flow defines the flow of data between the different zones. It can define a symmetrical data flow, in which multiple zones sync data from each other, or a directional data flow, in which the data moves one way, from one zone to another.
+
+A pipe defines the actual buckets that can use these data flows, and the properties that are associated with them (for example: the source object prefix).
+
+A sync policy group can be in 3 states:
+
++----------------------------+----------------------------------------+
+| Value | Description |
++============================+========================================+
+| ``enabled`` | sync is allowed and enabled |
++----------------------------+----------------------------------------+
+| ``allowed`` | sync is allowed |
++----------------------------+----------------------------------------+
+| ``forbidden`` | sync (as defined by this group) is not |
+| | allowed and can override other groups |
++----------------------------+----------------------------------------+
+
+A policy can be defined at the bucket level. A bucket-level sync policy inherits the data flow of the zonegroup policy, and can define only a subset of what the zonegroup allows.
+
+A wildcard zone and a wildcard bucket parameter in the policy define all relevant zones or all relevant buckets. In the context of a bucket policy, a wildcard bucket means the current bucket instance. A disaster recovery configuration where entire zones are mirrored doesn't require configuring anything on the buckets. However, for fine-grained bucket sync it is better to configure the pipes to be synced by allowing them (``status=allowed``) at the zonegroup level (e.g., using wildcards), and to enable the specific sync only at the bucket level (``status=enabled``). If needed, the policy at the bucket level can limit the data movement to specific relevant zones.
+
+.. important:: Any changes to the zonegroup policy need to be applied on the
+ zonegroup master zone, and require a period update and commit. Changes
+ to a bucket policy need to be applied on the zonegroup master
+ zone. These changes are handled dynamically by RGW.
+
+
+S3 Replication API
+~~~~~~~~~~~~~~~~~~
+
+The S3 bucket replication API has also been implemented, and allows users to create replication rules between different buckets. Note, though, that while the AWS replication feature allows bucket replication within the same zone, RGW does not currently allow it. However, the RGW API also adds a new ``Zone`` array that allows users to select the zones to which the specific bucket will be synced.
+
+
+Sync Policy Control Reference
+=============================
+
+
+Get Sync Policy
+~~~~~~~~~~~~~~~
+
+To retrieve the current zonegroup sync policy, or a specific bucket policy:
+
+::
+
+ # radosgw-admin sync policy get [--bucket=<bucket>]
+
+
+Create Sync Policy Group
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To create a sync policy group:
+
+::
+
+ # radosgw-admin sync group create [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --status=<enabled | allowed | forbidden> \
+
+
+Modify Sync Policy Group
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To modify a sync policy group:
+
+::
+
+ # radosgw-admin sync group modify [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --status=<enabled | allowed | forbidden>
+
+
+Show Sync Policy Group
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To show a sync policy group:
+
+::
+
+ # radosgw-admin sync group get [--bucket=<bucket>] \
+ --group-id=<group-id>
+
+
+Remove Sync Policy Group
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To remove a sync policy group:
+
+::
+
+ # radosgw-admin sync group remove [--bucket=<bucket>] \
+ --group-id=<group-id>
+
+
+
+Create Sync Flow
+~~~~~~~~~~~~~~~~
+
+- To create or update directional sync flow:
+
+::
+
+ # radosgw-admin sync group flow create [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --flow-id=<flow-id> \
+ --flow-type=directional \
+ --source-zone=<source_zone> \
+ --dest-zone=<dest_zone>
+
+
+- To create or update symmetrical sync flow:
+
+::
+
+ # radosgw-admin sync group flow create [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --flow-id=<flow-id> \
+ --flow-type=symmetrical \
+ --zones=<zones>
+
+
+Where zones are a comma-separated list of all the zones that need to be added to the flow.
+
+
+Remove Sync Flow Zones
+~~~~~~~~~~~~~~~~~~~~~~
+
+- To remove directional sync flow:
+
+::
+
+ # radosgw-admin sync group flow remove [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --flow-id=<flow-id> \
+ --flow-type=directional \
+ --source-zone=<source_zone> \
+ --dest-zone=<dest_zone>
+
+
+- To remove specific zones from symmetrical sync flow:
+
+::
+
+ # radosgw-admin sync group flow remove [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --flow-id=<flow-id> \
+ --flow-type=symmetrical \
+ --zones=<zones>
+
+
+Where zones are a comma-separated list of all the zones to remove from the flow.
+
+
+- To remove symmetrical sync flow:
+
+::
+
+ # radosgw-admin sync group flow remove [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --flow-id=<flow-id> \
+ --flow-type=symmetrical
+
+
+Create Sync Pipe
+~~~~~~~~~~~~~~~~
+
+To create a sync group pipe, or to update its parameters:
+
+
+::
+
+ # radosgw-admin sync group pipe create [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --pipe-id=<pipe-id> \
+ --source-zones=<source_zones> \
+ [--source-bucket=<source_buckets>] \
+ [--source-bucket-id=<source_bucket_id>] \
+ --dest-zones=<dest_zones> \
+ [--dest-bucket=<dest_buckets>] \
+ [--dest-bucket-id=<dest_bucket_id>] \
+ [--prefix=<source_prefix>] \
+ [--prefix-rm] \
+ [--tags-add=<tags>] \
+ [--tags-rm=<tags>] \
+ [--dest-owner=<owner>] \
+ [--storage-class=<storage_class>] \
+ [--mode=<system | user>] \
+ [--uid=<user_id>]
+
+
+- Zones are either a list of zones or ``'*'`` (wildcard). A wildcard zone means any zone that matches the sync flow rules.
+- Buckets are either a bucket name or ``'*'`` (wildcard). A wildcard bucket means the current bucket.
+- A prefix can be defined to filter source objects.
+- Tags are passed as a comma-separated list of ``key=value`` pairs.
+- A destination owner can be set to force the owner of the destination objects. If user mode is selected, only the destination bucket owner can be set.
+- A destination storage class can also be configured.
+- A user ID can be set for user mode; it is the user under which the sync operation is executed (for permissions validation).
+
+
+Remove Sync Pipe
+~~~~~~~~~~~~~~~~
+
+To remove specific sync group pipe parameters, or the entire pipe:
+
+
+::
+
+ # radosgw-admin sync group pipe remove [--bucket=<bucket>] \
+ --group-id=<group-id> \
+ --pipe-id=<pipe-id> \
+ [--source-zones=<source_zones>] \
+ [--source-bucket=<source_buckets>] \
+ [--source-bucket-id=<source_bucket_id>] \
+ [--dest-zones=<dest_zones>] \
+ [--dest-bucket=<dest_buckets>] \
+ [--dest-bucket-id=<dest_bucket_id>]
+
+
+Sync Info
+~~~~~~~~~
+
+To get information about the expected sync sources and targets (as defined by the sync policy):
+
+::
+
+ # radosgw-admin sync info [--bucket=<bucket>] \
+ [--effective-zone-name=<zone>]
+
+
+Since a bucket can define a policy that moves data from it to a different bucket in a different zone, when the policy is created we also generate a list of bucket dependencies. These are used as hints when any particular bucket is synced. The fact that a bucket references another bucket does not mean that it actually syncs to or from it, as the data flow might not permit it.
+
+
+Examples
+========
+
+The system in these examples includes 3 zones: ``us-east`` (the master zone), ``us-west``, and ``us-west-2``.
+
+Example 1: Two Zones, Complete Mirror
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is similar to the older (pre-``Octopus``) sync capabilities, but done via the new sync policy engine. Note that changes to the zonegroup sync policy require a period update and commit.
+
+
+::
+
+ [us-east] $ radosgw-admin sync group create --group-id=group1 --status=allowed
+ [us-east] $ radosgw-admin sync group flow create --group-id=group1 \
+ --flow-id=flow-mirror --flow-type=symmetrical \
+ --zones=us-east,us-west
+ [us-east] $ radosgw-admin sync group pipe create --group-id=group1 \
+ --pipe-id=pipe1 --source-zones='*' \
+ --source-bucket='*' --dest-zones='*' \
+ --dest-bucket='*'
+ [us-east] $ radosgw-admin sync group modify --group-id=group1 --status=enabled
+ [us-east] $ radosgw-admin period update --commit
+
+ $ radosgw-admin sync info --bucket=buck
+ {
+ "sources": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-east",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "params": {
+ ...
+ }
+ }
+ ],
+ "dests": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-east",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ }
+ ],
+ ...
+ }
+
+
+Note that the "id" field in the output above reflects the pipe rule
+that generated that entry, a single rule can generate multiple sync
+entries as can be seen in the example.
+
+::
+
+ [us-west] $ radosgw-admin sync info --bucket=buck
+ {
+ "sources": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-east",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ }
+ ],
+ "dests": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-east",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ }
+ ],
+ ...
+ }
+
+
+
+Example 2: Directional, Entire Zone Backup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is also similar to the older sync capabilities. Here we add a third zone, ``us-west-2``, that will be a replica of ``us-west``; data will not be replicated back from it.
+
+::
+
+ [us-east] $ radosgw-admin sync group flow create --group-id=group1 \
+ --flow-id=us-west-backup --flow-type=directional \
+ --source-zone=us-west --dest-zone=us-west-2
+ [us-east] $ radosgw-admin period update --commit
+
+
+Note that ``us-west`` has two dests:
+
+::
+
+ [us-west] $ radosgw-admin sync info --bucket=buck
+ {
+ "sources": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-east",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ }
+ ],
+ "dests": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-east",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ },
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-west-2",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ }
+ ],
+ ...
+ }
+
+
+Whereas ``us-west-2`` has only a source and no destinations:
+
+::
+
+ [us-west-2] $ radosgw-admin sync info --bucket=buck
+ {
+ "sources": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ "dest": {
+ "zone": "us-west-2",
+ "bucket": "buck:115b12b3-....4409.1"
+ },
+ ...
+ }
+ ],
+ "dests": [],
+ ...
+ }
+
+
+
+Example 3: Mirror a Specific Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use the same group configuration, but this time switch it to the ``allowed`` state, which means that sync is allowed but not enabled.
+
+::
+
+ [us-east] $ radosgw-admin sync group modify --group-id=group1 --status=allowed
+ [us-east] $ radosgw-admin period update --commit
+
+
+Now we create a bucket-level policy rule for the existing bucket ``buck2``. Note that the bucket needs to exist before this policy can be set, and that admin commands that modify bucket policies need to run on the master zone; however, they do not require a period update. There is no need to change the data flow, as it is inherited from the zonegroup policy. A bucket policy flow will only be a subset of the flow defined in the zonegroup policy. The same goes for pipes, although a bucket policy can enable pipes that are not enabled (albeit not forbidden) in the zonegroup policy.
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck2 \
+ --group-id=buck2-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck2 \
+ --group-id=buck2-default --pipe-id=pipe1 \
+ --source-zones='*' --dest-zones='*'
+
+
+
+Example 4: Limit Bucket Sync To Specific Zones
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This will only sync ``buck3`` to ``us-east`` (from any zone that the flow allows to sync into ``us-east``).
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck3 \
+ --group-id=buck3-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck3 \
+ --group-id=buck3-default --pipe-id=pipe1 \
+ --source-zones='*' --dest-zones=us-east
+
+
+
+Example 5: Sync From a Different Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Note that bucket sync only works (currently) across zones and not within the same zone.
+
+Set ``buck4`` to pull data from ``buck5``:
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck4 \
+ --group-id=buck4-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck4 \
+ --group-id=buck4-default --pipe-id=pipe1 \
+ --source-zones='*' --source-bucket=buck5 \
+ --dest-zones='*'
+
+
+We can also limit it to specific zones. For example, the following will
+only sync data originating in ``us-west``:
+
+::
+
+ [us-east] $ radosgw-admin sync group pipe modify --bucket=buck4 \
+ --group-id=buck4-default --pipe-id=pipe1 \
+ --source-zones=us-west --source-bucket=buck5 \
+ --dest-zones='*'
+
+
+Checking the sync info for ``buck5`` on ``us-west`` is interesting:
+
+::
+
+ [us-west] $ radosgw-admin sync info --bucket=buck5
+ {
+ "sources": [],
+ "dests": [],
+ "hints": {
+ "sources": [],
+ "dests": [
+ "buck4:115b12b3-....14433.2"
+ ]
+ },
+ "resolved-hints-1": {
+ "sources": [],
+ "dests": [
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck5"
+ },
+ "dest": {
+ "zone": "us-east",
+ "bucket": "buck4:115b12b3-....14433.2"
+ },
+ ...
+ },
+ {
+ "id": "pipe1",
+ "source": {
+ "zone": "us-west",
+ "bucket": "buck5"
+ },
+ "dest": {
+ "zone": "us-west-2",
+ "bucket": "buck4:115b12b3-....14433.2"
+ },
+ ...
+ }
+ ]
+ },
+ "resolved-hints": {
+ "sources": [],
+ "dests": []
+ }
+ }
+
+
+Note that there are resolved hints, which means that the bucket ``buck5`` found out about ``buck4`` syncing from it indirectly, and not from its own policy (the policy for ``buck5`` itself is empty).
+
+
+Example 6: Sync To Different Bucket
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The same mechanism works for configuring data to be synced to a bucket (vs. synced from one, as in the previous example). Note that internally, data is still pulled from the source at the destination zone:
+
+Set ``buck6`` to "push" data to ``buck5``:
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck6 \
+ --group-id=buck6-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck6 \
+ --group-id=buck6-default --pipe-id=pipe1 \
+ --source-zones='*' --source-bucket='*' \
+ --dest-zones='*' --dest-bucket=buck5
+
+
+A wildcard bucket name means the current bucket in the context of bucket sync policy.
+
+Combined with the configuration in Example 5, we can now write data to ``buck6`` on ``us-east``; the data will sync to ``buck5`` on ``us-west``, and from there it will be distributed to ``buck4`` on ``us-east`` and on ``us-west-2``.
+
+Example 7: Source Filters
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sync from ``buck8`` to ``buck9``, but only objects that start with ``foo/``:
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck8 \
+ --group-id=buck8-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck8 \
+ --group-id=buck8-default --pipe-id=pipe-prefix \
+ --prefix=foo/ --source-zones='*' --dest-zones='*' \
+ --dest-bucket=buck9
+
+
+Also sync from ``buck8`` to ``buck9`` any object that has the tags ``color=blue`` or ``color=red``:
+
+::
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck8 \
+ --group-id=buck8-default --pipe-id=pipe-tags \
+ --tags-add=color=blue,color=red --source-zones='*' \
+ --dest-zones='*' --dest-bucket=buck9
+
+
+And we can check the expected sync in ``us-east`` (for example):
+
+::
+
+ [us-east] $ radosgw-admin sync info --bucket=buck8
+ {
+ "sources": [],
+ "dests": [
+ {
+ "id": "pipe-prefix",
+ "source": {
+ "zone": "us-east",
+ "bucket": "buck8:115b12b3-....14433.5"
+ },
+ "dest": {
+ "zone": "us-west",
+ "bucket": "buck9"
+ },
+ "params": {
+ "source": {
+ "filter": {
+ "prefix": "foo/",
+ "tags": []
+ }
+ },
+ ...
+ }
+ },
+ {
+ "id": "pipe-tags",
+ "source": {
+ "zone": "us-east",
+ "bucket": "buck8:115b12b3-....14433.5"
+ },
+ "dest": {
+ "zone": "us-west",
+ "bucket": "buck9"
+ },
+ "params": {
+ "source": {
+ "filter": {
+ "tags": [
+ {
+ "key": "color",
+ "value": "blue"
+ },
+ {
+ "key": "color",
+ "value": "red"
+ }
+ ]
+ }
+ },
+ ...
+ }
+ }
+ ],
+ ...
+ }
+
+
+Note that there aren't any sources, only two different destinations (one for each configuration). When the sync process runs, it selects the relevant rule for each object it syncs.
+
+Prefixes and tags can be combined, in which case an object will need to match both in order to be synced. A priority parameter can also be passed; when multiple different rules match (and have the same source and destination), it determines which of the rules is used.
+
+
+Example 8: Destination Params: Storage Class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The storage class of the destination objects can be configured:
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck10 \
+ --group-id=buck10-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck10 \
+ --group-id=buck10-default \
+ --pipe-id=pipe-storage-class \
+ --source-zones='*' --dest-zones=us-west-2 \
+ --storage-class=CHEAP_AND_SLOW
+
+
+Example 9: Destination Params: Destination Owner Translation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Set the owner of the destination objects to the owner of the destination bucket.
+This requires specifying the uid of the destination bucket's owner:
+
+::
+
+ [us-east] $ radosgw-admin sync group create --bucket=buck11 \
+ --group-id=buck11-default --status=enabled
+
+ [us-east] $ radosgw-admin sync group pipe create --bucket=buck11 \
+ --group-id=buck11-default --pipe-id=pipe-dest-owner \
+ --source-zones='*' --dest-zones='*' \
+ --dest-bucket=buck12 --dest-owner=joe
+
+Example 10: Destination Params: User Mode
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+User mode ensures that the user has permissions both to read the objects and to write to the destination bucket. This requires specifying the uid of the user in whose context the sync operation executes.
+
+::
+
+ [us-east] $ radosgw-admin sync group pipe modify --bucket=buck11 \
+ --group-id=buck11-default --pipe-id=pipe-dest-owner \
+ --mode=user --uid=jenny
+
+
+
diff --git a/doc/radosgw/multisite.rst b/doc/radosgw/multisite.rst
new file mode 100644
index 000000000..c7627371d
--- /dev/null
+++ b/doc/radosgw/multisite.rst
@@ -0,0 +1,1690 @@
+.. _multisite:
+
+==========
+Multi-Site
+==========
+
+Single-zone Configurations and Multi-site Configurations
+========================================================
+
+Single-zone Configurations
+--------------------------
+
+A single-zone configuration typically consists of two things:
+
+#. One "zonegroup", which contains one zone.
+#. One or more `ceph-radosgw` instances that have client requests
+ load-balanced between them.
+
+In a typical single-zone configuration, there are multiple `ceph-radosgw`
+instances that make use of a single Ceph storage cluster.
+
+Varieties of Multi-site Configuration
+-------------------------------------
+
+.. versionadded:: Jewel
+
+Beginning with the Kraken release, Ceph supports several multi-site
+configurations for the Ceph Object Gateway:
+
+- **Multi-zone:** A more advanced topology, the "multi-zone" configuration, is
+ possible. A multi-zone configuration consists of one zonegroup and
+ multiple zones, with each zone consisting of one or more `ceph-radosgw`
+ instances. **Each zone is backed by its own Ceph Storage Cluster.**
+
+ The presence of multiple zones in a given zonegroup provides disaster
+ recovery for that zonegroup in the event that one of the zones experiences a
+ significant failure. Beginning with the Kraken release, each zone is active
+ and can receive write operations. A multi-zone configuration that contains
+ multiple active zones enhances disaster recovery and can also be used as a
+ foundation for content delivery networks.
+
+- **Multi-zonegroups:** Ceph Object Gateway supports multiple zonegroups (which
+ were formerly called "regions"). Each zonegroup contains one or more zones.
+ If two zones are in the same zonegroup, and if that zonegroup is in the same
+ realm as a second zonegroup, then the objects stored in the two zones share
+ a global object namespace. This global object namespace ensures unique
+ object IDs across zonegroups and zones.
+
+ Each bucket is owned by the zonegroup where it was created (except where
+ overridden by the :ref:`LocationConstraint<s3_bucket_placement>` on
+ bucket creation), and its object data will only replicate to other zones in
+ that zonegroup. Any requests for data in that bucket that are sent to other
+ zonegroups will be redirected to the zonegroup where the bucket resides.
+
+ It can be useful to create multiple zonegroups when you want to share a
+ namespace of users and buckets across many zones, but isolate the object data
+ to a subset of those zones. It might be that you have several connected sites
+ that share storage, but only require a single backup for purposes of disaster
+ recovery. In such a case, it could make sense to create several zonegroups
+ with only two zones each to avoid replicating all objects to all zones.
+
+ In other cases, it might make more sense to isolate things in separate
+ realms, with each realm having a single zonegroup. Zonegroups provide
+ flexibility by making it possible to control the isolation of data and
+ metadata separately.
+
+- **Multiple Realms:** Beginning with the Kraken release, the Ceph Object
+ Gateway supports "realms", which are containers for zonegroups. Realms make
+ it possible to set policies that apply to multiple zonegroups. Realms have a
+ globally unique namespace and can contain either a single zonegroup or
+ multiple zonegroups. If you choose to make use of multiple realms, you can
+ define multiple namespaces and multiple configurations (this means that each
+ realm can have a configuration that is distinct from the configuration of
+ other realms).
+
+
+Diagram - Replication of Object Data Between Zones
+--------------------------------------------------
+
+The replication of object data between zones within a zonegroup looks
+something like this:
+
+.. image:: ../images/zone-sync.svg
+ :align: center
+
+At the top of this diagram, we see two applications (also known as "clients").
+The application on the right is both writing and reading data from the Ceph
+Cluster, by means of the RADOS Gateway (RGW). The application on the left is
+only *reading* data from the Ceph Cluster, by means of an instance of RADOS
+Gateway (RGW). In both cases (read-and-write and read-only), the transmission
+of data is handled RESTfully.
+
+In the middle of this diagram, we see two zones, each of which contains an
+instance of RADOS Gateway (RGW). These instances of RGW are handling the
+movement of data from the applications to the zonegroup. The arrow from the
+master zone (US-EAST) to the secondary zone (US-WEST) represents an act of data
+synchronization.
+
+At the bottom of this diagram, we see the data distributed into the Ceph
+Storage Cluster.
+
+For additional details on setting up a cluster, see `Ceph Object Gateway for
+Production <https://access.redhat.com/documentation/en-us/red_hat_ceph_storage/3/html/ceph_object_gateway_for_production/index/>`__.
+
+Functional Changes from Infernalis
+==================================
+
+Beginning with Kraken, each Ceph Object Gateway can be configured to work in an
+active-active zone mode. This makes it possible to write to non-master zones.
+
+The multi-site configuration is stored within a container called a "realm". The
+realm stores zonegroups, zones, and a time "period" with multiple epochs, which
+are used for tracking changes to the configuration.
+
+Beginning with Kraken, the ``ceph-radosgw`` daemons handle the synchronization
+of data across zones, which eliminates the need for a separate synchronization
+agent. This new approach to synchronization allows the Ceph Object Gateway to
+operate with an "active-active" configuration instead of with an
+"active-passive" configuration.
+
+Requirements and Assumptions
+============================
+
+A multi-site configuration requires at least two Ceph storage clusters. The
+multi-site configuration must have at least two Ceph object gateway instances
+(one for each Ceph storage cluster).
+
+This guide assumes that at least two Ceph storage clusters are in
+geographically separate locations; however, the configuration can work on the
+same site. This guide also assumes two Ceph object gateway servers named
+``rgw1`` and ``rgw2``.
+
+.. important:: Running a single geographically-distributed Ceph storage cluster
+ is NOT recommended unless you have low latency WAN connections.
+
+A multi-site configuration requires a master zonegroup and a master zone. Each
+zonegroup requires a master zone. Zonegroups may have one or more secondary
+or non-master zones.
+
+In this guide, the ``rgw1`` host will serve as the master zone of the master
+zonegroup; and, the ``rgw2`` host will serve as the secondary zone of the
+master zonegroup.
+
+See `Pools`_ for instructions on creating and tuning pools for Ceph Object
+Storage.
+
+See `Sync Policy Config`_ for instructions on defining fine-grained bucket sync
+policy rules.
+
+.. _master-zone-label:
+
+Configuring a Master Zone
+=========================
+
+All gateways in a multi-site configuration retrieve their configurations from a
+``ceph-radosgw`` daemon that is on a host within both the master zonegroup and
+the master zone. To configure your gateways in a multi-site configuration,
+choose a ``ceph-radosgw`` instance to configure the master zonegroup and
+master zone.
+
+Create a Realm
+--------------
+
+A realm contains the multi-site configuration of zonegroups and zones. The
+realm enforces a globally unique namespace within itself.
+
+#. Create a new realm for the multi-site configuration by opening a command
+ line interface on a host that will serve in the master zonegroup and zone.
+ Then run the following command:
+
+ .. prompt:: bash #
+
+ radosgw-admin realm create --rgw-realm={realm-name} [--default]
+
+ For example:
+
+ .. prompt:: bash #
+
+ radosgw-admin realm create --rgw-realm=movies --default
+
+ .. note:: If you intend the cluster to have a single realm, specify the ``--default`` flag.
+
+ If ``--default`` is specified, ``radosgw-admin`` uses this realm by default.
+
+ If ``--default`` is not specified, you must specify either the ``--rgw-realm`` flag or the ``--realm-id`` flag to identify the realm when adding zonegroups and zones.
+
+#. After the realm has been created, ``radosgw-admin`` echoes back the realm
+ configuration. For example:
+
+ ::
+
+ {
+ "id": "0956b174-fe14-4f97-8b50-bb7ec5e1cf62",
+ "name": "movies",
+ "current_period": "1950b710-3e63-4c41-a19e-46a715000980",
+ "epoch": 1
+ }
+
+ .. note:: Ceph generates a unique ID for the realm, which can be used to rename the realm if the need arises.
+
+Create a Master Zonegroup
+--------------------------
+
+A realm must have at least one zonegroup which serves as the master zonegroup
+for the realm.
+
+#. To create a new master zonegroup for the multi-site configuration, open a
+ command-line interface on a host in the master zonegroup and zone. Then
+ run the following command:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup create --rgw-zonegroup={name} --endpoints={url} [--rgw-realm={realm-name}|--realm-id={realm-id}] --master --default
+
+ For example:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup create --rgw-zonegroup=us --endpoints=http://rgw1:80 --rgw-realm=movies --master --default
+
+ .. note:: If the realm will have only a single zonegroup, specify the ``--default`` flag.
+
+ If ``--default`` is specified, ``radosgw-admin`` uses this zonegroup by default when adding new zones.
+
+ If ``--default`` is not specified, you must use either the ``--rgw-zonegroup`` flag or the ``--zonegroup-id`` flag to identify the zonegroup when adding or modifying zones.
+
+#. After creating the master zonegroup, ``radosgw-admin`` echoes back the
+ zonegroup configuration. For example:
+
+ ::
+
+ {
+ "id": "f1a233f5-c354-4107-b36c-df66126475a6",
+ "name": "us",
+ "api_name": "us",
+ "is_master": "true",
+ "endpoints": [
+ "http:\/\/rgw1:80"
+ ],
+ "hostnames": [],
+ "hostnames_s3website": [],
+ "master_zone": "",
+ "zones": [],
+ "placement_targets": [],
+ "default_placement": "",
+ "realm_id": "0956b174-fe14-4f97-8b50-bb7ec5e1cf62"
+ }
+
+Create a Master Zone
+--------------------
+
+.. important:: Zones must be created on a Ceph Object Gateway node that will be
+ within the zone.
+
+Create a new master zone for the multi-site configuration by opening a command
+line interface on a host that serves in the master zonegroup and zone. Then
+run the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin zone create --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --master --default \
+ --endpoints={http://fqdn}[,{http://fqdn}]
+
+For example:
+
+.. prompt:: bash #
+
+ radosgw-admin zone create --rgw-zonegroup=us --rgw-zone=us-east \
+ --master --default \
+ --endpoints=http://rgw1:80
+
+
+.. note:: The ``--access-key`` and ``--secret`` aren’t specified. These
+ settings will be added to the zone once the user is created in the
+ next section.
+
+.. important:: The following steps assume a multi-site configuration that uses
+ newly installed systems that aren’t storing data yet. DO NOT DELETE the
+ ``default`` zone and its pools if you are already using the zone to store
+ data, or the data will be deleted and unrecoverable.
+
+Delete Default Zonegroup and Zone
+----------------------------------
+
+#. Delete the ``default`` zone if it exists. Remove it from the default
+ zonegroup first.
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup delete --rgw-zonegroup=default --rgw-zone=default
+ radosgw-admin period update --commit
+ radosgw-admin zone delete --rgw-zone=default
+ radosgw-admin period update --commit
+ radosgw-admin zonegroup delete --rgw-zonegroup=default
+ radosgw-admin period update --commit
+
+#. Delete the ``default`` pools in your Ceph storage cluster if they exist.
+
+ .. important:: The following step assumes a multi-site configuration that uses newly installed systems that aren’t currently storing data. DO NOT DELETE the ``default`` zonegroup if you are already using it to store data.
+
+ .. prompt:: bash #
+
+ ceph osd pool rm default.rgw.control default.rgw.control --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.data.root default.rgw.data.root --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.gc default.rgw.gc --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.log default.rgw.log --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.users.uid default.rgw.users.uid --yes-i-really-really-mean-it
+
+Create a System User
+--------------------
+
+#. The ``ceph-radosgw`` daemons must authenticate before pulling realm and
+ period information. In the master zone, create a "system user" to facilitate
+ authentication between daemons.
+
+ .. prompt:: bash #
+
+ radosgw-admin user create --uid="{user-name}" --display-name="{Display Name}" --system
+
+ For example:
+
+ .. prompt:: bash #
+
+ radosgw-admin user create --uid="synchronization-user" --display-name="Synchronization User" --system
+
+#. Make a note of the ``access_key`` and ``secret_key``. The secondary zones
+ require them to authenticate against the master zone.
+
+#. Add the system user to the master zone:
+
+ .. prompt:: bash #
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --access-key={access-key} --secret={secret}
+ radosgw-admin period update --commit
+
+Update the Period
+-----------------
+
+After updating the master zone configuration, update the period.
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+.. note:: Updating the period changes the epoch, and ensures that other zones
+ will receive the updated configuration.
+
+Update the Ceph Configuration File
+----------------------------------
+
+Update the Ceph configuration file on master zone hosts by adding the
+``rgw_zone`` configuration option and the name of the master zone to the
+instance entry.
+
+::
+
+ [client.rgw.{instance-name}]
+ ...
+ rgw_zone={zone-name}
+
+For example:
+
+::
+
+ [client.rgw.rgw1]
+ host = rgw1
+ rgw frontends = "civetweb port=80"
+ rgw_zone=us-east
+
+Start the Gateway
+-----------------
+
+On the object gateway host, start and enable the Ceph Object Gateway
+service:
+
+.. prompt:: bash #
+
+ systemctl start ceph-radosgw@rgw.`hostname -s`
+ systemctl enable ceph-radosgw@rgw.`hostname -s`
+
+.. _secondary-zone-label:
+
+Configuring Secondary Zones
+===========================
+
+Zones that are within a zonegroup replicate all data in order to ensure that
+every zone has the same data. When creating a secondary zone, run the following
+operations on a host identified to serve the secondary zone.
+
+.. note:: To add a second secondary zone (that is, a second non-master zone
+ within a zonegroup that already contains a secondary zone), follow :ref:`the
+ same procedures that are used for adding a secondary
+ zone<radosgw-multisite-secondary-zone-creating>`. Be sure to specify a
+ different zone name than the name of the first secondary zone.
+
+.. important:: Metadata operations (for example, user creation) must be
+ run on a host within the master zone. Bucket operations can be received
+ by the master zone or the secondary zone, but the secondary zone will
+ redirect bucket operations to the master zone. If the master zone is down,
+ bucket operations will fail.
+
+Pulling the Realm Configuration
+-------------------------------
+
+The URL path, access key, and secret of the master zone in the master zone
+group are used to pull the realm configuration to the host. When pulling the
+configuration of a non-default realm, specify the realm using the
+``--rgw-realm`` or ``--realm-id`` configuration options.
+
+.. prompt:: bash #
+
+ radosgw-admin realm pull --url={url-to-master-zone-gateway} \
+ --access-key={access-key} --secret={secret}
+
+.. note:: Pulling the realm configuration also retrieves the remote's current
+ period configuration, and makes it the current period on this host as well.
+
+If this realm is the only realm, run the following command to make it the
+default realm:
+
+.. prompt:: bash #
+
+ radosgw-admin realm default --rgw-realm={realm-name}
+
+.. _radosgw-multisite-secondary-zone-creating:
+
+Creating a Secondary Zone
+-------------------------
+
+.. important:: When a zone is created, it must be on a Ceph Object Gateway node
+ within the zone.
+
+In order to create a secondary zone for the multi-site configuration, open a
+command line interface on a host identified to serve the secondary zone.
+Specify the zonegroup ID, the new zone name, and an endpoint for the zone.
+**DO NOT** use the ``--master`` or ``--default`` flags. Beginning in Kraken,
+all zones run in an active-active configuration by default, which means that a
+gateway client may write data to any zone and the zone will replicate the data
+to all other zones within the zonegroup. If you want to prevent the secondary
+zone from accepting write operations, include the ``--read-only`` flag in the
+command in order to create an active-passive configuration between the master
+zone and the secondary zone. In any case, don't forget to provide the
+``access_key`` and ``secret_key`` of the generated system user that is stored
+in the master zone of the master zonegroup. Run the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin zone create --rgw-zonegroup={zone-group-name} \
+ --rgw-zone={zone-name} \
+ --access-key={system-key} --secret={secret} \
+ --endpoints=http://{fqdn}:80 \
+ [--read-only]
+
+For example:
+
+
+.. prompt:: bash #
+
+ radosgw-admin zone create --rgw-zonegroup=us --rgw-zone=us-west \
+ --access-key={system-key} --secret={secret} \
+ --endpoints=http://rgw2:80
+
+.. important:: The following steps assume a multi-site configuration that uses
+   newly installed systems that have not yet begun storing data. **DO NOT
+   DELETE** the ``default`` zone or its pools if you are already using the
+   zone to store data, or the data will be irretrievably lost.
+
+Delete the default zone if needed:
+
+.. prompt:: bash #
+
+ radosgw-admin zone delete --rgw-zone=default
+
+Finally, delete the default pools in your Ceph storage cluster if needed:
+
+.. prompt:: bash #
+
+ ceph osd pool rm default.rgw.control default.rgw.control --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.data.root default.rgw.data.root --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.gc default.rgw.gc --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.log default.rgw.log --yes-i-really-really-mean-it
+ ceph osd pool rm default.rgw.users.uid default.rgw.users.uid --yes-i-really-really-mean-it
+
+Updating the Ceph Configuration File
+------------------------------------
+
+To update the Ceph configuration file on the secondary zone hosts, add the
+``rgw_zone`` configuration option and the name of the secondary zone to the
+instance entry.
+
+::
+
+ [client.rgw.{instance-name}]
+ ...
+ rgw_zone={zone-name}
+
+For example:
+
+::
+
+ [client.rgw.rgw2]
+ host = rgw2
+ rgw frontends = "civetweb port=80"
+ rgw_zone=us-west
+
+Updating the Period
+-------------------
+
+After updating the master zone configuration, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+.. note:: Updating the period changes the epoch, and ensures that other zones
+ will receive the updated configuration.
+
+Starting the Gateway
+--------------------
+
+To start the gateway, start and enable the Ceph Object Gateway service by
+running the following commands on the object gateway host:
+
+.. prompt:: bash #
+
+ systemctl start ceph-radosgw@rgw.`hostname -s`
+ systemctl enable ceph-radosgw@rgw.`hostname -s`
+
+Checking Synchronization Status
+-------------------------------
+
+After the secondary zone is up and running, you can check the synchronization
+status. Synchronization copies users and buckets that were created in the
+master zone to the secondary zone.
+
+.. prompt:: bash #
+
+ radosgw-admin sync status
+
+The output reports the status of synchronization operations. For example:
+
+::
+
+ realm f3239bc5-e1a8-4206-a81d-e1576480804d (earth)
+ zonegroup c50dbb7e-d9ce-47cc-a8bb-97d9b399d388 (us)
+ zone 4c453b70-4a16-4ce8-8185-1893b05d346e (us-west)
+ metadata sync syncing
+ full sync: 0/64 shards
+ metadata is caught up with master
+ incremental sync: 64/64 shards
+ data sync source: 1ee9da3e-114d-4ae3-a8a4-056e8a17f532 (us-east)
+ syncing
+ full sync: 0/128 shards
+ incremental sync: 128/128 shards
+ data is caught up with source
+
+.. note:: Secondary zones accept bucket operations; however, secondary zones
+ redirect bucket operations to the master zone and then synchronize with the
+ master zone to receive the result of the bucket operations. If the master
+ zone is down, bucket operations executed on the secondary zone will fail,
+ but object operations should succeed.
+
+
+Verifying an Object
+-------------------
+
+By default, after the successful synchronization of an object there is no
+subsequent verification of the object. However, you can enable verification by
+setting :confval:`rgw_sync_obj_etag_verify` to ``true``. When this option is
+enabled, an MD5 checksum is used to verify the integrity of the data that was
+transferred from the source to the destination. This ensures the integrity of
+any object that has been fetched from a remote server over HTTP (including
+multisite sync). This option may decrease the performance of your RGW because
+it requires more computation.
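+
+For example, assuming that your cluster uses the centralized configuration
+database, one way to enable this verification is:
+
+.. prompt:: bash #
+
+   ceph config set client.rgw rgw_sync_obj_etag_verify true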
+
+
+Maintenance
+===========
+
+Checking the Sync Status
+------------------------
+
+Information about the replication status of a zone can be queried with:
+
+.. prompt:: bash $
+
+ radosgw-admin sync status
+
+::
+
+ realm b3bc1c37-9c44-4b89-a03b-04c269bea5da (earth)
+ zonegroup f54f9b22-b4b6-4a0e-9211-fa6ac1693f49 (us)
+ zone adce11c9-b8ed-4a90-8bc5-3fc029ff0816 (us-2)
+ metadata sync syncing
+ full sync: 0/64 shards
+ incremental sync: 64/64 shards
+ metadata is behind on 1 shards
+ oldest incremental change not applied: 2017-03-22 10:20:00.0.881361s
+ data sync source: 341c2d81-4574-4d08-ab0f-5a2a7b168028 (us-1)
+ syncing
+ full sync: 0/128 shards
+ incremental sync: 128/128 shards
+ data is caught up with source
+ source: 3b5d1a3f-3f27-4e4a-8f34-6072d4bb1275 (us-3)
+ syncing
+ full sync: 0/128 shards
+ incremental sync: 128/128 shards
+ data is caught up with source
+
+The output might be different, depending on the sync status. During sync, the
+shards are of two types:
+
+- **Behind shards** are shards that require a data sync (either a full data
+ sync or an incremental data sync) in order to be brought up to date.
+
+- **Recovery shards** are shards that encountered an error during sync and have
+  been marked for retry. Such errors mostly arise from minor issues, such as a
+  failure to acquire a lock on a bucket, and typically resolve on their own.
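+
+If recovery shards persist, the errors recorded by the sync process can be
+inspected and, once resolved, trimmed:
+
+.. prompt:: bash $
+
+   radosgw-admin sync error list
+   radosgw-admin sync error trim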
+
+Checking the Logs
+-----------------
+
+For multi-site deployments only, you can examine the metadata log (``mdlog``),
+the bucket index log (``bilog``), and the data log (``datalog``). You can list
+them and also trim them. Trimming is not needed in most cases because
+:confval:`rgw_sync_log_trim_interval` is set to 20 minutes by default. It
+should not be necessary to trim the logs unless
+:confval:`rgw_sync_log_trim_interval` has been manually set to 0.
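+
+For example, the logs can be listed with commands of the following form (the
+``--bucket`` value is a placeholder):
+
+.. prompt:: bash $
+
+   radosgw-admin mdlog list
+   radosgw-admin datalog list
+   radosgw-admin bilog list --bucket={bucket-name}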
+
+Changing the Metadata Master Zone
+---------------------------------
+
+.. important:: Take care when promoting a zone to metadata master. A zone that
+   has not finished syncing metadata from the current master zone will be
+   unable to serve the remaining entries if it is promoted to master, and
+   those metadata changes will be lost. For this reason, we recommend waiting
+   until ``radosgw-admin sync status`` reports that metadata synchronization
+   has caught up before promoting the zone to master.
+
+Similarly, if the current master zone is processing changes to metadata at the
+same time that another zone is being promoted to master, these changes are
+likely to be lost. To avoid losing these changes, we recommend shutting down
+any ``radosgw`` instances on the previous master zone. After the new master
+zone has been promoted, the previous master zone's new period can be fetched
+with ``radosgw-admin period pull`` and the gateway(s) can be restarted.
+
+To promote a zone to metadata master, run the following commands on that zone
+(in this example, the zone is zone ``us-2`` in zonegroup ``us``):
+
+.. prompt:: bash $
+
+ radosgw-admin zone modify --rgw-zone=us-2 --master
+ radosgw-admin zonegroup modify --rgw-zonegroup=us --master
+ radosgw-admin period update --commit
+
+This generates a new period, and the radosgw instance(s) in zone ``us-2`` send
+this period to the other zones.
+
+Failover and Disaster Recovery
+==============================
+
+Setting Up Failover to the Secondary Zone
+-----------------------------------------
+
+If the master zone fails, you can fail over to the secondary zone for
+disaster recovery by following these steps:
+
+#. Make the secondary zone the master and default zone. For example:
+
+ .. prompt:: bash #
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --master --default
+
+ By default, Ceph Object Gateway runs in an active-active
+ configuration. However, if the cluster is configured to run in an
+ active-passive configuration, the secondary zone is a read-only zone.
+ To allow the secondary zone to receive write
+ operations, remove its ``--read-only`` status. For example:
+
+ .. prompt:: bash #
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --master --default \
+ --read-only=false
+
+#. Update the period to make the changes take effect.
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+#. Finally, restart the Ceph Object Gateway.
+
+ .. prompt:: bash #
+
+ systemctl restart ceph-radosgw@rgw.`hostname -s`
+
+Reverting from Failover
+-----------------------
+
+If the former master zone recovers, you can revert the failover operation by following these steps:
+
+#. From within the recovered zone, pull the latest realm configuration
+ from the current master zone:
+
+ .. prompt:: bash #
+
+ radosgw-admin realm pull --url={url-to-master-zone-gateway} \
+ --access-key={access-key} --secret={secret}
+
+#. Make the recovered zone the master and default zone:
+
+ .. prompt:: bash #
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --master --default
+
+#. Update the period so that the changes take effect:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+#. Restart the Ceph Object Gateway in the recovered zone:
+
+ .. prompt:: bash #
+
+ systemctl restart ceph-radosgw@rgw.`hostname -s`
+
+#. If the secondary zone needs to be a read-only configuration, update
+ the secondary zone:
+
+ .. prompt:: bash #
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --read-only
+
+#. Update the period so that the changes take effect:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+#. Restart the Ceph Object Gateway in the secondary zone:
+
+ .. prompt:: bash #
+
+ systemctl restart ceph-radosgw@rgw.`hostname -s`
+
+.. _rgw-multisite-migrate-from-single-site:
+
+Migrating a Single-Site Deployment to Multi-Site
+=================================================
+
+To migrate from a single-site deployment with a ``default`` zonegroup and zone
+to a multi-site system, follow these steps:
+
+1. Create a realm. Replace ``<name>`` with the realm name:
+
+ .. prompt:: bash #
+
+ radosgw-admin realm create --rgw-realm=<name> --default
+
+2. Rename the default zonegroup and zone. Replace ``<name>`` with the zone name
+ or zonegroup name:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup rename --rgw-zonegroup default --zonegroup-new-name=<name>
+      radosgw-admin zone rename --rgw-zone default --zone-new-name=<name> --rgw-zonegroup=<name>
+
+3. Rename the default zonegroup's ``api_name``. Replace ``<name>`` with the zonegroup name:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup modify --api-name=<name> --rgw-zonegroup=<name>
+
+4. Configure the master zonegroup. Replace ``<name>`` with the realm name or
+ zonegroup name. Replace ``<fqdn>`` with the fully qualified domain name(s)
+ in the zonegroup:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup modify --rgw-realm=<name> --rgw-zonegroup=<name> --endpoints http://<fqdn>:80 --master --default
+
+5. Configure the master zone. Replace ``<name>`` with the realm name, zone
+ name, or zonegroup name. Replace ``<fqdn>`` with the fully qualified domain
+ name(s) in the zonegroup:
+
+ .. prompt:: bash #
+
+ radosgw-admin zone modify --rgw-realm=<name> --rgw-zonegroup=<name> \
+ --rgw-zone=<name> --endpoints http://<fqdn>:80 \
+ --access-key=<access-key> --secret=<secret-key> \
+ --master --default
+
+6. Create a system user. Replace ``<user-id>`` with the username. Replace
+ ``<display-name>`` with a display name. The display name is allowed to
+ contain spaces:
+
+ .. prompt:: bash #
+
+ radosgw-admin user create --uid=<user-id> \
+ --display-name="<display-name>" \
+ --access-key=<access-key> \
+ --secret=<secret-key> --system
+
+7. Commit the updated configuration:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+8. Restart the Ceph Object Gateway:
+
+ .. prompt:: bash #
+
+ systemctl restart ceph-radosgw@rgw.`hostname -s`
+
+After completing this procedure, proceed to :ref:`secondary-zone-label` and
+create a secondary zone in the master zonegroup.
+
+Multi-Site Configuration Reference
+==================================
+
+The following sections provide additional details and command-line
+usage for realms, periods, zonegroups and zones.
+
+For more details on every available configuration option, see
+``src/common/options/rgw.yaml.in``.
+
+Alternatively, go to the :ref:`mgr-dashboard` configuration page (found under
+``Cluster``), where you can view and set all of the options. While on the
+page, set the level to ``advanced`` and search for RGW to see all basic and
+advanced configuration options.
+
+.. _rgw-realms:
+
+Realms
+------
+
+A realm is a globally unique namespace that consists of one or more zonegroups.
+Zonegroups contain one or more zones. Zones contain buckets. Buckets contain
+objects.
+
+Realms make it possible for the Ceph Object Gateway to support multiple
+namespaces and their configurations on the same hardware.
+
+Each realm is associated with a "period". A period represents the state
+of the zonegroup and zone configuration in time. Each time you make a
+change to a zonegroup or zone, you should update and commit the period.
+
+To ensure backward compatibility with Infernalis and earlier releases, the Ceph
+Object Gateway does not by default create a realm. However, as a best practice,
+we recommend that you create realms when creating new clusters.
+
+Create a Realm
+~~~~~~~~~~~~~~
+
+To create a realm, run ``realm create`` and specify the realm name.
+If the realm is the default, specify ``--default``.
+
+.. prompt:: bash #
+
+ radosgw-admin realm create --rgw-realm={realm-name} [--default]
+
+For example:
+
+.. prompt:: bash #
+
+ radosgw-admin realm create --rgw-realm=movies --default
+
+When ``--default`` is specified, the realm is assumed implicitly by each
+``radosgw-admin`` call unless ``--rgw-realm`` and a realm name are
+explicitly provided.
+
+Make a Realm the Default
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+One realm in the list of realms should be the default realm. There may be only
+one default realm. If there is only one realm and it wasn’t specified as the
+default realm when it was created, make it the default realm. Alternatively, to
+change which realm is the default, run the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin realm default --rgw-realm=movies
+
+.. note:: When the realm is default, the command line assumes
+ ``--rgw-realm=<realm-name>`` as an argument.
+
+Delete a Realm
+~~~~~~~~~~~~~~
+
+To delete a realm, run ``realm rm`` and specify the realm name:
+
+.. prompt:: bash #
+
+ radosgw-admin realm rm --rgw-realm={realm-name}
+
+For example:
+
+.. prompt:: bash #
+
+ radosgw-admin realm rm --rgw-realm=movies
+
+Get a Realm
+~~~~~~~~~~~
+
+To get a realm, run ``realm get`` and specify the realm name:
+
+.. prompt:: bash #
+
+ radosgw-admin realm get --rgw-realm=<name>
+
+For example:
+
+.. prompt:: bash #
+
+ radosgw-admin realm get --rgw-realm=movies [> filename.json]
+
+::
+
+ {
+ "id": "0a68d52e-a19c-4e8e-b012-a8f831cb3ebc",
+ "name": "movies",
+ "current_period": "b0c5bbef-4337-4edd-8184-5aeab2ec413b",
+ "epoch": 1
+ }
+
+Set a Realm
+~~~~~~~~~~~
+
+To set a realm, run ``realm set``, specify the realm name, and use the
+``--infile=`` option (make sure that the ``--infile`` option has an input file
+name as an argument):
+
+.. prompt:: bash #
+
+ radosgw-admin realm set --rgw-realm=<name> --infile=<infilename>
+
+For example:
+
+.. prompt:: bash #
+
+ radosgw-admin realm set --rgw-realm=movies --infile=filename.json
+
+List Realms
+~~~~~~~~~~~
+
+To list realms, run ``realm list``:
+
+.. prompt:: bash #
+
+ radosgw-admin realm list
+
+List Realm Periods
+~~~~~~~~~~~~~~~~~~
+
+To list realm periods, run ``realm list-periods``:
+
+.. prompt:: bash #
+
+ radosgw-admin realm list-periods
+
+Pull a Realm
+~~~~~~~~~~~~
+
+To pull a realm from the node that contains both the master zonegroup and
+master zone to a node that contains a secondary zonegroup or zone, run ``realm
+pull`` on the node that will receive the realm configuration:
+
+.. prompt:: bash #
+
+ radosgw-admin realm pull --url={url-to-master-zone-gateway} --access-key={access-key} --secret={secret}
+
+Rename a Realm
+~~~~~~~~~~~~~~
+
+A realm is not part of the period. Consequently, any renaming of the realm is
+applied only locally, and will therefore not get pulled when you run ``realm
+pull``. If you are renaming a realm that contains multiple zones, run the
+``rename`` command on each zone.
+
+To rename a realm, run the following:
+
+.. prompt:: bash #
+
+ radosgw-admin realm rename --rgw-realm=<current-name> --realm-new-name=<new-realm-name>
+
+.. note:: DO NOT use ``realm set`` to change the ``name`` parameter. Doing so
+ changes the internal name only. If you use ``realm set`` to change the
+ ``name`` parameter, then ``--rgw-realm`` still expects the realm's old name.
+
+Zonegroups
+-----------
+
+Zonegroups make it possible for the Ceph Object Gateway to support multi-site
+deployments and a global namespace. Zonegroups were formerly called "regions"
+(in releases prior to and including Infernalis).
+
+A zonegroup defines the geographic location of one or more Ceph Object Gateway
+instances within one or more zones.
+
+The configuration of zonegroups differs from typical configuration procedures,
+because not all of the zonegroup configuration settings are stored to a
+configuration file.
+
+You can list zonegroups, get a zonegroup configuration, and set a zonegroup
+configuration.
+
+Creating a Zonegroup
+~~~~~~~~~~~~~~~~~~~~
+
+Creating a zonegroup consists of specifying the zonegroup name. Newly created
+zonegroups reside in the default realm unless a different realm is specified
+by using the option ``--rgw-realm=<realm-name>``.
+
+If the zonegroup is the default zonegroup, specify the ``--default`` flag. If
+the zonegroup is the master zonegroup, specify the ``--master`` flag. For
+example:
+
+.. prompt:: bash #
+
+ radosgw-admin zonegroup create --rgw-zonegroup=<name> [--rgw-realm=<name>][--master] [--default]
+
+
+.. note:: Use ``zonegroup modify --rgw-zonegroup=<zonegroup-name>`` to modify
+ an existing zonegroup’s settings.
+
+Making a Zonegroup the Default
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+One zonegroup in the list of zonegroups must be the default zonegroup. There
+can be only one default zonegroup. If there is only one zonegroup and it was
+not designated the default zonegroup when it was created, use the following
+command to make it the default. Commands of this form can also be used to
+change which zonegroup is the default.
+
+#. Designate a zonegroup as the default zonegroup:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup default --rgw-zonegroup=comedy
+
+ .. note:: When the zonegroup is default, the command line assumes that the name of the zonegroup will be the argument of the ``--rgw-zonegroup=<zonegroup-name>`` option. (In this example, ``<zonegroup-name>`` has been retained for the sake of consistency and legibility.)
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Adding a Zone to a Zonegroup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This procedure explains how to add a zone to a zonegroup.
+
+#. Run the following command to add a zone to a zonegroup:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup add --rgw-zonegroup=<name> --rgw-zone=<name>
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Removing a Zone from a Zonegroup
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. Run this command to remove a zone from a zonegroup:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup remove --rgw-zonegroup=<name> --rgw-zone=<name>
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Renaming a Zonegroup
+~~~~~~~~~~~~~~~~~~~~
+
+#. Run this command to rename the zonegroup:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup rename --rgw-zonegroup=<name> --zonegroup-new-name=<name>
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Deleting a Zonegroup
+~~~~~~~~~~~~~~~~~~~~
+
+#. To delete a zonegroup, run the following command:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup delete --rgw-zonegroup=<name>
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Listing Zonegroups
+~~~~~~~~~~~~~~~~~~
+
+A Ceph cluster contains a list of zonegroups. To list the zonegroups, run
+this command:
+
+.. prompt:: bash #
+
+ radosgw-admin zonegroup list
+
+The ``radosgw-admin`` command returns a JSON-formatted list of zonegroups.
+
+::
+
+ {
+ "default_info": "90b28698-e7c3-462c-a42d-4aa780d24eda",
+ "zonegroups": [
+ "us"
+ ]
+ }
+
+Getting a Zonegroup Map
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To list the details of each zonegroup, run this command:
+
+.. prompt:: bash #
+
+ radosgw-admin zonegroup-map get
+
+.. note:: If you receive a ``failed to read zonegroup map`` error, run
+ ``radosgw-admin zonegroup-map update`` as ``root`` first.
+
+Getting a Zonegroup
+~~~~~~~~~~~~~~~~~~~~
+
+To view the configuration of a zonegroup, run this command:
+
+.. prompt:: bash #
+
+ radosgw-admin zonegroup get [--rgw-zonegroup=<zonegroup>]
+
+The zonegroup configuration looks like this:
+
+::
+
+ {
+ "id": "90b28698-e7c3-462c-a42d-4aa780d24eda",
+ "name": "us",
+ "api_name": "us",
+ "is_master": "true",
+ "endpoints": [
+ "http:\/\/rgw1:80"
+ ],
+ "hostnames": [],
+ "hostnames_s3website": [],
+ "master_zone": "9248cab2-afe7-43d8-a661-a40bf316665e",
+ "zones": [
+ {
+ "id": "9248cab2-afe7-43d8-a661-a40bf316665e",
+ "name": "us-east",
+ "endpoints": [
+ "http:\/\/rgw1"
+ ],
+ "log_meta": "true",
+ "log_data": "true",
+ "bucket_index_max_shards": 0,
+ "read_only": "false"
+ },
+ {
+ "id": "d1024e59-7d28-49d1-8222-af101965a939",
+ "name": "us-west",
+ "endpoints": [
+ "http:\/\/rgw2:80"
+ ],
+ "log_meta": "false",
+ "log_data": "true",
+ "bucket_index_max_shards": 0,
+ "read_only": "false"
+ }
+ ],
+ "placement_targets": [
+ {
+ "name": "default-placement",
+ "tags": []
+ }
+ ],
+ "default_placement": "default-placement",
+ "realm_id": "ae031368-8715-4e27-9a99-0c9468852cfe"
+ }
+
+Setting a Zonegroup
+~~~~~~~~~~~~~~~~~~~~
+
+The process of defining a zonegroup consists of creating a JSON object and
+specifying its settings. Here is a list of the settings, each marked as
+required or optional:
+
+1. ``name``: The name of the zonegroup. Required.
+
+2. ``api_name``: The API name for the zonegroup. Optional.
+
+3. ``is_master``: Determines whether the zonegroup is the master zonegroup.
+ Required. **note:** You can only have one master zonegroup.
+
+4. ``endpoints``: A list of all the endpoints in the zonegroup. For example,
+ you may use multiple domain names to refer to the same zonegroup. Remember
+ to escape the forward slashes (``\/``). You may also specify a port
+ (``fqdn:port``) for each endpoint. Optional.
+
+5. ``hostnames``: A list of all the hostnames in the zonegroup. For example,
+ you may use multiple domain names to refer to the same zonegroup. Optional.
+ The ``rgw dns name`` setting will be included in this list automatically.
+ Restart the gateway daemon(s) after changing this setting.
+
+6. ``master_zone``: The master zone for the zonegroup. Optional. Uses
+ the default zone if not specified. **note:** You can only have one
+ master zone per zonegroup.
+
+7. ``zones``: A list of all zones within the zonegroup. Each zone has a name
+ (required), a list of endpoints (optional), and a setting that determines
+ whether the gateway will log metadata and data operations (false by
+ default).
+
+8. ``placement_targets``: A list of placement targets (optional). Each
+ placement target contains a name (required) for the placement target
+ and a list of tags (optional) so that only users with the tag can use
+ the placement target (that is, the user’s ``placement_tags`` field in
+ the user info).
+
+9. ``default_placement``: The default placement target for the object index and
+ object data. Set to ``default-placement`` by default. It is also possible
+ to set a per-user default placement in the user info for each user.
+
+Setting a Zonegroup - Procedure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. To set a zonegroup, create a JSON object that contains the required fields,
+ save the object to a file (for example, ``zonegroup.json``), and run the
+ following command:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup set --infile zonegroup.json
+
+ Where ``zonegroup.json`` is the JSON file you created.
+
+ .. important:: The ``default`` zonegroup ``is_master`` setting is ``true`` by default. If you create an additional zonegroup and want to make it the master zonegroup, you must either set the ``default`` zonegroup ``is_master`` setting to ``false`` or delete the ``default`` zonegroup.
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Setting a Zonegroup Map
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The process of setting a zonegroup map comprises (1) creating a JSON object
+that consists of one or more zonegroups, and (2) setting the
+``master_zonegroup`` for the cluster. Each zonegroup in the zonegroup map
+consists of a key/value pair where the ``key`` setting is equivalent to the
+``name`` setting for an individual zonegroup configuration and the ``val`` is
+a JSON object consisting of an individual zonegroup configuration.
+
+You may only have one zonegroup with ``is_master`` equal to ``true``, and it
+must be specified as the ``master_zonegroup`` at the end of the zonegroup map.
+The following JSON object is an example of a default zonegroup map:
+
+::
+
+ {
+ "zonegroups": [
+ {
+ "key": "90b28698-e7c3-462c-a42d-4aa780d24eda",
+ "val": {
+ "id": "90b28698-e7c3-462c-a42d-4aa780d24eda",
+ "name": "us",
+ "api_name": "us",
+ "is_master": "true",
+ "endpoints": [
+ "http:\/\/rgw1:80"
+ ],
+ "hostnames": [],
+ "hostnames_s3website": [],
+ "master_zone": "9248cab2-afe7-43d8-a661-a40bf316665e",
+ "zones": [
+ {
+ "id": "9248cab2-afe7-43d8-a661-a40bf316665e",
+ "name": "us-east",
+ "endpoints": [
+ "http:\/\/rgw1"
+ ],
+ "log_meta": "true",
+ "log_data": "true",
+ "bucket_index_max_shards": 0,
+ "read_only": "false"
+ },
+ {
+ "id": "d1024e59-7d28-49d1-8222-af101965a939",
+ "name": "us-west",
+ "endpoints": [
+ "http:\/\/rgw2:80"
+ ],
+ "log_meta": "false",
+ "log_data": "true",
+ "bucket_index_max_shards": 0,
+ "read_only": "false"
+ }
+ ],
+ "placement_targets": [
+ {
+ "name": "default-placement",
+ "tags": []
+ }
+ ],
+ "default_placement": "default-placement",
+ "realm_id": "ae031368-8715-4e27-9a99-0c9468852cfe"
+ }
+ }
+ ],
+ "master_zonegroup": "90b28698-e7c3-462c-a42d-4aa780d24eda",
+ "bucket_quota": {
+ "enabled": false,
+ "max_size_kb": -1,
+ "max_objects": -1
+ },
+ "user_quota": {
+ "enabled": false,
+ "max_size_kb": -1,
+ "max_objects": -1
+ }
+ }
+
+#. To set a zonegroup map, run the following command:
+
+ .. prompt:: bash #
+
+ radosgw-admin zonegroup-map set --infile zonegroupmap.json
+
+   In this command, ``zonegroupmap.json`` is the JSON file you created. Ensure
+   that the zones specified in the zonegroup map have been created.
+
+#. Update the period:
+
+ .. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+.. _radosgw-zones:
+
+Zones
+-----
+
+A zone defines a logical group that consists of one or more Ceph Object Gateway
+instances. All RGWs in a given zone serve S3 objects that are backed by RADOS
+objects stored in the same set of pools in the same cluster.
+
+The procedure for configuring zones differs from typical configuration
+procedures, because not all of the settings end up in a Ceph configuration
+file.
+
+Zones can be listed. You can "get" a zone configuration and "set" a zone
+configuration.
+
+Creating a Zone
+~~~~~~~~~~~~~~~
+
+To create a zone, specify a zone name. If you are creating a master zone,
+specify the ``--master`` flag. Only one zone in a zonegroup may be a master
+zone. To add the zone to a zonegroup, specify the ``--rgw-zonegroup`` option
+with the zonegroup name.
+
+.. prompt:: bash #
+
+ radosgw-admin zone create --rgw-zone=<name> \
+                [--rgw-zonegroup=<zonegroup-name>] \
+                [--endpoints=<endpoint>[,<endpoint>]] \
+ [--master] [--default] \
+ --access-key $SYSTEM_ACCESS_KEY --secret $SYSTEM_SECRET_KEY
+
+After you have created the zone, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Deleting a Zone
+~~~~~~~~~~~~~~~
+
+To delete a zone, first remove it from the zonegroup:
+
+.. prompt:: bash #
+
+   radosgw-admin zonegroup remove --rgw-zonegroup=<name> \
+                                  --rgw-zone=<name>
+
+Then, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Next, delete the zone:
+
+.. prompt:: bash #
+
+   radosgw-admin zone delete --rgw-zone=<name>
+
+Finally, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+.. important:: Do not delete a zone without removing it from a zonegroup first.
+ Otherwise, updating the period will fail.
+
+If the pools for the deleted zone will not be used anywhere else,
+consider deleting the pools. Replace ``<del-zone>`` in the example below
+with the deleted zone’s name.
+
+.. important:: Only delete the pools with prepended zone names. Deleting the
+ root pool (for example, ``.rgw.root``) will remove all of the system’s
+ configuration.
+
+.. important:: When the pools are deleted, all of the data within them is
+   deleted in an unrecoverable manner. Delete the pools only if their
+   contents are no longer needed.
+
+.. prompt:: bash #
+
+ ceph osd pool rm <del-zone>.rgw.control <del-zone>.rgw.control --yes-i-really-really-mean-it
+ ceph osd pool rm <del-zone>.rgw.meta <del-zone>.rgw.meta --yes-i-really-really-mean-it
+ ceph osd pool rm <del-zone>.rgw.log <del-zone>.rgw.log --yes-i-really-really-mean-it
+ ceph osd pool rm <del-zone>.rgw.otp <del-zone>.rgw.otp --yes-i-really-really-mean-it
+ ceph osd pool rm <del-zone>.rgw.buckets.index <del-zone>.rgw.buckets.index --yes-i-really-really-mean-it
+ ceph osd pool rm <del-zone>.rgw.buckets.non-ec <del-zone>.rgw.buckets.non-ec --yes-i-really-really-mean-it
+ ceph osd pool rm <del-zone>.rgw.buckets.data <del-zone>.rgw.buckets.data --yes-i-really-really-mean-it
+
+Modifying a Zone
+~~~~~~~~~~~~~~~~
+
+To modify a zone, specify the zone name and the parameters you wish to
+modify.
+
+.. prompt:: bash #
+
+ radosgw-admin zone modify [options]
+
+Where ``[options]`` may include:
+
+- ``--access-key=<key>``
+- ``--secret/--secret-key=<key>``
+- ``--master``
+- ``--default``
+- ``--endpoints=<list>``
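+
+For example, to change a zone's endpoints (the zone name and endpoint shown
+are illustrative):
+
+.. prompt:: bash #
+
+   radosgw-admin zone modify --rgw-zone=us-west --endpoints=http://rgw2:80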
+
+Then, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Listing Zones
+~~~~~~~~~~~~~
+
+As ``root``, to list the zones in a cluster, run the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin zone list
+
+Getting a Zone
+~~~~~~~~~~~~~~
+
+As ``root``, to get the configuration of a zone, run the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin zone get [--rgw-zone=<zone>]
+
+The ``default`` zone looks like this:
+
+::
+
+ { "domain_root": ".rgw",
+ "control_pool": ".rgw.control",
+ "gc_pool": ".rgw.gc",
+ "log_pool": ".log",
+ "intent_log_pool": ".intent-log",
+ "usage_log_pool": ".usage",
+ "user_keys_pool": ".users",
+ "user_email_pool": ".users.email",
+ "user_swift_pool": ".users.swift",
+ "user_uid_pool": ".users.uid",
+ "system_key": { "access_key": "", "secret_key": ""},
+ "placement_pools": [
+ { "key": "default-placement",
+ "val": { "index_pool": ".rgw.buckets.index",
+ "data_pool": ".rgw.buckets"}
+ }
+ ]
+ }
+
+Setting a Zone
+~~~~~~~~~~~~~~
+
+Configuring a zone involves specifying a series of Ceph Object Gateway
+pools. For consistency, we recommend using a pool prefix that is the
+same as the zone name. See
+`Pools <http://docs.ceph.com/en/latest/rados/operations/pools/#pools>`__
+for details of configuring pools.
+
+To set a zone, create a JSON object consisting of the pools, save the
+object to a file (e.g., ``zone.json``); then, run the following
+command, replacing ``{zone-name}`` with the name of the zone:
+
+.. prompt:: bash #
+
+ radosgw-admin zone set --rgw-zone={zone-name} --infile zone.json
+
+Where ``zone.json`` is the JSON file you created.
+
+Then, as ``root``, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Renaming a Zone
+~~~~~~~~~~~~~~~
+
+To rename a zone, specify the zone name and the new zone name.
+
+.. prompt:: bash #
+
+ radosgw-admin zone rename --rgw-zone=<name> --zone-new-name=<name>
+
+Then, update the period:
+
+.. prompt:: bash #
+
+ radosgw-admin period update --commit
+
+Zonegroup and Zone Settings
+----------------------------
+
+When configuring a default zonegroup and zone, the pool name includes
+the zone name. For example:
+
+- ``default.rgw.control``
+
+To change the defaults, include the following settings in your Ceph
+configuration file under each ``[client.radosgw.{instance-name}]``
+instance.
+
++-------------------------------------+-----------------------------------+---------+-----------------------+
+| Name | Description | Type | Default |
++=====================================+===================================+=========+=======================+
+| ``rgw_zone`` | The name of the zone for the | String | None |
+| | gateway instance. | | |
++-------------------------------------+-----------------------------------+---------+-----------------------+
+| ``rgw_zonegroup`` | The name of the zonegroup for | String | None |
+| | the gateway instance. | | |
++-------------------------------------+-----------------------------------+---------+-----------------------+
+| ``rgw_zonegroup_root_pool`` | The root pool for the zonegroup. | String | ``.rgw.root`` |
++-------------------------------------+-----------------------------------+---------+-----------------------+
+| ``rgw_zone_root_pool`` | The root pool for the zone. | String | ``.rgw.root`` |
++-------------------------------------+-----------------------------------+---------+-----------------------+
+| ``rgw_default_zone_group_info_oid`` | The OID for storing the default | String | ``default.zonegroup`` |
+| | zonegroup. We do not recommend | | |
+| | changing this setting. | | |
++-------------------------------------+-----------------------------------+---------+-----------------------+
+
+
+Zone Features
+=============
+
+Some multisite features require support from all zones before they can be
+enabled. Each zone lists its ``supported_features``, and each zonegroup lists
+its ``enabled_features``. Before a feature can be enabled in the zonegroup, it
+must be supported by all of its zones.
+
+On creation of new zones and zonegroups, all known features are supported and
+some features (see the table below) are enabled by default. After upgrading an
+existing multisite configuration, however, new features must be enabled
+manually.
+
+Supported Features
+------------------
+
++-----------------------------------+---------+----------+
+| Feature | Release | Default |
++===================================+=========+==========+
+| :ref:`feature_resharding` | Reef | Enabled |
++-----------------------------------+---------+----------+
+| :ref:`feature_compress_encrypted` | Reef | Disabled |
++-----------------------------------+---------+----------+
+
+.. _feature_resharding:
+
+resharding
+~~~~~~~~~~
+
+This feature allows buckets to be resharded in a multisite configuration
+without interrupting the replication of their objects. When
+``rgw_dynamic_resharding`` is enabled, it runs on each zone independently, and
+zones may choose different shard counts for the same bucket. When buckets are
+resharded manually with ``radosgw-admin bucket reshard``, only that zone's
+bucket is modified. A zone feature should only be marked as supported after all
+of its RGWs and OSDs have upgraded.
+
+.. note:: Dynamic resharding is not supported in multisite deployments prior to
+ the Reef release.
+
+
+.. _feature_compress_encrypted:
+
+compress-encrypted
+~~~~~~~~~~~~~~~~~~
+
+This feature enables support for combining `Server-Side Encryption`_ and
+`Compression`_ on the same object. Object data gets compressed before encryption.
+Prior to Reef, multisite would not replicate such objects correctly, so all zones
+must upgrade to Reef or later before enabling.
+
+.. warning:: The compression ratio may leak information about the encrypted data,
+ and allow attackers to distinguish whether two same-sized objects might contain
+ the same data. Due to these security considerations, this feature is disabled
+ by default.
+
+Commands
+--------
+
+Add support for a zone feature
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On the cluster that contains the given zone:
+
+.. prompt:: bash $
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --enable-feature={feature-name}
+ radosgw-admin period update --commit
+
+
+Remove support for a zone feature
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On the cluster that contains the given zone:
+
+.. prompt:: bash $
+
+ radosgw-admin zone modify --rgw-zone={zone-name} --disable-feature={feature-name}
+ radosgw-admin period update --commit
+
+Enable a zonegroup feature
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On any cluster in the realm:
+
+.. prompt:: bash $
+
+ radosgw-admin zonegroup modify --rgw-zonegroup={zonegroup-name} --enable-feature={feature-name}
+ radosgw-admin period update --commit
+
+Disable a zonegroup feature
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+On any cluster in the realm:
+
+.. prompt:: bash $
+
+ radosgw-admin zonegroup modify --rgw-zonegroup={zonegroup-name} --disable-feature={feature-name}
+ radosgw-admin period update --commit
+
+
+.. _`Pools`: ../pools
+.. _`Sync Policy Config`: ../multisite-sync-policy
+.. _`Server-Side Encryption`: ../encryption
+.. _`Compression`: ../compression
diff --git a/doc/radosgw/multitenancy.rst b/doc/radosgw/multitenancy.rst
new file mode 100644
index 000000000..09f5071c1
--- /dev/null
+++ b/doc/radosgw/multitenancy.rst
@@ -0,0 +1,169 @@
+.. _rgw-multitenancy:
+
+=================
+RGW Multi-tenancy
+=================
+
+.. versionadded:: Jewel
+
+The multi-tenancy feature allows buckets and users of the same name to be
+used simultaneously by segregating them under so-called ``tenants``.
+This may be useful, for instance, to permit users of the Swift API to
+create buckets with easily conflicting names such as "test" or "trove".
+
+From the Jewel release onward, each user and bucket lies under a tenant.
+For compatibility, a "legacy" tenant with an empty name is provided.
+Whenever a bucket is referred to without an explicit tenant, an implicit
+tenant is used, taken from the user performing the operation. Since
+the pre-existing users are under the legacy tenant, they continue
+to create and access buckets as before. The layout of objects in RADOS
+is extended in a compatible way, ensuring a smooth upgrade to Jewel.
+
+Administering Users With Explicit Tenants
+=========================================
+
+Tenants as such do not have any operations on them. They appear and
+disappear as needed, when users are administered. In order to create,
+modify, and remove users with explicit tenants, either the additional
+option ``--tenant`` is supplied, or the syntax ``'<tenant>$<user>'`` is used
+in the parameters of the ``radosgw-admin`` command.
+
+Examples
+--------
+
+Create a user testx$tester to be accessed with S3::
+
+ # radosgw-admin --tenant testx --uid tester --display-name "Test User" --access_key TESTER --secret test123 user create
+
+Create a user testx$tester to be accessed with Swift::
+
+ # radosgw-admin --tenant testx --uid tester --display-name "Test User" --subuser tester:test --key-type swift --access full user create
+   # radosgw-admin key create --subuser 'testx$tester:test' --key-type swift --secret test123
+
+.. note:: The subuser with explicit tenant has to be quoted in the shell.
+
+ Tenant names may contain only alphanumeric characters and underscores.
+
+Accessing Buckets with Explicit Tenants
+=======================================
+
+When a client application accesses buckets, it always operates with
+credentials of a particular user. As mentioned above, every user belongs
+to a tenant. Therefore, every operation has an implicit tenant in its
+context, to be used if no tenant is specified explicitly. Thus a complete
+compatibility is maintained with previous releases, as long as the
+referred buckets and referring user belong to the same tenant.
+In other words, anything unusual occurs when accessing another tenant's
+buckets *only*.
+
+Extensions employed to specify an explicit tenant differ according
+to the protocol and authentication system used.
+
+S3
+--
+
+In case of S3, a colon character is used to separate tenant and bucket.
+Thus a sample URL would be::
+
+ https://ep.host.dom/tenant:bucket
+
+Here's a simple Python sample:
+
+.. code-block:: python
+ :linenos:
+
+ from boto.s3.connection import S3Connection, OrdinaryCallingFormat
+ c = S3Connection(
+ aws_access_key_id="TESTER",
+ aws_secret_access_key="test123",
+ host="ep.host.dom",
+ calling_format = OrdinaryCallingFormat())
+ bucket = c.get_bucket("test5b:testbucket")
+
+Note that it's not possible to supply an explicit tenant using
+a hostname. Hostnames cannot contain colons, or any other separators
+that are not already valid in bucket names. Using a period creates an
+ambiguous syntax. Therefore, the bucket-in-URL-path format has to be
+used.
+
+Because the native S3 API does not deal with multi-tenancy while
+radosgw's implementation does, things get a bit involved when
+dealing with signed URLs and public read ACLs.
+
+* A **signed URL** does contain the ``AWSAccessKeyId`` query
+ parameters, from which radosgw is able to discern the correct user
+ and tenant owning the bucket. In other words, an application
+ generating signed URLs should be able to take just the un-prefixed
+ bucket name, and produce a signed URL that itself contains the
+ bucket name without the tenant prefix. However, it is *possible* to
+ include the prefix if you so choose.
+
+ Thus, accessing a signed URL of an object ``bar`` in a container
+ ``foo`` belonging to the tenant ``7188e165c0ae4424ac68ae2e89a05c50``
+ would be possible either via
+ ``http://<host>:<port>/foo/bar?AWSAccessKeyId=b200fb6634c547199e436a0f93c0c46e&Expires=1542890806&Signature=eok6CYQC%2FDwmQQmqvY5jTg6ehXU%3D``,
+ or via
+ ``http://<host>:<port>/7188e165c0ae4424ac68ae2e89a05c50:foo/bar?AWSAccessKeyId=b200fb6634c547199e436a0f93c0c46e&Expires=1542890806&Signature=eok6CYQC%2FDwmQQmqvY5jTg6ehXU%3D``,
+ depending on whether or not the tenant prefix was passed in on
+ signature generation.
+
+* A bucket with a **public read ACL** is meant to be read by an HTTP
+ client *without* including any query parameters that would allow
+ radosgw to discern tenants. Thus, publicly readable objects must
+ always be accessed using the bucket name with the tenant prefix.
+
+ Thus, if you set a public read ACL on an object ``bar`` in a
+ container ``foo`` belonging to the tenant
+ ``7188e165c0ae4424ac68ae2e89a05c50``, you would need to access that
+ object via the public URL
+ ``http://<host>:<port>/7188e165c0ae4424ac68ae2e89a05c50:foo/bar``.
+
+Swift with built-in authenticator
+---------------------------------
+
+TBD -- not in test_multen.py yet
+
+Swift with Keystone
+-------------------
+
+In the default configuration, although native Swift has inherent
+multi-tenancy, radosgw does not enable multi-tenancy for the Swift
+API. This ensures that a setup with legacy buckets --- that is,
+buckets that were created before radosgw supported multitenancy ---
+retains its dual-API capability to be queried and modified using
+either S3 or Swift.
+
+If you want to enable multitenancy for Swift, particularly if your
+users only ever authenticate against OpenStack Keystone, you should
+enable Keystone-based multitenancy with the following ``ceph.conf``
+configuration option::
+
+ rgw keystone implicit tenants = true
+
+Once you enable this option, any newly connecting user (whether they
+are using the Swift API, or Keystone-authenticated S3) will prompt
+radosgw to create a user named ``<tenant_id>$<tenant_id>``, where
+``<tenant_id>`` is a Keystone tenant (project) UUID --- for example,
+``7188e165c0ae4424ac68ae2e89a05c50$7188e165c0ae4424ac68ae2e89a05c50``.
+
+Whenever that user then creates a Swift container, radosgw internally
+translates the given container name into
+``<tenant_id>/<container_name>``, such as
+``7188e165c0ae4424ac68ae2e89a05c50/foo``. This ensures that if there
+are two or more different tenants all creating a container named
+``foo``, radosgw is able to transparently discern them by their tenant
+prefix.
+
+It is also possible to limit the effects of implicit tenants so that they
+apply only to Swift or only to S3, by setting
+``rgw keystone implicit tenants`` to either ``s3`` or ``swift``. This will
+likely be of use primarily to users who had previously used implicit tenants
+with older versions of Ceph, in which implicit tenants applied only to the
+Swift protocol.
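+
+For example, an illustrative ``ceph.conf`` snippet that limits implicit
+tenants to the Swift protocol::
+
+    rgw keystone implicit tenants = swift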
+
+Notes and known issues
+----------------------
+
+To be clear, it is not currently possible to create buckets in other
+tenants. The owner of a newly created bucket is derived from the
+authentication information.
diff --git a/doc/radosgw/nfs.rst b/doc/radosgw/nfs.rst
new file mode 100644
index 000000000..373765e10
--- /dev/null
+++ b/doc/radosgw/nfs.rst
@@ -0,0 +1,375 @@
+===
+NFS
+===
+
+.. versionadded:: Jewel
+
+.. note:: Only the NFSv4 protocol is supported when using a cephadm or rook based deployment.
+
+Ceph Object Gateway namespaces can be exported over the file-based
+NFSv4 protocols, alongside traditional HTTP access
+protocols (S3 and Swift).
+
+In particular, the Ceph Object Gateway can now be configured to
+provide file-based access when embedded in the NFS-Ganesha NFS server.
+
+The simplest and preferred way of managing nfs-ganesha clusters and rgw exports
+is using ``ceph nfs ...`` commands. See :doc:`/mgr/nfs` for more details.
+
+librgw
+======
+
+The librgw.so shared library (Unix) provides a loadable interface to
+Ceph Object Gateway services, and instantiates a full Ceph Object Gateway
+instance on initialization.
+
+In turn, librgw.so exports rgw_file, a stateful API for file-oriented
+access to RGW buckets and objects. The API is general, but its design
+is strongly influenced by the File System Abstraction Layer (FSAL) API
+of NFS-Ganesha, for which it has been primarily designed.
+
+A set of Python bindings is also provided.
+
+Namespace Conventions
+=====================
+
+The implementation conforms to Amazon Web Services (AWS) hierarchical
+namespace conventions which map UNIX-style path names onto S3 buckets
+and objects.
+
+The top level of the attached namespace consists of S3 buckets,
+represented as NFS directories. Files and directories subordinate to
+buckets are each represented as objects, following S3 prefix and
+delimiter conventions, with '/' being the only supported path
+delimiter [#]_.
+
+For example, if an NFS client has mounted an RGW namespace at "/nfs",
+then a file "/nfs/mybucket/www/index.html" in the NFS namespace
+corresponds to an RGW object "www/index.html" in a bucket/container
+"mybucket."
+
+Although it is generally invisible to clients, the NFS namespace is
+assembled through concatenation of the corresponding paths implied by
+the objects in the namespace. Leaf objects, whether files or
+directories, will always be materialized in an RGW object of the
+corresponding key name, "<name>" if a file, "<name>/" if a directory.
+Non-leaf directories (e.g., "www" above) might only be implied by
+their appearance in the names of one or more leaf objects. Directories
+created within NFS or directly operated on by an NFS client (e.g., via
+an attribute-setting operation such as chown or chmod) always have a
+leaf object representation used to store materialized attributes such
+as Unix ownership and permissions.
+
+Supported Operations
+====================
+
+The RGW NFS interface supports most operations on files and
+directories, with the following restrictions:
+
+- Links, including symlinks, are not supported.
+- NFS ACLs are not supported.
+
+ + Unix user and group ownership and permissions *are* supported.
+
+- Directories may not be moved/renamed.
+
+ + Files may be moved between directories.
+
+- Only full, sequential *write* I/O is supported
+
+ + i.e., write operations are constrained to be **uploads**.
+ + Many typical I/O operations such as editing files in place will necessarily fail as they perform non-sequential stores.
+ + Some file utilities *apparently* writing sequentially (e.g., some versions of GNU tar) may fail due to infrequent non-sequential stores.
+ + When mounting via NFS, sequential application I/O can generally be constrained to be written sequentially to the NFS server via a synchronous mount option (e.g. -osync in Linux).
+ + NFS clients which cannot mount synchronously (e.g., MS Windows) will not be able to upload files.
+
+Security
+========
+
+The RGW NFS interface provides a hybrid security model with the
+following characteristics:
+
+- NFS protocol security is provided by the NFS-Ganesha server, as negotiated by the NFS server and clients
+
+  + e.g., clients can be trusted (AUTH_SYS), or required to present Kerberos user credentials (RPCSEC_GSS)
+ + RPCSEC_GSS wire security can be integrity only (krb5i) or integrity and privacy (encryption, krb5p)
+ + various NFS-specific security and permission rules are available
+
+ * e.g., root-squashing
+
+- a set of RGW/S3 security credentials (unknown to NFS) is associated with each RGW NFS mount (i.e., NFS-Ganesha EXPORT)
+
+ + all RGW object operations performed via the NFS server will be performed by the RGW user associated with the credentials stored in the export being accessed (currently only RGW and RGW LDAP credentials are supported)
+
+ * additional RGW authentication types such as Keystone are not currently supported
+
+Manually configuring an NFS-Ganesha Instance
+============================================
+
+Each NFS RGW instance is an NFS-Ganesha server instance *embedding*
+a full Ceph RGW instance.
+
+Therefore, the RGW NFS configuration includes Ceph and Ceph Object
+Gateway-specific configuration in a local ceph.conf, as well as
+NFS-Ganesha-specific configuration in the NFS-Ganesha config file,
+ganesha.conf.
+
+ceph.conf
+---------
+
+Required ceph.conf configuration for RGW NFS includes:
+
+* valid [client.rgw.{instance-name}] section
+* valid values for minimal instance configuration, in particular, an installed and correct ``keyring``
+
+Other config variables (e.g., ``rgw data`` and ``rgw backend store``) are
+optional.
+
+A small number of config variables (e.g., ``rgw_nfs_namespace_expire_secs``)
+are unique to RGW NFS.
+
+In particular, front-end selection is handled specially by the librgw.so runtime. By default, only the
+``rgw-nfs`` frontend is started. Additional frontends (e.g., ``beast``) are enabled via the
+``rgw nfs frontends`` config option. Its syntax is identical to the ordinary ``rgw frontends`` option.
+Default options for non-default frontends are specified via ``rgw frontend defaults`` as normal.
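+
+A minimal illustrative ceph.conf fragment satisfying these requirements (the
+instance name and keyring path are assumptions, not defaults)::
+
+    [client.rgw.nfsgw]
+    keyring = /etc/ceph/ceph.client.rgw.nfsgw.keyring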
+
+ganesha.conf
+------------
+
+A strictly minimal ganesha.conf for use with RGW NFS includes one
+EXPORT block with embedded FSAL block of type RGW::
+
+ EXPORT
+ {
+ Export_ID={numeric-id};
+ Path = "/";
+ Pseudo = "/";
+ Access_Type = RW;
+ SecType = "sys";
+ NFS_Protocols = 4;
+ Transport_Protocols = TCP;
+
+ # optional, permit unsquashed access by client "root" user
+ #Squash = No_Root_Squash;
+
+ FSAL {
+ Name = RGW;
+ User_Id = {s3-user-id};
+ Access_Key_Id ="{s3-access-key}";
+ Secret_Access_Key = "{s3-secret}";
+ }
+ }
+
+``Export_ID`` must have an integer value, e.g., "77"
+
+``Path`` (for RGW) should be "/"
+
+``Pseudo`` defines an NFSv4 pseudo root name (NFSv4 only)
+
+``SecType = sys;`` allows clients to attach without Kerberos
+authentication
+
+``Squash = No_Root_Squash;`` enables the client root user to override
+permissions (Unix convention). When root-squashing is enabled,
+operations attempted by the root user are performed as if by the local
+"nobody" (and "nogroup") user on the NFS-Ganesha server
+
+The RGW FSAL additionally supports RGW-specific configuration
+variables in the RGW config section::
+
+ RGW {
+ cluster = "{cluster name, default 'ceph'}";
+ name = "client.rgw.{instance-name}";
+ ceph_conf = "/opt/ceph-rgw/etc/ceph/ceph.conf";
+ init_args = "-d --debug-rgw=16";
+ }
+
+``cluster`` sets a Ceph cluster name (must match the cluster being exported)
+
+``name`` sets an RGW instance name (must match the instance being exported)
+
+``ceph_conf`` gives a path to a non-default ceph.conf file to use
+
+
+Other useful NFS-Ganesha configuration:
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Any EXPORT block which should support NFSv3 should include version 3
+in the NFS_Protocols setting. Additionally, NFSv3 is the last major
+version to support the UDP transport. To enable UDP, include it in the
+Transport_Protocols setting. For example::
+
+ EXPORT {
+ ...
+ NFS_Protocols = 3,4;
+ Transport_Protocols = UDP,TCP;
+ ...
+ }
+
+One important family of options pertains to interaction with the Linux
+idmapping service, which is used to normalize user and group names
+across systems. Details of idmapper integration are not provided here.
+
+With Linux NFS clients, NFS-Ganesha can be configured
+to accept client-supplied numeric user and group identifiers with
+NFSv4, which it stringifies by default. This may be useful in small
+setups and for experimentation::
+
+ NFSV4 {
+ Allow_Numeric_Owners = true;
+ Only_Numeric_Owners = true;
+ }
+
+Troubleshooting
+~~~~~~~~~~~~~~~
+
+NFS-Ganesha configuration problems are usually debugged by running the
+server with debugging options, controlled by the LOG config section.
+
+NFS-Ganesha log messages are grouped into various components, and logging
+can be enabled separately for each component. Valid values for
+component logging include::
+
+ *FATAL* critical errors only
+ *WARN* unusual condition
+ *DEBUG* mildly verbose trace output
+ *FULL_DEBUG* verbose trace output
+
+Example::
+
+ LOG {
+
+ Components {
+ MEMLEAKS = FATAL;
+ FSAL = FATAL;
+ NFSPROTO = FATAL;
+ NFS_V4 = FATAL;
+ EXPORT = FATAL;
+ FILEHANDLE = FATAL;
+ DISPATCH = FATAL;
+ CACHE_INODE = FATAL;
+ CACHE_INODE_LRU = FATAL;
+ HASHTABLE = FATAL;
+ HASHTABLE_CACHE = FATAL;
+ DUPREQ = FATAL;
+ INIT = DEBUG;
+ MAIN = DEBUG;
+ IDMAPPER = FATAL;
+ NFS_READDIR = FATAL;
+ NFS_V4_LOCK = FATAL;
+ CONFIG = FATAL;
+ CLIENTID = FATAL;
+ SESSIONS = FATAL;
+ PNFS = FATAL;
+ RW_LOCK = FATAL;
+ NLM = FATAL;
+ RPC = FATAL;
+ NFS_CB = FATAL;
+ THREAD = FATAL;
+ NFS_V4_ACL = FATAL;
+ STATE = FATAL;
+ FSAL_UP = FATAL;
+ DBUS = FATAL;
+ }
+ # optional: redirect log output
+ # Facility {
+ # name = FILE;
+ # destination = "/tmp/ganesha-rgw.log";
+ # enable = active;
+    # }
+ }
+
+Running Multiple NFS Gateways
+=============================
+
+Each NFS-Ganesha instance acts as a full gateway endpoint, with the
+limitation that currently an NFS-Ganesha instance cannot be configured
+to export HTTP services. As with ordinary gateway instances, any
+number of NFS-Ganesha instances can be started, exporting the same or
+different resources from the cluster. This enables the clustering of
+NFS-Ganesha instances. However, this does not imply high availability.
+
+When regular gateway instances and NFS-Ganesha instances export the
+same data resources, those resources are accessible from both the
+standard S3 API and through the NFS-Ganesha instance. You can
+co-locate the NFS-Ganesha instance with a Ceph Object Gateway instance
+on the same host.
+
+RGW vs RGW NFS
+==============
+
+Exporting an NFS namespace and other RGW namespaces (e.g., S3 or Swift
+via the Civetweb HTTP front-end) from the same program instance is
+currently not supported.
+
+When adding objects and buckets outside of NFS, those objects will
+appear in the NFS namespace in the time set by
+``rgw_nfs_namespace_expire_secs``, which defaults to 300 seconds (5 minutes).
+Override the default value for ``rgw_nfs_namespace_expire_secs`` in the
+Ceph configuration file to change the refresh rate.
+
+If exporting Swift containers that do not conform to valid S3 bucket
+naming requirements, set ``rgw_relaxed_s3_bucket_names`` to true in the
+[client.rgw] section of the Ceph configuration file. For example,
+if a Swift container name contains underscores, it is not a valid S3
+bucket name and will be rejected unless ``rgw_relaxed_s3_bucket_names``
+is set to true.
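+
+For example, a minimal sketch of such an override (the section name follows
+the ``[client.rgw]`` convention mentioned above)::
+
+    [client.rgw]
+    rgw_relaxed_s3_bucket_names = true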
+
+Configuring NFSv4 clients
+=========================
+
+To access the namespace, mount the configured NFS-Ganesha export(s)
+into desired locations in the local POSIX namespace. As noted, this
+implementation has a few unique restrictions:
+
+- NFS 4.1 and higher protocol flavors are preferred
+
+ + NFSv4 OPEN and CLOSE operations are used to track upload transactions
+
+- To upload data successfully, clients must preserve write ordering
+
+  + on Linux and many Unix NFS clients, use the ``-o sync`` mount option
+
+Conventions for mounting NFS resources are platform-specific. The
+following conventions work on Linux and some Unix platforms:
+
+From the command line::
+
+ mount -t nfs -o nfsvers=4.1,noauto,soft,sync,proto=tcp <ganesha-host-name>:/ <mount-point>
+
+In /etc/fstab::
+
+    <ganesha-host-name>:/ <mount-point> nfs noauto,soft,nfsvers=4.1,sync,proto=tcp 0 0
+
+Specify the NFS-Ganesha host name and the path to the mount point on
+the client.
+
+Configuring NFSv3 Clients
+=========================
+
+Linux clients can be configured to mount with NFSv3 by supplying
+``nfsvers=3`` and ``noacl`` as mount options. To use UDP as the
+transport, add ``proto=udp`` to the mount options. However, TCP is the
+preferred transport::
+
+ <ganesha-host-name>:/ <mount-point> nfs noauto,noacl,soft,nfsvers=3,sync,proto=tcp 0 0
+
+If the mount will use NFSv3 with UDP, configure the NFS-Ganesha EXPORT block's
+Protocols setting with version 3 and the Transports setting with UDP.
+
+NFSv3 Semantics
+---------------
+
+Since NFSv3 does not communicate client OPEN and CLOSE operations to
+file servers, RGW NFS cannot use these operations to mark the
+beginning and ending of file upload transactions. Instead, RGW NFS
+starts a new upload when the first write is sent to a file at offset
+0, and finalizes the upload when no new writes to the file have been
+seen for a period of time, by default, 10 seconds. To change this
+timeout, set an alternate value for ``rgw_nfs_write_completion_interval_s``
+in the RGW section(s) of the Ceph configuration file.
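+
+For example, a sketch that raises the timeout to 20 seconds (the section name
+``[client.rgw]`` is illustrative; use the section that matches your gateway
+instance)::
+
+    [client.rgw]
+    rgw_nfs_write_completion_interval_s = 20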
+
+References
+==========
+
+.. [#] http://docs.aws.amazon.com/AmazonS3/latest/dev/ListingKeysHierarchy.html
diff --git a/doc/radosgw/notifications.rst b/doc/radosgw/notifications.rst
new file mode 100644
index 000000000..1d18772b2
--- /dev/null
+++ b/doc/radosgw/notifications.rst
@@ -0,0 +1,547 @@
+====================
+Bucket Notifications
+====================
+
+.. versionadded:: Nautilus
+
+.. contents::
+
+Bucket notifications provide a mechanism for sending information out of radosgw
+when certain events happen on the bucket. Notifications can be sent to HTTP
+endpoints, AMQP0.9.1 endpoints, and Kafka endpoints.
+
+A user can create topics. A topic entity is defined by its name and is "per
+tenant". A user can associate its topics (via notification configuration) only
+with buckets it owns.
+
+A notification entity must be created in order to send event notifications for
+a specific bucket. A notification entity can be created either for a subset
+of event types or for all event types (which is the default). The
+notification may also filter out events based on matches of the prefixes and
+suffixes of (1) the keys, (2) the metadata attributes attached to the object,
+or (3) the object tags. Regular-expression matching can also be used on these
+to create filters. There can be multiple notifications for any specific topic,
+and the same topic can be used for multiple notifications.
+
+A REST API is defined to provide configuration and control interfaces
+for the bucket notification mechanism.
+
+.. toctree::
+ :maxdepth: 1
+
+ S3 Bucket Notification Compatibility <s3-notification-compatibility>
+
+.. note:: To enable bucket notifications API, the `rgw_enable_apis` configuration parameter should contain: "notifications".
+
+Notification Reliability
+------------------------
+
+Notifications can be sent synchronously or asynchronously. This section
+describes the latency and reliability that you should expect for synchronous
+and asynchronous notifications.
+
+Synchronous Notifications
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notifications can be sent synchronously, as part of the operation that
+triggered them. In this mode, the operation is acknowledged (acked) only after
+the notification is sent to the topic's configured endpoint. This means that
+the round trip time of the notification (the time it takes to send the
+notification to the topic's endpoint plus the time it takes to receive the
+acknowledgement) is added to the latency of the operation itself.
+
+.. note:: The original triggering operation is considered successful even if
+ the notification fails with an error, cannot be delivered, or times out.
+
+Asynchronous Notifications
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Notifications can be sent asynchronously. They are committed into persistent
+storage and then asynchronously sent to the topic's configured endpoint. In
+this case, the only latency added to the original operation is the latency
+added when the notification is committed to persistent storage.
+
+.. note:: If the notification fails with an error, cannot be delivered, or
+ times out, it is retried until it is successfully acknowledged.
+
+.. tip:: To minimize the latency added by asynchronous notification, we
+   recommend placing the "log" pool on fast media.
+
+
+Topic Management via CLI
+------------------------
+
+Fetch the configuration of all topics associated with tenants by running the
+following command:
+
+.. prompt:: bash #
+
+ radosgw-admin topic list [--tenant={tenant}]
+
+
+Fetch the configuration of a specific topic by running the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin topic get --topic={topic-name} [--tenant={tenant}]
+
+
+Remove a topic by running the following command:
+
+.. prompt:: bash #
+
+ radosgw-admin topic rm --topic={topic-name} [--tenant={tenant}]
+
+
+Notification Performance Statistics
+-----------------------------------
+
+- ``pubsub_event_triggered``: a running counter of events that have at least one topic associated with them
+- ``pubsub_event_lost``: a running counter of events that had topics associated with them, but that were not pushed to any of the endpoints
+- ``pubsub_push_ok``: a running counter, for all notifications, of events successfully pushed to their endpoints
+- ``pubsub_push_fail``: a running counter, for all notifications, of events that failed to be pushed to their endpoints
+- ``pubsub_push_pending``: the gauge value of events pushed to an endpoint but not acked or nacked yet
+
+.. note::
+
+ ``pubsub_event_triggered`` and ``pubsub_event_lost`` are incremented per
+ event on each notification, but ``pubsub_push_ok`` and ``pubsub_push_fail``
+ are incremented per push action on each notification.
+
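+These counters can be read from the radosgw admin socket with ``ceph daemon``.
+For example (a sketch; the socket path depends on the instance name):
+
+.. prompt:: bash #
+
+   ceph daemon /var/run/ceph/ceph-client.rgw.<instance>.asok perf dump | grep pubsub
+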
+Bucket Notification REST API
+----------------------------
+
+Topics
+~~~~~~
+
+.. note::
+
+ In all topic actions, the parameters are URL-encoded and sent in the
+ message body using this content type:
+ ``application/x-www-form-urlencoded``.
+
+
+.. _Create a Topic:
+
+Create a Topic
+``````````````
+
+This creates a new topic. Provide the topic with push endpoint parameters,
+which will be used later when a notification is created. A response is
+generated. A successful response includes the topic's `ARN
+<https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html>`_
+(the "Amazon Resource Name", a unique identifier used to reference the topic).
+To update a topic, use the same command that you used to create it (but when
+updating, use the name of an existing topic and different endpoint values).
+
+.. tip:: Any notification already associated with the topic must be re-created
+ in order for the topic to update.
+
+::
+
+ POST
+
+ Action=CreateTopic
+ &Name=<topic-name>
+ [&Attributes.entry.1.key=amqp-exchange&Attributes.entry.1.value=<exchange>]
+ [&Attributes.entry.2.key=amqp-ack-level&Attributes.entry.2.value=none|broker|routable]
+ [&Attributes.entry.3.key=verify-ssl&Attributes.entry.3.value=true|false]
+ [&Attributes.entry.4.key=kafka-ack-level&Attributes.entry.4.value=none|broker]
+ [&Attributes.entry.5.key=use-ssl&Attributes.entry.5.value=true|false]
+ [&Attributes.entry.6.key=ca-location&Attributes.entry.6.value=<file path>]
+ [&Attributes.entry.7.key=OpaqueData&Attributes.entry.7.value=<opaque data>]
+ [&Attributes.entry.8.key=push-endpoint&Attributes.entry.8.value=<endpoint>]
+ [&Attributes.entry.9.key=persistent&Attributes.entry.9.value=true|false]
+ [&Attributes.entry.10.key=cloudevents&Attributes.entry.10.value=true|false]
+ [&Attributes.entry.11.key=mechanism&Attributes.entry.11.value=<mechanism>]
+
+Request parameters:
+
+- push-endpoint: This is the URI of an endpoint to send push notifications to.
+- OpaqueData: Opaque data is set in the topic configuration and added to all
+ notifications that are triggered by the topic.
+- persistent: This indicates whether notifications to this endpoint are
+ persistent (=asynchronous) or not persistent. (This is "false" by default.)
+
+- HTTP endpoint
+
+  - URI: ``http[s]://<fqdn>[:<port>]``
+ - port: This defaults to 80 for HTTP and 443 for HTTPS.
+ - verify-ssl: This indicates whether the server certificate is validated by
+ the client. (This is "true" by default.)
+ - cloudevents: This indicates whether the HTTP header should contain
+ attributes according to the `S3 CloudEvents Spec`_. (This is "false" by
+ default.)
+
+- AMQP0.9.1 endpoint
+
+ - URI: ``amqp[s]://[<user>:<password>@]<fqdn>[:<port>][/<vhost>]``
+  - user/password: This defaults to "guest/guest" and must be provided only
+    over HTTPS; topic creation requests will otherwise be rejected.
+ - port: This defaults to 5672 for unencrypted connections and 5671 for
+ SSL-encrypted connections.
+ - vhost: This defaults to "/".
+ - verify-ssl: This indicates whether the server certificate is validated by
+ the client. (This is "true" by default.)
+ - If ``ca-location`` is provided and a secure connection is used, the
+ specified CA will be used to authenticate the broker. The default CA will
+ not be used.
+ - amqp-exchange: The exchanges must exist and must be able to route messages
+ based on topics. This parameter is mandatory.
+ - amqp-ack-level: No end2end acking is required. Messages may persist in the
+ broker before being delivered to their final destinations. Three ack methods
+ exist:
+
+ - "none": The message is considered "delivered" if it is sent to the broker.
+ - "broker": The message is considered "delivered" if it is acked by the broker (default).
+ - "routable": The message is considered "delivered" if the broker can route to a consumer.
+
+.. tip:: The topic-name (see :ref:`Create a Topic`) is used for the
+ AMQP topic ("routing key" for a topic exchange).
+
+- Kafka endpoint
+
+  - URI: ``kafka://[<user>:<password>@]<fqdn>[:<port>]``
+ - ``use-ssl``: If this is set to "true", a secure connection is used to
+ connect to the broker. (This is "false" by default.)
+ - ``ca-location``: If this is provided and a secure connection is used, the
+ specified CA will be used instead of the default CA to authenticate the
+ broker.
+  - user/password may be provided over HTTPS. If not, the config parameter
+    `rgw_allow_notification_secrets_in_cleartext` must be `true` in order to
+    create the topic.
+  - user/password may be provided along with ``use-ssl``. The broker
+    credentials will otherwise be sent over insecure transport.
+ - ``mechanism`` may be provided together with user/password (default: ``PLAIN``).
+ The supported SASL mechanisms are:
+
+ - PLAIN
+ - SCRAM-SHA-256
+ - SCRAM-SHA-512
+ - GSSAPI
+ - OAUTHBEARER
+
+ - port: This defaults to 9092.
+ - kafka-ack-level: No end2end acking is required. Messages may persist in the
+ broker before being delivered to their final destinations. Two ack methods
+ exist:
+
+ - "none": Messages are considered "delivered" if sent to the broker.
+ - "broker": Messages are considered "delivered" if acked by the broker. (This
+ is the default.)
+
+.. note::
+
+ - The key-value pair of a specific parameter need not reside in the same
+ line as the parameter, and need not appear in any specific order, but it
+ must use the same index.
+ - Attribute indexing need not be sequential and need not start from any
+ specific value.
+ - `AWS Create Topic`_ provides a detailed explanation of the endpoint
+ attributes format. In our case, however, different keys and values are
+ used.
+
+The response has the following format:
+
+::
+
+ <CreateTopicResponse xmlns="https://sns.amazonaws.com/doc/2010-03-31/">
+ <CreateTopicResult>
+ <TopicArn></TopicArn>
+ </CreateTopicResult>
+ <ResponseMetadata>
+ <RequestId></RequestId>
+ </ResponseMetadata>
+ </CreateTopicResponse>
+
+The topic `ARN
+<https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html>`_
+in the response has the following format:
+
+::
+
+ arn:aws:sns:<zone-group>:<tenant>:<topic>
+
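+As an illustration, the same request can be issued through any SDK that speaks
+the SNS API. The following is a minimal sketch using Python and ``boto3``; the
+endpoint URL, credentials, region, and push endpoint are placeholders:
+
+.. code-block:: python
+
+    import boto3
+
+    # SNS-style client pointed at the RGW endpoint (placeholder values).
+    client = boto3.client(
+        'sns',
+        endpoint_url='http://<rgw-host>:8000',
+        aws_access_key_id='<access-key>',
+        aws_secret_access_key='<secret-key>',
+        region_name='us-east-1')
+
+    # Create (or update) a topic with an HTTP push endpoint.
+    response = client.create_topic(
+        Name='mytopic',
+        Attributes={'push-endpoint': 'http://<endpoint-host>:8080'})
+
+    print(response['TopicArn'])
+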
+Get Topic Attributes
+````````````````````
+
+This returns information about a specific topic. This includes push-endpoint
+information, if provided.
+
+::
+
+ POST
+
+ Action=GetTopicAttributes
+ &TopicArn=<topic-arn>
+
+The response has the following format:
+
+::
+
+ <GetTopicAttributesResponse>
+ <GetTopicAttributesResult>
+ <Attributes>
+ <entry>
+ <key>User</key>
+ <value></value>
+ </entry>
+ <entry>
+ <key>Name</key>
+ <value></value>
+ </entry>
+ <entry>
+ <key>EndPoint</key>
+ <value></value>
+ </entry>
+ <entry>
+ <key>TopicArn</key>
+ <value></value>
+ </entry>
+ <entry>
+ <key>OpaqueData</key>
+ <value></value>
+ </entry>
+ </Attributes>
+ </GetTopicAttributesResult>
+ <ResponseMetadata>
+ <RequestId></RequestId>
+ </ResponseMetadata>
+ </GetTopicAttributesResponse>
+
+- User: The name of the user that created the topic.
+- Name: The name of the topic.
+- EndPoint: The JSON-formatted endpoint parameters, including:
+
+  - EndpointAddress: The push-endpoint URL.
+  - EndpointArgs: The push-endpoint args.
+  - EndpointTopic: The topic name to be sent to the endpoint (which can be
+    different from the above topic name).
+  - HasStoredSecret: This is "true" if the endpoint URL contains user/password
+    information. In this case, the request must be made over HTTPS. The "topic
+    get" request will otherwise be rejected.
+  - Persistent: This is "true" if the topic is persistent.
+
+- TopicArn: topic `ARN
+ <https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html>`_.
+- OpaqueData: The opaque data set on the topic.
+
+Get Topic Information
+`````````````````````
+
+This returns information about a specific topic. This includes push-endpoint
+information, if provided. Note that this API is now deprecated in favor of the
+AWS compliant `GetTopicAttributes` API.
+
+::
+
+ POST
+
+ Action=GetTopic
+ &TopicArn=<topic-arn>
+
+The response has the following format:
+
+::
+
+ <GetTopicResponse>
+ <GetTopicResult>
+ <Topic>
+ <User></User>
+ <Name></Name>
+ <EndPoint>
+ <EndpointAddress></EndpointAddress>
+ <EndpointArgs></EndpointArgs>
+ <EndpointTopic></EndpointTopic>
+ <HasStoredSecret></HasStoredSecret>
+ <Persistent></Persistent>
+ </EndPoint>
+ <TopicArn></TopicArn>
+ <OpaqueData></OpaqueData>
+ </Topic>
+ </GetTopicResult>
+ <ResponseMetadata>
+ <RequestId></RequestId>
+ </ResponseMetadata>
+ </GetTopicResponse>
+
+- User: The name of the user that created the topic.
+- Name: The name of the topic.
+- EndpointAddress: The push-endpoint URL.
+- EndpointArgs: The push-endpoint args.
+- EndpointTopic: The topic name to be sent to the endpoint (which can be
+ different than the above topic name).
+- HasStoredSecret: This is "true" if the endpoint URL contains user/password
+ information. In this case, the request must be made over HTTPS. The "topic
+ get" request will otherwise be rejected.
+- Persistent: "true" if topic is persistent.
+- TopicArn: topic `ARN
+ <https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html>`_.
+- OpaqueData: the opaque data set on the topic.
+
+Delete Topic
+````````````
+
+::
+
+ POST
+
+ Action=DeleteTopic
+ &TopicArn=<topic-arn>
+
+This deletes the specified topic.
+
+.. note::
+
+ - Deleting an unknown notification (for example, double delete) is not
+ considered an error.
+ - Deleting a topic does not automatically delete all notifications associated
+ with it.
+
+The response has the following format:
+
+::
+
+ <DeleteTopicResponse xmlns="https://sns.amazonaws.com/doc/2010-03-31/">
+ <ResponseMetadata>
+ <RequestId></RequestId>
+ </ResponseMetadata>
+ </DeleteTopicResponse>
+
+List Topics
+```````````
+
+List all topics associated with a tenant.
+
+::
+
+ POST
+
+ Action=ListTopics
+
+The response has the following format:
+
+::
+
+ <ListTopicsResponse xmlns="https://sns.amazonaws.com/doc/2010-03-31/">
+ <ListTopicsResult>
+ <Topics>
+ <member>
+ <User></User>
+ <Name></Name>
+ <EndPoint>
+ <EndpointAddress></EndpointAddress>
+ <EndpointArgs></EndpointArgs>
+ <EndpointTopic></EndpointTopic>
+ </EndPoint>
+ <TopicArn></TopicArn>
+ <OpaqueData></OpaqueData>
+ </member>
+ </Topics>
+ </ListTopicsResult>
+ <ResponseMetadata>
+ <RequestId></RequestId>
+ </ResponseMetadata>
+ </ListTopicsResponse>
+
+- If the endpoint URL contains user/password information in any part of the
+ topic, the request must be made over HTTPS. The "topic list" request will
+ otherwise be rejected.
+
+Notifications
+~~~~~~~~~~~~~
+
+Detailed under: `Bucket Operations`_.
+
+.. note::
+
+   - An "Abort Multipart Upload" request does not emit a notification.
+   - Both "Initiate Multipart Upload" and "POST Object" requests emit an ``s3:ObjectCreated:Post`` notification.
+
+Events
+~~~~~~
+
+Events are in JSON format (regardless of the actual endpoint), and are S3-compatible.
+For example:
+
+::
+
+ {"Records":[
+ {
+ "eventVersion":"2.1",
+ "eventSource":"ceph:s3",
+ "awsRegion":"zonegroup1",
+ "eventTime":"2019-11-22T13:47:35.124724Z",
+ "eventName":"ObjectCreated:Put",
+ "userIdentity":{
+ "principalId":"tester"
+ },
+ "requestParameters":{
+ "sourceIPAddress":""
+ },
+ "responseElements":{
+ "x-amz-request-id":"503a4c37-85eb-47cd-8681-2817e80b4281.5330.903595",
+ "x-amz-id-2":"14d2-zone1-zonegroup1"
+ },
+ "s3":{
+ "s3SchemaVersion":"1.0",
+ "configurationId":"mynotif1",
+ "bucket":{
+ "name":"mybucket1",
+ "ownerIdentity":{
+ "principalId":"tester"
+ },
+ "arn":"arn:aws:s3:zonegroup1::mybucket1",
+ "id":"503a4c37-85eb-47cd-8681-2817e80b4281.5332.38"
+ },
+ "object":{
+ "key":"myimage1.jpg",
+ "size":"1024",
+ "eTag":"37b51d194a7513e45b56f6524f2d51f2",
+ "versionId":"",
+ "sequencer": "F7E6D75DC742D108",
+ "metadata":[],
+ "tags":[]
+ }
+ },
+ "eventId":"",
+ "opaqueData":"me@example.com"
+ }
+ ]}
+
+- awsRegion: The zonegroup.
+- eventTime: The timestamp, indicating when the event was triggered.
+- eventName: For the list of supported events see: `S3 Notification
+ Compatibility`_. Note that eventName values do not start with the `s3:`
+ prefix.
+- userIdentity.principalId: The user that triggered the change.
+- requestParameters.sourceIPAddress: not supported
+- responseElements.x-amz-request-id: The request ID of the original change.
+- responseElements.x_amz_id_2: The RGW on which the change was made.
+- s3.configurationId: The notification ID that created the event.
+- s3.bucket.name: The name of the bucket.
+- s3.bucket.ownerIdentity.principalId: The owner of the bucket.
+- s3.bucket.arn: The ARN of the bucket.
+- s3.bucket.id: The ID of the bucket. (This is an extension to the S3
+ notification API.)
+- s3.object.key: The object key.
+- s3.object.size: The object size.
+- s3.object.eTag: The object etag.
+- s3.object.versionId: The object version, if the bucket is versioned. When a
+ copy is made, it includes the version of the target object. When a delete
+ marker is created, it includes the version of the delete marker.
+- s3.object.sequencer: The monotonically-increasing identifier of the "change
+ per object" (hexadecimal format).
+- s3.object.metadata: Any metadata set on the object that was sent with an
+  ``x-amz-meta-`` prefix. (This is an extension to the S3 notification API.)
+- s3.object.tags: Any tags set on the object. (This is an extension to the S3
+ notification API.)
+- s3.eventId: The unique ID of the event, which could be used for acking. (This
+ is an extension to the S3 notification API.)
+- s3.opaqueData: The "opaque data" set in the topic configuration, which is
+  added to all notifications triggered by the topic. (This is an extension to
+  the S3 notification API.)
+
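+As an illustration, a small HTTP push endpoint can be stood up to inspect these
+records. The following is a minimal sketch using only the Python standard
+library; the port is arbitrary and the handler performs no validation:
+
+.. code-block:: python
+
+    import json
+    from http.server import BaseHTTPRequestHandler, HTTPServer
+
+    class NotificationHandler(BaseHTTPRequestHandler):
+        def do_POST(self):
+            # Read the JSON body sent by RGW and print each record.
+            length = int(self.headers.get('Content-Length', 0))
+            body = json.loads(self.rfile.read(length))
+            for record in body.get('Records', []):
+                print(record['eventName'], record['s3']['object']['key'])
+            # Return 200 to acknowledge the notification.
+            self.send_response(200)
+            self.end_headers()
+
+    HTTPServer(('', 8080), NotificationHandler).serve_forever()
+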
+.. _S3 Notification Compatibility: ../s3-notification-compatibility
+.. _AWS Create Topic: https://docs.aws.amazon.com/sns/latest/api/API_CreateTopic.html
+.. _Bucket Operations: ../s3/bucketops
+.. _S3 CloudEvents Spec: https://github.com/cloudevents/spec/blob/main/cloudevents/adapters/aws-s3.md
diff --git a/doc/radosgw/oidc.rst b/doc/radosgw/oidc.rst
new file mode 100644
index 000000000..46593f1d8
--- /dev/null
+++ b/doc/radosgw/oidc.rst
@@ -0,0 +1,97 @@
+===============================
+ OpenID Connect Provider in RGW
+===============================
+
+An entity describing the OpenID Connect Provider needs to be created in RGW, in order to establish trust between the two.
+
+REST APIs for Manipulating an OpenID Connect Provider
+=====================================================
+
+The following REST APIs can be used for creating and managing an OpenID Connect Provider entity in RGW.
+
+In order to invoke the REST admin APIs, a user with admin caps needs to be created.
+
+.. prompt:: bash #
+
+ radosgw-admin --uid TESTER --display-name "TestUser" --access_key TESTER --secret test123 user create
+ radosgw-admin caps add --uid="TESTER" --caps="oidc-provider=*"
+
+
+CreateOpenIDConnectProvider
+---------------------------------
+
+Create an OpenID Connect Provider entity in RGW
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``ClientIDList.member.N``
+
+:Description: List of client IDs that need access to S3 resources.
+:Type: Array of Strings
+
+``ThumbprintList.member.N``
+
+:Description: List of the OpenID Connect IDP's server certificate thumbprints. A maximum of 5 thumbprints is allowed.
+:Type: Array of Strings
+
+``Url``
+
+:Description: URL of the IDP.
+:Type: String
+
+
+Example::
+
+    POST "<hostname>?Action=CreateOpenIDConnectProvider
+    &ThumbprintList.list.1=F7D7B3515DD0D319DD219A43A9EA727AD6065287
+    &ClientIDList.list.1=app-profile-jsp
+    &Url=http://localhost:8080/auth/realms/quickstart
+
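+Equivalently, the request can be made with an SDK. A minimal sketch using
+Python and ``boto3`` (the endpoint URL and region are placeholders; the
+credentials, thumbprint, and client ID are taken from the examples above):
+
+.. code-block:: python
+
+    import boto3
+
+    # IAM-style client pointed at the RGW endpoint (placeholder values).
+    iam = boto3.client(
+        'iam',
+        endpoint_url='http://<rgw-host>:8000',
+        aws_access_key_id='TESTER',
+        aws_secret_access_key='test123',
+        region_name='us-east-1')
+
+    response = iam.create_open_id_connect_provider(
+        Url='http://localhost:8080/auth/realms/quickstart',
+        ClientIDList=['app-profile-jsp'],
+        ThumbprintList=['F7D7B3515DD0D319DD219A43A9EA727AD6065287'])
+
+    print(response['OpenIDConnectProviderArn'])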
+
+DeleteOpenIDConnectProvider
+---------------------------
+
+Deletes an OpenID Connect Provider entity in RGW
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``OpenIDConnectProviderArn``
+
+:Description: ARN of the IDP which is returned by the Create API.
+:Type: String
+
+Example::
+
+    POST "<hostname>?Action=DeleteOpenIDConnectProvider
+    &OpenIDConnectProviderArn=arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart
+
+
+GetOpenIDConnectProvider
+---------------------------
+
+Gets information about an IDP.
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``OpenIDConnectProviderArn``
+
+:Description: ARN of the IDP which is returned by the Create API.
+:Type: String
+
+Example::
+
+    POST "<hostname>?Action=GetOpenIDConnectProvider
+    &OpenIDConnectProviderArn=arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart
+
+ListOpenIDConnectProviders
+--------------------------
+
+Lists information about all IDPs.
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+None
+
+Example::
+
+    POST "<hostname>?Action=ListOpenIDConnectProviders
diff --git a/doc/radosgw/opa.rst b/doc/radosgw/opa.rst
new file mode 100644
index 000000000..f1b76b5ef
--- /dev/null
+++ b/doc/radosgw/opa.rst
@@ -0,0 +1,72 @@
+==============================
+Open Policy Agent Integration
+==============================
+
+Open Policy Agent (OPA) is a lightweight general-purpose policy engine
+that can be co-located with a service. OPA can be integrated as a
+sidecar, host-level daemon, or library.
+
+Services can offload policy decisions to OPA by executing queries. Hence,
+policy enforcement can be decoupled from policy decisions.
+
+Configure OPA
+=============
+
+To configure OPA, load custom policies into OPA that control the resources users
+are allowed to access. Relevant data or context can also be loaded into OPA to make decisions.
+
+Policies and data can be loaded into OPA in the following ways:
+
+* OPA's RESTful APIs
+* OPA's *bundle* feature, which downloads policies and data from remote HTTP servers
+* The filesystem
+
+Configure the Ceph Object Gateway
+=================================
+
+The following configuration options are available for OPA integration::
+
+ rgw use opa authz = {use opa server to authorize client requests}
+ rgw opa url = {opa server url:opa server port}
+ rgw opa token = {opa bearer token}
+ rgw opa verify ssl = {verify opa server ssl certificate}
+
+How the RGW-OPA integration works
+=====================================
+
+After a user is authenticated, OPA can be used to check if the user is authorized
+to perform the given action on the resource. OPA responds with an allow or deny
+decision which is sent back to the RGW which enforces the decision.
+
+Example request::
+
+ POST /v1/data/ceph/authz HTTP/1.1
+ Host: opa.example.com:8181
+ Content-Type: application/json
+
+ {
+ "input": {
+ "method": "GET",
+ "subuser": "subuser",
+ "user_info": {
+ "user_id": "john",
+ "display_name": "John"
+ },
+ "bucket_info": {
+ "bucket": {
+ "name": "Testbucket",
+ "bucket_id": "testbucket"
+ },
+ "owner": "john"
+ }
+ }
+ }
+
+Response::
+
+ {"result": true}
+
+The above is a sample request sent to OPA, containing information about the
+user, the resource, and the action to be performed on the resource. Based on
+the policies and data loaded into OPA, OPA decides whether the request should
+be allowed or denied.
+In the sample request, RGW makes a POST request to the endpoint */v1/data/ceph/authz*,
+where *ceph* is the package name and *authz* is the rule name.
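+
+For reference, the same query can be reproduced outside of RGW, which is useful
+when developing policies. A minimal sketch using the Python standard library
+against the example OPA server above (the address and input values are
+placeholders):
+
+.. code-block:: python
+
+    import json
+    from urllib.request import Request, urlopen
+
+    # Query the ceph/authz rule on the OPA server (placeholder address).
+    payload = {
+        'input': {
+            'method': 'GET',
+            'user_info': {'user_id': 'john', 'display_name': 'John'},
+            'bucket_info': {
+                'bucket': {'name': 'Testbucket', 'bucket_id': 'testbucket'},
+                'owner': 'john',
+            },
+        }
+    }
+    req = Request(
+        'http://opa.example.com:8181/v1/data/ceph/authz',
+        data=json.dumps(payload).encode(),
+        headers={'Content-Type': 'application/json'})
+    with urlopen(req) as resp:
+        print(json.load(resp))  # e.g. {"result": true}
+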
diff --git a/doc/radosgw/orphans.rst b/doc/radosgw/orphans.rst
new file mode 100644
index 000000000..bf6b10edf
--- /dev/null
+++ b/doc/radosgw/orphans.rst
@@ -0,0 +1,117 @@
+==================================
+Orphan List and Associated Tooling
+==================================
+
+.. versionadded:: Luminous
+
+.. contents::
+
+Orphans are RADOS objects that are left behind after their associated
+RGW objects are removed. Normally these RADOS objects are removed
+automatically, either immediately or through a process known as
+"garbage collection". Over the history of RGW, however, there may have
+been bugs that prevented these RADOS objects from being deleted, and
+these RADOS objects may be consuming space on the Ceph cluster without
+being of any use. From the perspective of RGW, we call such RADOS
+objects "orphans".
+
+Orphans Find -- DEPRECATED
+--------------------------
+
+The `radosgw-admin` tool has three subcommands to help manage
+orphans; however, these subcommands are (or will soon be)
+deprecated. These subcommands are:
+
+.. prompt:: bash #
+
+ radosgw-admin orphans find ...
+ radosgw-admin orphans finish ...
+ radosgw-admin orphans list-jobs ...
+
+There are two key problems with these subcommands, however. First,
+these subcommands have not been actively maintained and therefore have
+not tracked RGW as it has evolved in terms of features and updates. As
+a result the confidence that these subcommands can accurately identify
+true orphans is presently low.
+
+Second, these subcommands store intermediate results on the cluster
+itself. This can be problematic when cluster administrators are
+confronting insufficient storage space and want to remove orphans as a
+means of addressing the issue. The intermediate results could strain
+the existing cluster storage capacity even further.
+
+For these reasons "orphans find" has been deprecated.
+
+Orphan List
+-----------
+
+Because "orphans find" has been deprecated, RGW now includes an
+additional tool -- 'rgw-orphan-list'. When run it will list the
+available pools and prompt the user to enter the name of the data
+pool. At that point the tool will, perhaps after an extended period of
+time, produce a local file containing the RADOS objects from the
+designated pool that appear to be orphans. The administrator is free
+to examine this file and then decide on a course of action, perhaps
+removing those RADOS objects from the designated pool.
+
+All intermediate results are stored on the local file system rather
+than the Ceph cluster. So running the 'rgw-orphan-list' tool should
+have no appreciable impact on the amount of cluster storage consumed.
+
+WARNING: Experimental Status
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The 'rgw-orphan-list' tool is new and therefore currently considered
+experimental. The list of orphans produced should be "sanity checked"
+before being used for a large delete operation.
+
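+For example, individual entries can be spot-checked with the ``rados`` tool
+before anything is deleted (the pool and object names here are illustrative):
+
+.. prompt:: bash #
+
+   rados -p default.rgw.buckets.data stat <orphan-object-name>
+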
+WARNING: Specifying a Data Pool
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a pool other than an RGW data pool is specified, the results of the
+tool will be erroneous. All RADOS objects found on such a pool will
+falsely be designated as orphans.
+
+WARNING: Unindexed Buckets
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+RGW allows for unindexed buckets, that is, buckets that do not maintain
+an index of their contents. This is not a typical configuration, but
+it is supported. Because the 'rgw-orphan-list' tool uses the bucket
+indices to determine what RADOS objects should exist, objects in the
+unindexed buckets will falsely be listed as orphans.
+
+
+RADOS List
+----------
+
+One of the sub-steps in computing a list of orphans is to map each RGW
+object into its corresponding set of RADOS objects. This is done using
+a subcommand of 'radosgw-admin'.
+
+.. prompt:: bash #
+
+ radosgw-admin bucket radoslist [--bucket={bucket-name}]
+
+The subcommand will produce a list of RADOS objects that support all
+of the RGW objects. If a bucket is specified, then the subcommand will
+only produce a list of RADOS objects that correspond to the RGW
+objects in the specified bucket.
+
+Note: Shared Bucket Markers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some administrators will be aware of the coding schemes used to name
+the RADOS objects that correspond to RGW objects, which include a
+"marker" unique to a given bucket.
+
+RADOS objects that correspond with the contents of one RGW bucket,
+however, may contain a marker that specifies a different bucket. This
+behavior is a consequence of the "shallow copy" optimization used by
+RGW. When larger objects are copied from bucket to bucket, only the
+"head" objects are actually copied, and the tail objects are
+shared. Those shared objects will contain the marker of the original
+bucket.
+
+.. _Data Layout in RADOS : ../layout
+.. _Pool Placement and Storage Classes : ../placement
diff --git a/doc/radosgw/placement.rst b/doc/radosgw/placement.rst
new file mode 100644
index 000000000..28c71783d
--- /dev/null
+++ b/doc/radosgw/placement.rst
@@ -0,0 +1,263 @@
+==================================
+Pool Placement and Storage Classes
+==================================
+
+.. contents::
+
+Placement Targets
+=================
+
+.. versionadded:: Jewel
+
+Placement targets control which `Pools`_ are associated with a particular
+bucket. A bucket's placement target is selected on creation, and cannot be
+modified. The ``radosgw-admin bucket stats`` command will display its
+``placement_rule``.
+
+The zonegroup configuration contains a list of placement targets with an
+initial target named ``default-placement``. The zone configuration then maps
+each zonegroup placement target name onto its local storage. This zone
+placement information includes the ``index_pool`` name for the bucket index,
+the ``data_extra_pool`` name for metadata about incomplete multipart uploads,
+and a ``data_pool`` name for each storage class.
+
+.. _storage_classes:
+
+Storage Classes
+===============
+
+.. versionadded:: Nautilus
+
+Storage classes are used to customize the placement of object data. S3 Bucket
+Lifecycle rules can automate the transition of objects between storage classes.
+
+Storage classes are defined in terms of placement targets. Each zonegroup
+placement target lists its available storage classes with an initial class
+named ``STANDARD``. The zone configuration is responsible for providing a
+``data_pool`` pool name for each of the zonegroup's storage classes.
+
+Zonegroup/Zone Configuration
+============================
+
+Placement configuration is performed with ``radosgw-admin`` commands on
+the zonegroups and zones.
+
+The zonegroup placement configuration can be queried with:
+
+::
+
+ $ radosgw-admin zonegroup get
+ {
+ "id": "ab01123f-e0df-4f29-9d71-b44888d67cd5",
+ "name": "default",
+ "api_name": "default",
+ ...
+ "placement_targets": [
+ {
+ "name": "default-placement",
+ "tags": [],
+ "storage_classes": [
+ "STANDARD"
+ ]
+ }
+ ],
+ "default_placement": "default-placement",
+ ...
+ }
+
+The zone placement configuration can be queried with:
+
+::
+
+ $ radosgw-admin zone get
+ {
+ "id": "557cdcee-3aae-4e9e-85c7-2f86f5eddb1f",
+ "name": "default",
+ "domain_root": "default.rgw.meta:root",
+ ...
+ "placement_pools": [
+ {
+ "key": "default-placement",
+ "val": {
+ "index_pool": "default.rgw.buckets.index",
+ "storage_classes": {
+ "STANDARD": {
+ "data_pool": "default.rgw.buckets.data"
+ }
+ },
+ "data_extra_pool": "default.rgw.buckets.non-ec",
+ "index_type": 0,
+ "inline_data": true
+ }
+ }
+ ],
+ ...
+ }
+
+.. note:: If you have not done any previous `Multisite Configuration`_,
+ a ``default`` zone and zonegroup are created for you, and changes
+ to the zone/zonegroup will not take effect until the Ceph Object
+ Gateways are restarted. If you have created a realm for multisite,
+ the zone/zonegroup changes will take effect once the changes are
+ committed with ``radosgw-admin period update --commit``.
+
+Adding a Placement Target
+-------------------------
+
+To create a new placement target named ``temporary``, start by adding it to
+the zonegroup:
+
+::
+
+ $ radosgw-admin zonegroup placement add \
+ --rgw-zonegroup default \
+ --placement-id temporary
+
+Then provide the zone placement info for that target:
+
+::
+
+ $ radosgw-admin zone placement add \
+ --rgw-zone default \
+ --placement-id temporary \
+ --data-pool default.rgw.temporary.data \
+ --index-pool default.rgw.temporary.index \
+ --data-extra-pool default.rgw.temporary.non-ec
+
+.. note:: With default placement target settings, RGW stores an object's first data chunk in the RADOS "head" object along
+ with xattr metadata. The `--placement-inline-data=false` flag may be passed with the `zone placement add` or
+ `zone placement modify` commands to change this behavior for new objects stored on the target.
+ When data is stored inline (default), it may provide an advantage for read/write workloads since the first chunk of
+ an object's data can be retrieved/stored in a single librados call along with object metadata. On the other hand, a
+ target that does not store data inline can provide a performance benefit for RGW client delete requests when
+ the BlueStore DB is located on faster storage than bucket data since it eliminates the need to access
+ slower devices synchronously while processing the client request. In that case, data associated with the deleted
+ objects is removed asynchronously in the background by garbage collection.
+
+.. _adding_a_storage_class:
+
+Adding a Storage Class
+----------------------
+
+To add a new storage class named ``GLACIER`` to the ``default-placement`` target,
+start by adding it to the zonegroup:
+
+::
+
+ $ radosgw-admin zonegroup placement add \
+ --rgw-zonegroup default \
+ --placement-id default-placement \
+ --storage-class GLACIER
+
+Then provide the zone placement info for that storage class:
+
+::
+
+ $ radosgw-admin zone placement add \
+ --rgw-zone default \
+ --placement-id default-placement \
+ --storage-class GLACIER \
+ --data-pool default.rgw.glacier.data \
+ --compression lz4
+
+Customizing Placement
+=====================
+
+Default Placement
+-----------------
+
+By default, new buckets will use the zonegroup's ``default_placement`` target.
+This zonegroup setting can be changed with:
+
+::
+
+ $ radosgw-admin zonegroup placement default \
+ --rgw-zonegroup default \
+ --placement-id new-placement
+
+User Placement
+--------------
+
+A Ceph Object Gateway user can override the zonegroup's default placement
+target by setting a non-empty ``default_placement`` field in the user info.
+Similarly, the ``default_storage_class`` can override the ``STANDARD``
+storage class applied to objects by default.
+
+::
+
+ $ radosgw-admin user info --uid testid
+ {
+ ...
+ "default_placement": "",
+ "default_storage_class": "",
+ "placement_tags": [],
+ ...
+ }
+
+If a zonegroup's placement target contains any ``tags``, users will be unable
+to create buckets with that placement target unless their user info contains
+at least one matching tag in its ``placement_tags`` field. This can be useful
+to restrict access to certain types of storage.
+
+The ``radosgw-admin`` command can modify these fields directly with:
+
+::
+
+ $ radosgw-admin user modify \
+ --uid <user-id> \
+ --placement-id <default-placement-id> \
+ --storage-class <default-storage-class> \
+ --tags <tag1,tag2>
+
+.. _s3_bucket_placement:
+
+S3 Bucket Placement
+-------------------
+
+When creating a bucket with the S3 protocol, a placement target can be
+provided as part of the LocationConstraint to override the default placement
+targets from the user and zonegroup.
+
+Normally, the LocationConstraint must match the zonegroup's ``api_name``:
+
+::
+
+ <LocationConstraint>default</LocationConstraint>
+
+A custom placement target can be added to the ``api_name`` following a colon:
+
+::
+
+ <LocationConstraint>default:new-placement</LocationConstraint>
+
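+For example, with ``boto3`` the placement target is passed through
+``CreateBucketConfiguration``. A minimal sketch (the endpoint URL and
+credentials are placeholders):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client(
+        's3',
+        endpoint_url='http://<rgw-host>:8000',
+        aws_access_key_id='<access-key>',
+        aws_secret_access_key='<secret-key>')
+
+    # The LocationConstraint carries "<api_name>:<placement-target>".
+    s3.create_bucket(
+        Bucket='mybucket',
+        CreateBucketConfiguration={
+            'LocationConstraint': 'default:new-placement'})
+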
+Swift Bucket Placement
+----------------------
+
+When creating a bucket with the Swift protocol, a placement target can be
+provided in the HTTP header ``X-Storage-Policy``:
+
+::
+
+ X-Storage-Policy: new-placement
+
+Using Storage Classes
+=====================
+
+All placement targets have a ``STANDARD`` storage class which is applied to
+new objects by default. The user can override this default with its
+``default_storage_class``.
+
+To create an object in a non-default storage class, provide that storage class
+name in an HTTP header with the request. The S3 protocol uses the
+``X-Amz-Storage-Class`` header, while the Swift protocol uses the
+``X-Object-Storage-Class`` header.
+
+When using AWS S3 SDKs such as ``boto3``, it is important that non-default
+storage class names match those provided by AWS S3, or else the SDK
+will drop the request and raise an exception.
+
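+For example, an object can be written directly into the ``GLACIER`` storage
+class defined earlier. A sketch with ``boto3`` (the endpoint URL, credentials,
+bucket, and key names are placeholders):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client(
+        's3',
+        endpoint_url='http://<rgw-host>:8000',
+        aws_access_key_id='<access-key>',
+        aws_secret_access_key='<secret-key>')
+
+    # GLACIER is also a valid AWS storage class name, so boto3 accepts it.
+    s3.put_object(
+        Bucket='mybucket',
+        Key='archive/object1',
+        Body=b'example payload',
+        StorageClass='GLACIER')
+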
+S3 Object Lifecycle Management can then be used to move object data between
+storage classes using ``Transition`` actions.
+
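+A sketch of such a lifecycle rule with ``boto3``, reusing the ``s3`` client
+from the sketch above and transitioning objects under a prefix to ``GLACIER``
+after 30 days (the rule ID, prefix, and day count are illustrative):
+
+.. code-block:: python
+
+    s3.put_bucket_lifecycle_configuration(
+        Bucket='mybucket',
+        LifecycleConfiguration={
+            'Rules': [{
+                'ID': 'archive-after-30-days',
+                'Filter': {'Prefix': 'archive/'},
+                'Status': 'Enabled',
+                'Transitions': [{
+                    'Days': 30,
+                    'StorageClass': 'GLACIER'}]}]})
+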
+.. _`Pools`: ../pools
+.. _`Multisite Configuration`: ../multisite
diff --git a/doc/radosgw/pools.rst b/doc/radosgw/pools.rst
new file mode 100644
index 000000000..bb1246c1f
--- /dev/null
+++ b/doc/radosgw/pools.rst
@@ -0,0 +1,57 @@
+=====
+Pools
+=====
+
+The Ceph Object Gateway uses several pools for its various storage needs,
+which are listed in the Zone object (see ``radosgw-admin zone get``). A
+single zone named ``default`` is created automatically with pool names
+starting with ``default.rgw.``, but a `Multisite Configuration`_ will have
+multiple zones.
+
+Tuning
+======
+
+When ``radosgw`` first tries to operate on a zone pool that does not
+exist, it will create that pool with the default values from
+``osd pool default pg num`` and ``osd pool default pgp num``. These defaults
+are sufficient for some pools, but others (especially those listed in
+``placement_pools`` for the bucket index and data) will require additional
+tuning. We recommend using the `Ceph Placement Group’s per Pool
+Calculator <https://old.ceph.com/pgcalc/>`__ to calculate a suitable number of
+placement groups for these pools. See
+`Pools <http://docs.ceph.com/en/latest/rados/operations/pools/#pools>`__
+for details on pool creation.
+
+.. _radosgw-pool-namespaces:
+
+Pool Namespaces
+===============
+
+.. versionadded:: Luminous
+
+Pool names particular to a zone follow the naming convention
+``{zone-name}.pool-name``. For example, a zone named ``us-east`` will
+have the following pools:
+
+- ``.rgw.root``
+
+- ``us-east.rgw.control``
+
+- ``us-east.rgw.meta``
+
+- ``us-east.rgw.log``
+
+- ``us-east.rgw.buckets.index``
+
+- ``us-east.rgw.buckets.data``
+
+The zone definitions list several more pools than that, but many of those
+are consolidated through the use of rados namespaces. For example, all of
+the following pool entries use namespaces of the ``us-east.rgw.meta`` pool::
+
+ "user_keys_pool": "us-east.rgw.meta:users.keys",
+ "user_email_pool": "us-east.rgw.meta:users.email",
+ "user_swift_pool": "us-east.rgw.meta:users.swift",
+ "user_uid_pool": "us-east.rgw.meta:users.uid",
+
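+The objects within one of these namespaces can be listed with the ``rados``
+tool, for example (using the zone name above):
+
+.. prompt:: bash #
+
+   rados -p us-east.rgw.meta -N users.uid ls
+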
+.. _`Multisite Configuration`: ../multisite
diff --git a/doc/radosgw/qat-accel.rst b/doc/radosgw/qat-accel.rst
new file mode 100644
index 000000000..b275e8a19
--- /dev/null
+++ b/doc/radosgw/qat-accel.rst
@@ -0,0 +1,155 @@
+===============================================
+QAT Acceleration for Encryption and Compression
+===============================================
+
+Intel QAT (QuickAssist Technology) can provide extended accelerated encryption
+and compression services by offloading the actual encryption and compression
+request(s) to the hardware QuickAssist accelerators, which are more efficient
+in terms of cost and power than general purpose CPUs for those specific
+compute-intensive workloads.
+
+See `QAT Support for Compression`_ and `QAT based Encryption for RGW`_.
+
+
+QAT in the Software Stack
+=========================
+
+Application developers can access QuickAssist features through the QAT API.
+The QAT API is the top-level API for QuickAssist technology, and enables easy
+interfacing between the customer application and the QuickAssist acceleration
+driver.
+
+The QAT API accesses the QuickAssist driver, which in turn drives the
+QuickAssist Accelerator hardware. The QuickAssist driver is responsible for
+exposing the acceleration services to the application software.
+
+A user can write directly to the QAT API, or the use of QAT can be done via
+frameworks that have been enabled by others including Intel (for example, zlib*,
+OpenSSL* libcrypto*, and the Linux* Kernel Crypto Framework).
+
+QAT Environment Setup
+=====================
+1. QuickAssist Accelerator hardware is necessary to make use of accelerated
+   encryption and compression services, and the QAT driver must be loaded in
+   kernel space to drive the hardware.
+
+The driver package can be downloaded from `Intel Quickassist Technology`_.
+
+2. The implementation of QAT based encryption is based directly on the QAT API,
+   which is included in the driver package. QAT support for compression,
+   however, depends on the QATzip project, a user space library built on top of
+   the QAT API. At the time of writing, QATzip accelerates only gzip
+   compression and decompression.
+
+See `QATzip`_.
+
+Implementation
+==============
+1. QAT based Encryption for RGW
+
+`OpenSSL support for RGW encryption`_ has been merged into Ceph, and Intel also
+provides a `QAT Engine`_ for OpenSSL. In theory, then, QAT based encryption in
+Ceph could be supported directly through OpenSSL plus the QAT Engine.
+
+But the QAT Engine for OpenSSL currently supports chained operations only, so
+Ceph cannot utilize the QAT hardware features for crypto operations through the
+OpenSSL crypto plugin. As a result, a QAT plugin based on the native QAT API
+has been added to the crypto framework.
+
+2. QAT Support for Compression
+
+As mentioned above, QAT support for compression is based on the QATzip library
+in user space, which is designed to take full advantage of the performance
+provided by QuickAssist Technology. Unlike QAT based encryption, QAT based
+compression is supported through a tool class for QAT acceleration rather than
+a compressor plugin. The common tool class can transparently accelerate the
+existing compression types, but at the time of writing only the zlib compressor
+is supported. Users can therefore use it to speed up the zlib compressor as
+long as QAT hardware is available and QAT is capable of handling the workload.
+
+Configuration
+=============
+#. Prerequisites
+
+   Make sure that QAT driver version v1.7.L.4.14.0 or higher has been installed.
+   Remember to set the environment variable "ICP_ROOT" to the root directory of
+   your QAT driver package.
+
+   To enable QAT based encryption and compression, the QAT configuration files
+   must be modified. For example, for the Intel QuickAssist Adapter 8970
+   product, revise c6xx_dev0/1/2.conf in the ``/etc/`` directory and keep them
+   identical, e.g.:
+
+ .. code-block:: ini
+
+ #...
+ # User Process Instance Section
+ ##############################################
+ [CEPH]
+ NumberCyInstances = 1
+ NumberDcInstances = 1
+ NumProcesses = 8
+ LimitDevAccess = 1
+ # Crypto - User instance #0
+ Cy0Name = "SSL0"
+ Cy0IsPolled = 1
+ # List of core affinities
+ Cy0CoreAffinity = 0
+
+ # Data Compression - User instance #0
+ Dc0Name = "Dc0"
+ Dc0IsPolled = 1
+ # List of core affinities
+ Dc0CoreAffinity = 0
+
+#. QAT based Encryption for RGW
+
+ The CMake option ``WITH_QAT=ON`` must be configured. If you build Ceph from
+ source code (see: :ref:`build-ceph`), navigate to your cloned Ceph repository
+ and execute the following:
+
+ .. prompt:: bash $
+
+ cd ceph
+ ./do_cmake.sh -DWITH_QAT=ON
+ cd build
+      ninja
+
+ .. note::
+ The section name of the QAT configuration files must be ``CEPH`` since
+ the section name is set as "CEPH" in Ceph crypto source code.
+
+ Then, edit the Ceph configuration file to make use of QAT based crypto plugin::
+
+ plugin crypto accelerator = crypto_qat
+
+#. QAT Support for Compression
+
+ Before starting, make sure both QAT driver and `QATzip`_ have been installed. Besides
+ "ICP_ROOT", remember to set the environment variable "QZ_ROOT" for the root directory
+ of your QATzip source tree.
+
+ The following CMake options have to be configured to trigger QAT based compression
+ when building Ceph:
+
+ .. prompt:: bash $
+
+ ./do_cmake.sh -DWITH_QAT=ON -DWITH_QATZIP=ON
+
+ Then, set an environment variable to clarify the section name of User Process Instance
+ Section in QAT configuration files, e.g.:
+
+ .. prompt:: bash $
+
+ export QAT_SECTION_NAME=CEPH
+
+ Next, edit the Ceph configuration file to enable QAT support for compression::
+
+ qat compressor enabled=true
+
+
+.. _QAT Support for Compression: https://github.com/ceph/ceph/pull/19714
+.. _QAT based Encryption for RGW: https://github.com/ceph/ceph/pull/19386
+.. _Intel Quickassist Technology: https://01.org/intel-quickassist-technology
+.. _QATzip: https://github.com/intel/QATzip
+.. _OpenSSL support for RGW encryption: https://github.com/ceph/ceph/pull/15168
+.. _QAT Engine: https://github.com/intel/QAT_Engine
diff --git a/doc/radosgw/rgw-cache.rst b/doc/radosgw/rgw-cache.rst
new file mode 100644
index 000000000..116db8ed4
--- /dev/null
+++ b/doc/radosgw/rgw-cache.rst
@@ -0,0 +1,155 @@
+==========================
+RGW Data caching and CDN
+==========================
+
+.. versionadded:: Octopus
+
+.. contents::
+
+This feature adds to RGW the ability to securely cache objects and offload the workload from the cluster, using Nginx.
+After an object is accessed for the first time, it is stored in the Nginx cache directory.
+When data is already cached, it need not be fetched from RGW. A permission check is still made against RGW to ensure that the requesting user has access.
+This feature is based on several Nginx modules: ngx_http_auth_request_module, https://github.com/kaltura/nginx-aws-auth-module, and Openresty for its Lua capabilities.
+
+Currently, this feature caches only AWSv4 requests (S3 requests only): the output of the first GET request is cached
+and served on subsequent GET requests, while PUT, POST, HEAD, DELETE, and COPY requests are passed through transparently.
+
+
+The feature introduces 2 new APIs: Auth and Cache.
+
+.. note:: The `D3N RGW Data Cache`_ is an alternative data caching mechanism
+   implemented natively in the Rados Gateway.
+
+New APIs
+-------------------------
+
+There are 2 new APIs for this feature:
+
+Auth API - The cache uses this to validate that a user can access the cached data.
+
+Cache API - Adds the ability to securely override the Range header, so that Nginx can use its own smart cache on top of S3:
+https://www.nginx.com/blog/smart-efficient-byte-range-caching-nginx/
+Using this API makes it possible to read ahead in objects when a client requests a specific range of an object.
+On subsequent accesses to the cached object, Nginx will satisfy requests for already-cached ranges from the cache. Uncached ranges will be read from RGW (and cached).
+
+Auth API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This API validates a specific authenticated access being made to the cache, using RGW's knowledge of the client credentials and the stored access policy.
+It returns success if the encapsulated request would be granted.
+
+Cache API
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This API is meant to allow changing signed Range headers using a privileged
+user, the cache user.
+
+Create the cache user:
+
+::
+
+   $ radosgw-admin user create --uid=<uid for cache user> --display-name="cache user" --caps="amz-cache=read"
+
+This user can send the Cache API header ``X-Amz-Cache`` to RGW. This header
+contains the headers from the original request (before the Range header was
+changed), which means that ``X-Amz-Cache`` is built from several headers.
+Within ``X-Amz-Cache``, headers are separated by a character with ASCII code
+177, and each header name is separated from its value by a character with
+ASCII code 178. RGW checks that the requesting user is an authorized cache
+user; if so, it uses the headers carried in ``X-Amz-Cache`` to revalidate that
+the original user has permission. During this flow, RGW overrides the Range
+header.
+
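+As an illustration of this encoding, the header value could be assembled as
+follows (a sketch; the header names and values are placeholders):
+
+.. code-block:: python
+
+    # Join the original request headers into one X-Amz-Cache value:
+    # headers are separated by chr(177); a header name is separated
+    # from its value by chr(178).
+    original_headers = {
+        'Host': 'rgw.example.com',
+        'Range': 'bytes=0-4095',
+    }
+    HDR_SEP = chr(177)   # separates one header from the next
+    KV_SEP = chr(178)    # separates a header name from its value
+
+    x_amz_cache = HDR_SEP.join(
+        name + KV_SEP + value
+        for name, value in original_headers.items())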
+
+Using Nginx with RGW
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Download the source of Openresty:
+
+::
+
+   $ wget https://openresty.org/download/openresty-1.15.8.3.tar.gz
+
+git clone the AWS auth Nginx module:
+
+::
+
+   $ git clone https://github.com/kaltura/nginx-aws-auth-module
+
+Untar the openresty package:
+
+::
+
+   $ tar xvzf openresty-1.15.8.3.tar.gz
+   $ cd openresty-1.15.8.3
+
+Compile openresty. Make sure that you have the pcre and openssl libraries:
+
+::
+
+   $ sudo yum install pcre-devel openssl-devel gcc curl zlib-devel nginx
+   $ ./configure --add-module=<the nginx-aws-auth-module dir> --with-http_auth_request_module --with-http_slice_module --conf-path=/etc/nginx/nginx.conf
+   $ gmake -j $(nproc)
+   $ sudo gmake install
+   $ sudo ln -sf /usr/local/openresty/bin/openresty /usr/bin/nginx
+
+Put in-place your Nginx configuration files and edit them according to your environment:
+
+All Nginx conf files are under: https://github.com/ceph/ceph/tree/main/examples/rgw/rgw-cache
+
+`nginx.conf` should go to `/etc/nginx/nginx.conf`
+
+`nginx-lua-file.lua` should go to `/etc/nginx/nginx-lua-file.lua`
+
+`nginx-default.conf` should go to `/etc/nginx/conf.d/nginx-default.conf`
+
+The parameters that are most likely to require adjustment according to the environment are located in the file `nginx-default.conf`
+
+Modify the example values of *proxy_cache_path* and *max_size* at:
+
+::
+
+ proxy_cache_path /data/cache levels=2:2:2 keys_zone=mycache:999m max_size=20G inactive=1d use_temp_path=off;
+
+
+And modify the example *server* values to point to the RGWs URIs:
+
+::
+
+ server rgw1:8000 max_fails=2 fail_timeout=5s;
+ server rgw2:8000 max_fails=2 fail_timeout=5s;
+ server rgw3:8000 max_fails=2 fail_timeout=5s;
+
+It is important to substitute the *access key* and *secret key* located in
+`nginx.conf` with those belonging to the user with the `amz-cache` caps.
+For example, create the `cache` user as follows:
+
+::
+
+ radosgw-admin user create --uid=cacheuser --display-name="cache user" --caps="amz-cache=read" --access-key <access> --secret <secret>
+
+It is possible to use Nginx slicing, which is a better method for streaming purposes.
+
+To use slicing, use `nginx-slicing.conf` instead of `nginx-default.conf`.
+
+Further information about Nginx slicing:
+
+https://docs.nginx.com/nginx/admin-guide/content-cache/content-caching/#byte-range-caching
+
+
+If you do not want to use prefetch caching, you can replace `nginx-default.conf`
+with `nginx-noprefetch.conf`. Using `noprefetch` means that if a client sends a
+range request for 0-4095 and then for 0-4096, Nginx caches those requests
+separately, so it will need to fetch the data twice.
+
+
+Run Nginx(openresty):
+
+::
+
+   $ sudo systemctl restart nginx
+
+Appendix
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+**A note about performance:** In certain instances, such as development
+environments, disabling authentication by commenting out the following line in
+`nginx-default.conf`:
+
+::
+
+   #auth_request /authentication;
+
+may (depending on the hardware) increase performance significantly, as it
+forgoes the auth API calls to radosgw.
+
+
+.. _D3N RGW Data Cache: ../d3n_datacache/
diff --git a/doc/radosgw/role.rst b/doc/radosgw/role.rst
new file mode 100644
index 000000000..e97449872
--- /dev/null
+++ b/doc/radosgw/role.rst
@@ -0,0 +1,570 @@
+======
+ Role
+======
+
+A role is similar to a user. It has permission policies attached to it that determine what it can and cannot do. A role can be assumed by any identity that needs it. When a user assumes a role, a set of dynamically created temporary credentials is returned to the user. A role can be used to delegate access to users, applications, or services that do not have permissions to access certain S3 resources.
+
+The following radosgw-admin commands can be used to create, delete, or update a role and the permissions associated with it.
+
+Create a Role
+-------------
+
+To create a role, execute the following::
+
+  radosgw-admin role create --role-name={role-name} [--path="{path to the role}"] [--assume-role-policy-doc={trust-policy-document}]
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+``path``
+
+:Description: Path to the role. The default value is a slash (/).
+:Type: String
+
+``assume-role-policy-doc``
+
+:Description: The trust relationship policy document that grants an entity permission to assume the role.
+:Type: String
+
+For example::
+
+ radosgw-admin role create --role-name=S3Access1 --path=/application_abc/component_xyz/ --assume-role-policy-doc=\{\"Version\":\"2012-10-17\",\"Statement\":\[\{\"Effect\":\"Allow\",\"Principal\":\{\"AWS\":\[\"arn:aws:iam:::user/TESTER\"\]\},\"Action\":\[\"sts:AssumeRole\"\]\}\]\}
+
+.. code-block:: javascript
+
+ {
+ "id": "ca43045c-082c-491a-8af1-2eebca13deec",
+ "name": "S3Access1",
+ "path": "/application_abc/component_xyz/",
+ "arn": "arn:aws:iam:::role/application_abc/component_xyz/S3Access1",
+ "create_date": "2018-10-17T10:18:29.116Z",
+ "max_session_duration": 3600,
+ "assume_role_policy_document": "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"AWS\":[\"arn:aws:iam:::user/TESTER\"]},\"Action\":[\"sts:AssumeRole\"]}]}"
+ }
+
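+Because shell-escaping the policy document is error-prone, it can be convenient
+to generate it programmatically. A minimal sketch in Python (the ARN matches
+the example above; pass the printed string as --assume-role-policy-doc):
+
+.. code-block:: python
+
+    import json
+
+    # Build the trust policy as a Python structure, then serialize it.
+    trust_policy = json.dumps({
+        'Version': '2012-10-17',
+        'Statement': [{
+            'Effect': 'Allow',
+            'Principal': {'AWS': ['arn:aws:iam:::user/TESTER']},
+            'Action': ['sts:AssumeRole'],
+        }],
+    })
+
+    print(trust_policy)
+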
+
+Delete a Role
+-------------
+
+To delete a role, execute the following::
+
+ radosgw-admin role delete --role-name={role-name}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+For example::
+
+ radosgw-admin role delete --role-name=S3Access1
+
+Note: A role can be deleted only when no permission policies are attached to it.
+
+Get a Role
+----------
+
+To get information about a role, execute the following::
+
+ radosgw-admin role get --role-name={role-name}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+For example::
+
+ radosgw-admin role get --role-name=S3Access1
+
+.. code-block:: javascript
+
+ {
+ "id": "ca43045c-082c-491a-8af1-2eebca13deec",
+ "name": "S3Access1",
+ "path": "/application_abc/component_xyz/",
+ "arn": "arn:aws:iam:::role/application_abc/component_xyz/S3Access1",
+ "create_date": "2018-10-17T10:18:29.116Z",
+ "max_session_duration": 3600,
+ "assume_role_policy_document": "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"AWS\":[\"arn:aws:iam:::user/TESTER\"]},\"Action\":[\"sts:AssumeRole\"]}]}"
+ }
+
+
+List Roles
+----------
+
+To list roles with a specified path prefix, execute the following::
+
+ radosgw-admin role list [--path-prefix={path prefix}]
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``path-prefix``
+
+:Description: Path prefix for filtering roles. If this is not specified, all roles are listed.
+:Type: String
+
+For example::
+
+ radosgw-admin role list --path-prefix="/application"
+
+.. code-block:: javascript
+
+ [
+ {
+ "id": "3e1c0ff7-8f2b-456c-8fdf-20f428ba6a7f",
+ "name": "S3Access1",
+ "path": "/application_abc/component_xyz/",
+ "arn": "arn:aws:iam:::role/application_abc/component_xyz/S3Access1",
+ "create_date": "2018-10-17T10:32:01.881Z",
+ "max_session_duration": 3600,
+ "assume_role_policy_document": "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"AWS\":[\"arn:aws:iam:::user/TESTER\"]},\"Action\":[\"sts:AssumeRole\"]}]}"
+ }
+ ]
+
+
+Update Assume Role Policy Document of a role
+--------------------------------------------
+
+To modify a role's assume role policy document, execute the following::
+
+ radosgw-admin role-trust-policy modify --role-name={role-name} --assume-role-policy-doc={trust-policy-document}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+``assume-role-policy-doc``
+
+:Description: The trust relationship policy document that grants an entity permission to assume the role.
+:Type: String
+
+For example::
+
+ radosgw-admin role-trust-policy modify --role-name=S3Access1 --assume-role-policy-doc=\{\"Version\":\"2012-10-17\",\"Statement\":\[\{\"Effect\":\"Allow\",\"Principal\":\{\"AWS\":\[\"arn:aws:iam:::user/TESTER2\"\]\},\"Action\":\[\"sts:AssumeRole\"\]\}\]\}
+
+.. code-block:: javascript
+
+ {
+ "id": "ca43045c-082c-491a-8af1-2eebca13deec",
+ "name": "S3Access1",
+ "path": "/application_abc/component_xyz/",
+ "arn": "arn:aws:iam:::role/application_abc/component_xyz/S3Access1",
+ "create_date": "2018-10-17T10:18:29.116Z",
+ "max_session_duration": 3600,
+ "assume_role_policy_document": "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"AWS\":[\"arn:aws:iam:::user/TESTER2\"]},\"Action\":[\"sts:AssumeRole\"]}]}"
+ }
+
+
+In the above example, we are modifying the Principal from TESTER to TESTER2 in the role's assume role policy document.
+
+Add/ Update a Policy attached to a Role
+---------------------------------------
+
+To add or update the inline policy attached to a role, execute the following::
+
+ radosgw-admin role-policy put --role-name={role-name} --policy-name={policy-name} --policy-doc={permission-policy-doc}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+``policy-name``
+
+:Description: Name of the policy.
+:Type: String
+
+``policy-doc``
+
+:Description: The Permission policy document.
+:Type: String
+
+For example::
+
+ radosgw-admin role-policy put --role-name=S3Access1 --policy-name=Policy1 --policy-doc=\{\"Version\":\"2012-10-17\",\"Statement\":\[\{\"Effect\":\"Allow\",\"Action\":\[\"s3:*\"\],\"Resource\":\"arn:aws:s3:::example_bucket\"\}\]\}
+
+To pass ``policy-doc`` as a file::
+
+ radosgw-admin role-policy put --role-name=S3Access1 --policy-name=Policy1 --infile policy-document.json
+
+In the above example, we attach the policy 'Policy1' to the role 'S3Access1'; the policy allows all S3 actions on 'example_bucket'.
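+
+A hypothetical ``policy-document.json`` for the example above would contain the same permission policy in unescaped form:
+
+.. code-block:: javascript
+
+ {
+   "Version": "2012-10-17",
+   "Statement": [{
+     "Effect": "Allow",
+     "Action": ["s3:*"],
+     "Resource": "arn:aws:s3:::example_bucket"
+   }]
+ }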
+
+List Permission Policy Names attached to a Role
+-----------------------------------------------
+
+To list the names of permission policies attached to a role, execute the following::
+
+ radosgw-admin role-policy list --role-name={role-name}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+For example::
+
+ radosgw-admin role-policy list --role-name=S3Access1
+
+.. code-block:: javascript
+
+ [
+ "Policy1"
+ ]
+
+
+Get Permission Policy attached to a Role
+----------------------------------------
+
+To get a specific permission policy attached to a role, execute the following::
+
+ radosgw-admin role-policy get --role-name={role-name} --policy-name={policy-name}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+``policy-name``
+
+:Description: Name of the policy.
+:Type: String
+
+For example::
+
+ radosgw-admin role-policy get --role-name=S3Access1 --policy-name=Policy1
+
+.. code-block:: javascript
+
+ {
+ "Permission policy": "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Action\":[\"s3:*\"],\"Resource\":\"arn:aws:s3:::example_bucket\"}]}"
+ }
+
+
+Delete Policy attached to a Role
+--------------------------------
+
+To delete a permission policy attached to a role, execute the following::
+
+ radosgw-admin role-policy delete --role-name={role-name} --policy-name={policy-name}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+``policy-name``
+
+:Description: Name of the policy.
+:Type: String
+
+For example::
+
+ radosgw-admin role-policy delete --role-name=S3Access1 --policy-name=Policy1
+
+
+Update a role
+-------------
+
+To update a role's max-session-duration, execute the following::
+
+ radosgw-admin role update --role-name={role-name} --max-session-duration={max-session-duration}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``role-name``
+
+:Description: Name of the role.
+:Type: String
+
+``max-session-duration``
+
+:Description: Maximum session duration (in seconds) for the role.
+:Type: String
+
+For example::
+
+ radosgw-admin role update --role-name=S3Access1 --max-session-duration=43200
+
+Note: This command currently can only be used to update max-session-duration.
+
+REST APIs for Manipulating a Role
+=================================
+
+In addition to the above radosgw-admin commands, the following REST APIs can be used for manipulating a role. For the request parameters and their explanations, refer to the sections above.
+
+In order to invoke the REST admin APIs, a user with admin caps must first be created::
+
+ radosgw-admin --uid TESTER --display-name "TestUser" --access_key TESTER --secret test123 user create
+ radosgw-admin caps add --uid="TESTER" --caps="roles=*"
+
+
+Create a Role
+-------------
+
+Example::
+
+ POST "<hostname>?Action=CreateRole&RoleName=S3Access&Path=/application_abc/component_xyz/&AssumeRolePolicyDocument=\{\"Version\":\"2012-10-17\",\"Statement\":\[\{\"Effect\":\"Allow\",\"Principal\":\{\"AWS\":\[\"arn:aws:iam:::user/TESTER\"\]\},\"Action\":\[\"sts:AssumeRole\"\]\}\]\}"
+
+.. code-block:: XML
+
+ <role>
+ <id>8f41f4e0-7094-4dc0-ac20-074a881ccbc5</id>
+ <name>S3Access</name>
+ <path>/application_abc/component_xyz/</path>
+ <arn>arn:aws:iam:::role/application_abc/component_xyz/S3Access</arn>
+ <create_date>2018-10-23T07:43:42.811Z</create_date>
+ <max_session_duration>3600</max_session_duration>
+ <assume_role_policy_document>{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["arn:aws:iam:::user/TESTER"]},"Action":["sts:AssumeRole"]}]}</assume_role_policy_document>
+ </role>
+
+
+Delete a Role
+-------------
+
+Example::
+
+ POST "<hostname>?Action=DeleteRole&RoleName=S3Access"
+
+Note: A role can be deleted only when it doesn't have any permission policy attached to it.
+
+Get a Role
+----------
+
+Example::
+
+ POST "<hostname>?Action=GetRole&RoleName=S3Access"
+
+.. code-block:: XML
+
+ <role>
+ <id>8f41f4e0-7094-4dc0-ac20-074a881ccbc5</id>
+ <name>S3Access</name>
+ <path>/application_abc/component_xyz/</path>
+ <arn>arn:aws:iam:::role/application_abc/component_xyz/S3Access</arn>
+ <create_date>2018-10-23T07:43:42.811Z</create_date>
+ <max_session_duration>3600</max_session_duration>
+ <assume_role_policy_document>{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["arn:aws:iam:::user/TESTER"]},"Action":["sts:AssumeRole"]}]}</assume_role_policy_document>
+ </role>
+
+
+List Roles
+----------
+
+Example::
+
+ POST "<hostname>?Action=ListRoles&PathPrefix=/application"
+
+.. code-block:: XML
+
+ <role>
+ <id>8f41f4e0-7094-4dc0-ac20-074a881ccbc5</id>
+ <name>S3Access</name>
+ <path>/application_abc/component_xyz/</path>
+ <arn>arn:aws:iam:::role/application_abc/component_xyz/S3Access</arn>
+ <create_date>2018-10-23T07:43:42.811Z</create_date>
+ <max_session_duration>3600</max_session_duration>
+ <assume_role_policy_document>{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"AWS":["arn:aws:iam:::user/TESTER"]},"Action":["sts:AssumeRole"]}]}</assume_role_policy_document>
+ </role>
+
+
+Update Assume Role Policy Document
+----------------------------------
+
+Example::
+
+ POST "<hostname>?Action=UpdateAssumeRolePolicy&RoleName=S3Access&PolicyDocument=\{\"Version\":\"2012-10-17\",\"Statement\":\[\{\"Effect\":\"Allow\",\"Principal\":\{\"AWS\":\[\"arn:aws:iam:::user/TESTER2\"\]\},\"Action\":\[\"sts:AssumeRole\"\]\}\]\}"
+
+Add/ Update a Policy attached to a Role
+---------------------------------------
+
+Example::
+
+ POST "<hostname>?Action=PutRolePolicy&RoleName=S3Access&PolicyName=Policy1&PolicyDocument=\{\"Version\":\"2012-10-17\",\"Statement\":\[\{\"Effect\":\"Allow\",\"Action\":\[\"s3:CreateBucket\"\],\"Resource\":\"arn:aws:s3:::example_bucket\"\}\]\}"
+
+List Permission Policy Names attached to a Role
+-----------------------------------------------
+
+Example::
+
+ POST "<hostname>?Action=ListRolePolicies&RoleName=S3Access"
+
+.. code-block:: XML
+
+ <PolicyNames>
+ <member>Policy1</member>
+ </PolicyNames>
+
+
+Get Permission Policy attached to a Role
+----------------------------------------
+
+Example::
+
+ POST "<hostname>?Action=GetRolePolicy&RoleName=S3Access&PolicyName=Policy1"
+
+.. code-block:: XML
+
+ <GetRolePolicyResult>
+ <PolicyName>Policy1</PolicyName>
+ <RoleName>S3Access</RoleName>
+ <Permission_policy>{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Action":["s3:CreateBucket"],"Resource":"arn:aws:s3:::example_bucket"}]}</Permission_policy>
+ </GetRolePolicyResult>
+
+
+Delete Policy attached to a Role
+--------------------------------
+
+Example::
+
+ POST "<hostname>?Action=DeleteRolePolicy&RoleName=S3Access&PolicyName=Policy1"
+
+Tag a role
+----------
+A role can have multivalued tags attached to it. These tags can also be passed in as part of the CreateRole REST API.
+Note that AWS does not support multi-valued role tags.
+
+Example::
+
+ POST "<hostname>?Action=TagRole&RoleName=S3Access&Tags.member.1.Key=Department&Tags.member.1.Value=Engineering"
+
+.. code-block:: XML
+
+ <TagRoleResponse>
+ <ResponseMetadata>
+ <RequestId>tx000000000000000000004-00611f337e-1027-default</RequestId>
+ </ResponseMetadata>
+ </TagRoleResponse>
+
+
+List role tags
+--------------
+Lists the tags attached to a role.
+
+Example::
+
+ POST "<hostname>?Action=ListRoleTags&RoleName=S3Access"
+
+.. code-block:: XML
+
+ <ListRoleTagsResponse>
+ <ListRoleTagsResult>
+ <Tags>
+ <member>
+ <Key>Department</Key>
+ <Value>Engineering</Value>
+ </member>
+ </Tags>
+ </ListRoleTagsResult>
+ <ResponseMetadata>
+ <RequestId>tx000000000000000000005-00611f337e-1027-default</RequestId>
+ </ResponseMetadata>
+ </ListRoleTagsResponse>
+
+Delete role tags
+----------------
+Deletes one or more tags attached to a role.
+
+Example::
+
+ POST "<hostname>?Action=UntagRole&RoleName=S3Access&TagKeys.member.1=Department"
+
+.. code-block:: XML
+
+ <UntagRoleResponse>
+ <ResponseMetadata>
+ <RequestId>tx000000000000000000007-00611f337e-1027-default</RequestId>
+ </ResponseMetadata>
+ </UntagRoleResponse>
+
+Update Role
+-----------
+
+Example::
+
+ POST "<hostname>?Action=UpdateRole&RoleName=S3Access&MaxSessionDuration=43200"
+
+.. code-block:: XML
+
+ <UpdateRoleResponse>
+ <UpdateRoleResult>
+ <ResponseMetadata>
+ <RequestId>tx000000000000000000007-00611f337e-1027-default</RequestId>
+ </ResponseMetadata>
+ </UpdateRoleResult>
+ </UpdateRoleResponse>
+
+Note: This API currently can only be used to update max-session-duration.
+
+Sample code for tagging, listing tags and untagging a role
+----------------------------------------------------------
+
+The following is sample code for adding tags to a role, listing tags, and untagging a role using boto3.
+
+.. code-block:: python
+
+ import boto3
+
+ access_key = 'TESTER'
+ secret_key = 'test123'
+
+ iam_client = boto3.client('iam',
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ endpoint_url='http://s3.us-east.localhost:8000',
+ region_name=''
+ )
+
+ policy_document = "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"Federated\":[\"arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart\"]},\"Action\":[\"sts:AssumeRoleWithWebIdentity\"],\"Condition\":{\"StringEquals\":{\"localhost:8080/auth/realms/quickstart:sub\":\"user1\"}}}]}"
+
+ print("\n Creating Role with tags\n")
+ tags_list = [
+ {'Key':'Department','Value':'Engineering'}
+ ]
+ role_response = iam_client.create_role(
+ AssumeRolePolicyDocument=policy_document,
+ Path='/',
+ RoleName='S3Access',
+ Tags=tags_list,
+ )
+
+ print("Adding tags to role\n")
+ response = iam_client.tag_role(
+ RoleName='S3Access',
+ Tags=[
+ {'Key':'CostCenter','Value':'123456'}
+ ]
+ )
+ print("Listing role tags\n")
+ response = iam_client.list_role_tags(
+ RoleName='S3Access'
+ )
+ print(response)
+ print("Untagging role\n")
+ response = iam_client.untag_role(
+ RoleName='S3Access',
+ TagKeys=[
+ 'Department',
+ ]
+ )
+
+
+
diff --git a/doc/radosgw/s3-notification-compatibility.rst b/doc/radosgw/s3-notification-compatibility.rst
new file mode 100644
index 000000000..1627ed0c4
--- /dev/null
+++ b/doc/radosgw/s3-notification-compatibility.rst
@@ -0,0 +1,149 @@
+=====================================
+S3 Bucket Notifications Compatibility
+=====================================
+
+Ceph's `Bucket Notifications`_ API follows `AWS S3 Bucket Notifications API`_. However, some differences exist, as listed below.
+
+
+.. note::
+
+   Compatibility differs depending on which of the above mechanisms is used.
+
+Supported Destinations
+----------------------
+
+AWS supports **SNS**, **SQS**, and **Lambda** as possible destinations (all of them AWS internal destinations).
+Currently, we support **HTTP/S**, **Kafka**, and **AMQP** as destinations.
+
+We use **SNS** ARNs to represent the **HTTP/S**, **Kafka**, and **AMQP** destinations.
+
+Notification Configuration XML
+------------------------------
+
+The following tags (and the tags inside them) are not supported:
+
++-----------------------------------+----------------------------------------------+
+| Tag                               | Remarks                                      |
++===================================+==============================================+
+| ``<QueueConfiguration>`` | not needed, we treat all destinations as SNS |
++-----------------------------------+----------------------------------------------+
+| ``<CloudFunctionConfiguration>`` | not needed, we treat all destinations as SNS |
++-----------------------------------+----------------------------------------------+
+
+REST API Extension
+------------------
+
+Ceph's bucket notification API has the following extensions:
+
+- Deletion of a specific notification, or all notifications on a bucket, using the ``DELETE`` verb
+
+ - In S3, all notifications are deleted when the bucket is deleted, or when an empty notification is set on the bucket
+
+- Getting the information on a specific notification (when more than one exists on a bucket)
+
+ - In S3, it is only possible to fetch all notifications on a bucket
+
+- In addition to filtering based on prefix/suffix of object keys we support:
+
+ - Filtering based on regular expression matching
+
+ - Filtering based on metadata attributes attached to the object
+
+ - Filtering based on object tags
+
+- Each one of the additional filters extends the S3 API, and using it requires extension of the client SDK (unless you are using plain HTTP).
+
+- Overlapping filters are allowed, so that the same event could be sent as more than one notification
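+
+As an illustrative sketch of these extensions (the rule names and values here are placeholders), an extended filter combining regex, metadata, and tag matching could look like:
+
+.. code-block:: XML
+
+ <Filter>
+   <S3Key>
+     <FilterRule><Name>regex</Name><Value>([a-z]+)</Value></FilterRule>
+   </S3Key>
+   <S3Metadata>
+     <FilterRule><Name>x-amz-meta-color</Name><Value>blue</Value></FilterRule>
+   </S3Metadata>
+   <S3Tags>
+     <FilterRule><Name>project</Name><Value>ceph</Value></FilterRule>
+   </S3Tags>
+ </Filter>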
+
+
+Unsupported Fields in the Event Record
+--------------------------------------
+
+The records sent for bucket notifications follow the format described in `Event Message Structure`_.
+However, the ``requestParameters.sourceIPAddress`` field will be sent empty.
+
+
+Event Types
+-----------
+
++--------------------------------------------------------+-----------------------------------------+
+| Event | Note |
++========================================================+=========================================+
+| ``s3:ObjectCreated:*`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectCreated:Put`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectCreated:Post`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectCreated:Copy`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectCreated:CompleteMultipartUpload`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectRemoved:*`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectRemoved:Delete`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectRemoved:DeleteMarkerCreated`` | Supported |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectLifecycle:Expiration:Current`` | Ceph extension |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectLifecycle:Expiration:NonCurrent`` | Ceph extension |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectLifecycle:Expiration:DeleteMarker`` | Ceph extension |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectLifecycle:Expiration:AbortMultipartUpload`` | Defined, Ceph extension (not generated) |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectLifecycle:Transition:Current`` | Ceph extension |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectLifecycle:Transition:NonCurrent`` | Ceph extension |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectSynced:*`` | Ceph extension |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectSynced:Create``                             | Ceph extension                          |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectSynced:Delete`` | Defined, Ceph extension (not generated) |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectSynced:DeletionMarkerCreated`` | Defined, Ceph extension (not generated) |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectRestore:Post`` | Not applicable |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ObjectRestore:Complete`` | Not applicable |
++--------------------------------------------------------+-----------------------------------------+
+| ``s3:ReducedRedundancyLostObject`` | Not applicable |
++--------------------------------------------------------+-----------------------------------------+
+
+.. note::
+
+ The ``s3:ObjectRemoved:DeleteMarkerCreated`` event presents information on the latest version of the object
+
+.. note::
+
+   In the case of a multipart upload, an ``ObjectCreated:CompleteMultipartUpload`` notification will be sent at the end of the process.
+
+.. note::
+
+ The ``s3:ObjectSynced:Create`` event is sent when an object successfully syncs to a zone. It must be explicitly set for each zone.
+
+Topic Configuration
+-------------------
+In the case of bucket notifications, the topic management API is derived from the `AWS Simple Notification Service API`_.
+Note that most of that API is not applicable to Ceph; only the following actions are implemented:
+
+ - ``CreateTopic``
+ - ``DeleteTopic``
+ - ``ListTopics``
+
+We also have the following extensions to topic configuration:
+
+ - In ``GetTopic`` we allow fetching a specific topic, instead of all user topics
+ - In ``CreateTopic``
+
+ - we allow setting endpoint attributes
+ - we allow setting opaque data that will be sent to the endpoint in the notification
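+
+As an illustrative sketch of the ``CreateTopic`` extension (the topic name and endpoint are placeholders), endpoint attributes can be passed in the SNS-style ``Attributes`` map:
+
+::
+
+ POST "<hostname>?Action=CreateTopic&Name=mytopic&Attributes.entry.1.key=push-endpoint&Attributes.entry.1.value=http://localhost:9000"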
+
+
+.. _AWS Simple Notification Service API: https://docs.aws.amazon.com/sns/latest/api/API_Operations.html
+.. _AWS S3 Bucket Notifications API: https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html
+.. _Event Message Structure: https://docs.aws.amazon.com/AmazonS3/latest/dev/notification-content-structure.html
+.. _`Bucket Notifications`: ../notifications
+.. _`boto3 SDK filter extensions`: https://github.com/ceph/ceph/tree/main/examples/rgw/boto3
diff --git a/doc/radosgw/s3.rst b/doc/radosgw/s3.rst
new file mode 100644
index 000000000..cb5eb3adb
--- /dev/null
+++ b/doc/radosgw/s3.rst
@@ -0,0 +1,98 @@
+.. _radosgw s3:
+
+============================
+ Ceph Object Gateway S3 API
+============================
+
+Ceph supports a RESTful API that is compatible with the basic data access model of the `Amazon S3 API`_.
+
+API
+---
+
+.. toctree::
+ :maxdepth: 1
+
+ Common <s3/commons>
+ Authentication <s3/authentication>
+ Service Ops <s3/serviceops>
+ Bucket Ops <s3/bucketops>
+ Object Ops <s3/objectops>
+ C++ <s3/cpp>
+ C# <s3/csharp>
+ Java <s3/java>
+ Perl <s3/perl>
+ PHP <s3/php>
+ Python <s3/python>
+ Ruby <s3/ruby>
+
+
+Features Support
+----------------
+
+The following table describes the support status for current Amazon S3 functional features:
+
++---------------------------------+-----------------+----------------------------------------+
+| Feature | Status | Remarks |
++=================================+=================+========================================+
+| **List Buckets** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Delete Bucket** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Create Bucket** | Supported | Different set of canned ACLs |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Lifecycle** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Replication** | Partial | Permitted only across zones |
++---------------------------------+-----------------+----------------------------------------+
+| **Policy (Buckets, Objects)** | Supported | ACLs & bucket policies are supported |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Website** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket ACLs (Get, Put)** | Supported | Different set of canned ACLs |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Location** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Notification** | Supported | See `S3 Notification Compatibility`_ |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Object Versions** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Bucket Info (HEAD)** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Request Payment** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Put Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Delete Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Object ACLs (Get, Put)** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Object Info (HEAD)** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **POST Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Copy Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Multipart Uploads** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Object Tagging** | Supported | See :ref:`tag_policy` for Policy verbs |
++---------------------------------+-----------------+----------------------------------------+
+| **Bucket Tagging** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Storage Class** | Supported | See :ref:`storage_classes` |
++---------------------------------+-----------------+----------------------------------------+
+
+Unsupported Header Fields
+-------------------------
+
+The following common request header fields are not supported:
+
++----------------------------+------------+
+| Name | Type |
++============================+============+
+| **x-amz-id-2** | Response |
++----------------------------+------------+
+
+.. _Amazon S3 API: http://docs.aws.amazon.com/AmazonS3/latest/API/APIRest.html
+.. _S3 Notification Compatibility: ../s3-notification-compatibility
diff --git a/doc/radosgw/s3/authentication.rst b/doc/radosgw/s3/authentication.rst
new file mode 100644
index 000000000..64747cde2
--- /dev/null
+++ b/doc/radosgw/s3/authentication.rst
@@ -0,0 +1,235 @@
+=========================
+ Authentication and ACLs
+=========================
+
+Requests to the RADOS Gateway (RGW) can be either authenticated or
+unauthenticated. RGW assumes unauthenticated requests are sent by an anonymous
+user. RGW supports canned ACLs.
+
+Authentication
+--------------
+Authenticating a request requires including an access key and a Hash-based
+Message Authentication Code (HMAC) in the request before it is sent to the
+RGW server. RGW uses an S3-compatible authentication approach.
+
+::
+
+ HTTP/1.1
+ PUT /buckets/bucket/object.mpeg
+ Host: cname.domain.com
+ Date: Mon, 2 Jan 2012 00:01:01 +0000
+ Content-Encoding: mpeg
+ Content-Length: 9999999
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+In the foregoing example, replace ``{access-key}`` with the value for your access
+key ID followed by a colon (``:``). Replace ``{hash-of-header-and-secret}`` with
+a hash of the header string and the secret corresponding to the access key ID.
+
+To generate the hash of the header string and secret, you must:
+
+#. Get the value of the header string.
+#. Normalize the request header string into canonical form.
+#. Generate an HMAC using a SHA-1 hashing algorithm.
+ See `RFC 2104`_ and `HMAC`_ for details.
+#. Encode the ``hmac`` result as base-64.
+
+To normalize the header into canonical form:
+
+#. Get all fields beginning with ``x-amz-``.
+#. Ensure that the fields are all lowercase.
+#. Sort the fields lexicographically.
+#. Combine multiple instances of the same field name into a
+ single field and separate the field values with a comma.
+#. Replace white space and line breaks in field values with a single space.
+#. Remove white space before and after colons.
+#. Append a new line after each field.
+#. Merge the fields back into the header.
+
+Replace the ``{hash-of-header-and-secret}`` with the base-64 encoded HMAC string.
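+
+The following is a minimal sketch of this signing scheme in Python. The credentials are placeholders, and the string-to-sign omits ``x-amz-`` headers because the example request above has none:
+
+.. code-block:: python
+
+ import base64
+ import hmac
+ from hashlib import sha1
+
+ access_key = '{access-key}'   # placeholder
+ secret_key = '{secret-key}'   # placeholder
+
+ # Canonical string for the PUT request above:
+ # VERB, Content-MD5, Content-Type, Date, canonicalized resource.
+ string_to_sign = ('PUT\n'
+                   '\n'                                # Content-MD5 (empty)
+                   '\n'                                # Content-Type (empty)
+                   'Mon, 2 Jan 2012 00:01:01 +0000\n'  # Date header
+                   '/buckets/bucket/object.mpeg')      # canonicalized resource
+
+ # HMAC-SHA1 over the string-to-sign, keyed with the secret, then base-64.
+ signature = base64.b64encode(
+     hmac.new(secret_key.encode(), string_to_sign.encode(), sha1).digest()
+ ).decode()
+
+ print('Authorization: AWS {}:{}'.format(access_key, signature))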
+
+Authentication against OpenStack Keystone
+-----------------------------------------
+
+In a radosgw instance that is configured with authentication against
+OpenStack Keystone, it is possible to use Keystone as an authoritative
+source for S3 API authentication. To do so, you must set:
+
+* the ``rgw keystone`` configuration options explained in :doc:`../keystone`,
+* ``rgw s3 auth use keystone = true``.
+
+In addition, a user wishing to use the S3 API must obtain an AWS-style
+access key and secret key. They can do so with the ``openstack ec2
+credentials create`` command::
+
+ $ openstack --os-interface public ec2 credentials create
+ +------------+---------------------------------------------------------------------------------------------------------------------------------------------+
+ | Field | Value |
+ +------------+---------------------------------------------------------------------------------------------------------------------------------------------+
+ | access | c921676aaabbccdeadbeef7e8b0eeb2c |
+ | links | {u'self': u'https://auth.example.com:5000/v3/users/7ecbebaffeabbddeadbeefa23267ccbb24/credentials/OS-EC2/c921676aaabbccdeadbeef7e8b0eeb2c'} |
+ | project_id | 5ed51981aab4679851adeadbeef6ebf7 |
+ | secret | ******************************** |
+ | trust_id | None |
+ | user_id | 7ecbebaffeabbddeadbeefa23267cc24 |
+ +------------+---------------------------------------------------------------------------------------------------------------------------------------------+
+
+The access and secret key generated in this way can then be used for S3
+API access to radosgw.
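+
+For example, a minimal boto3 sketch using such credentials (the endpoint URL and the secret are placeholders):
+
+.. code-block:: python
+
+ import boto3
+
+ s3 = boto3.client('s3',
+     aws_access_key_id='c921676aaabbccdeadbeef7e8b0eeb2c',  # "access" above
+     aws_secret_access_key='replace-with-the-ec2-secret',   # "secret" above
+     endpoint_url='http://radosgw.example.com:8000')
+
+ print(s3.list_buckets()['Buckets'])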
+
+.. note:: Consider that most production radosgw deployments
+ authenticating against OpenStack Keystone are also set up
+ for :doc:`../multitenancy`, for which special
+ considerations apply with respect to S3 signed URLs and
+ public read ACLs.
+
+Access Control Lists (ACLs)
+---------------------------
+
+RGW supports S3-compatible ACL functionality. An ACL is a list of access grants
+that specify which operations a user can perform on a bucket or on an object.
+Each grant has a different meaning when applied to a bucket versus applied to
+an object:
+
++------------------+--------------------------------------------------------+----------------------------------------------+
+| Permission | Bucket | Object |
++==================+========================================================+==============================================+
+| ``READ`` | Grantee can list the objects in the bucket. | Grantee can read the object. |
++------------------+--------------------------------------------------------+----------------------------------------------+
+| ``WRITE`` | Grantee can write or delete objects in the bucket. | N/A |
++------------------+--------------------------------------------------------+----------------------------------------------+
+| ``READ_ACP`` | Grantee can read bucket ACL. | Grantee can read the object ACL. |
++------------------+--------------------------------------------------------+----------------------------------------------+
+| ``WRITE_ACP`` | Grantee can write bucket ACL. | Grantee can write to the object ACL. |
++------------------+--------------------------------------------------------+----------------------------------------------+
+| ``FULL_CONTROL`` | Grantee has full permissions for object in the bucket. | Grantee can read or write to the object ACL. |
++------------------+--------------------------------------------------------+----------------------------------------------+
+
+Internally, S3 operations are mapped to ACL permissions thus:
+
++---------------------------------------+---------------+
+| Operation | Permission |
++=======================================+===============+
+| ``s3:GetObject`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectTorrent`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectVersion`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectVersionTorrent`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectTagging`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectVersionTagging`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:ListAllMyBuckets`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:ListBucket`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:ListBucketMultipartUploads`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:ListBucketVersions`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:ListMultipartUploadParts`` | ``READ`` |
++---------------------------------------+---------------+
+| ``s3:AbortMultipartUpload`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:CreateBucket`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:DeleteBucket`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:DeleteObject`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:DeleteObjectVersion``            | ``WRITE``     |
++---------------------------------------+---------------+
+| ``s3:PutObject`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:PutObjectTagging`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:PutObjectVersionTagging`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:DeleteObjectTagging`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:DeleteObjectVersionTagging`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:RestoreObject`` | ``WRITE`` |
++---------------------------------------+---------------+
+| ``s3:GetAccelerateConfiguration`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketAcl`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketCORS`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketLocation`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketLogging`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketNotification`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketPolicy`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketRequestPayment`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketTagging`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketVersioning`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketWebsite`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetLifecycleConfiguration`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectAcl`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetObjectVersionAcl`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetReplicationConfiguration`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:GetBucketEncryption`` | ``READ_ACP`` |
++---------------------------------------+---------------+
+| ``s3:DeleteBucketPolicy`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:DeleteBucketWebsite`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:DeleteReplicationConfiguration`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutAccelerateConfiguration`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketAcl`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketCORS`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketLogging`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketNotification`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketPolicy`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketRequestPayment`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketTagging`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketVersioning``            | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketWebsite`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutLifecycleConfiguration`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutObjectAcl`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutObjectVersionAcl`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutReplicationConfiguration`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+| ``s3:PutBucketEncryption`` | ``WRITE_ACP`` |
++---------------------------------------+---------------+
+
+Some mappings (e.g. ``s3:CreateBucket`` to ``WRITE``) are not
+applicable to S3 operations, but are required to allow Swift and S3 to
+access the same resources when things like Swift user ACLs are in
+play. This is one of the many reasons that you should use S3 bucket
+policies rather than S3 ACLs when possible.
+
+
+.. _RFC 2104: http://www.ietf.org/rfc/rfc2104.txt
+.. _HMAC: https://en.wikipedia.org/wiki/HMAC
diff --git a/doc/radosgw/s3/bucketops.rst b/doc/radosgw/s3/bucketops.rst
new file mode 100644
index 000000000..17da3a935
--- /dev/null
+++ b/doc/radosgw/s3/bucketops.rst
@@ -0,0 +1,706 @@
+===================
+ Bucket Operations
+===================
+
+PUT Bucket
+----------
+Creates a new bucket. To create a bucket, you must have a user ID and a valid AWS Access Key ID to authenticate requests. You may not
+create buckets as an anonymous user.
+
+Constraints
+~~~~~~~~~~~
+In general, bucket names should follow domain name constraints.
+
+- Bucket names must be unique.
+- Bucket names cannot be formatted as IP addresses.
+- Bucket names can be between 3 and 63 characters long.
+- Bucket names must not contain uppercase characters or underscores.
+- Bucket names must start with a lowercase letter or number.
+- Bucket names must be a series of one or more labels. Adjacent labels are separated by a single period (.). Bucket names can contain lowercase letters, numbers, and hyphens. Each label must start and end with a lowercase letter or a number.
+
+.. note:: The above constraints are relaxed if the option 'rgw_relaxed_s3_bucket_names' is set to true. The bucket names must still be unique and cannot be formatted as an IP address, but they may contain letters, numbers, periods, dashes, and underscores, and may be up to 255 characters long.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket} HTTP/1.1
+ Host: cname.domain.com
+ x-amz-acl: public-read-write
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+Parameters
+~~~~~~~~~~
+
+
++---------------+----------------------+-----------------------------------------------------------------------------+------------+
+| Name | Description | Valid Values | Required |
++===============+======================+=============================================================================+============+
+| ``x-amz-acl`` | Canned ACLs. | ``private``, ``public-read``, ``public-read-write``, ``authenticated-read`` | No |
++---------------+----------------------+-----------------------------------------------------------------------------+------------+
+| ``x-amz-bucket-object-lock-enabled`` | Enable object lock on bucket. | ``true``, ``false`` | No |
++--------------------------------------+-------------------------------+---------------------------------------------+------------+
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++-------------------------------+-----------+----------------------------------------------------------------+
+| Name | Type | Description |
++===============================+===========+================================================================+
+| ``CreateBucketConfiguration`` | Container | A container for the bucket configuration. |
++-------------------------------+-----------+----------------------------------------------------------------+
+| ``LocationConstraint``        | String    | A zonegroup api name, with optional :ref:`s3_bucket_placement` |
++-------------------------------+-----------+----------------------------------------------------------------+
+
+
+HTTP Response
+~~~~~~~~~~~~~
+
+If the bucket name is unique, within constraints, and unused, the operation will succeed.
+If a bucket with the same name already exists and the requesting user is the bucket owner, the operation will succeed.
+If the bucket name is already in use by another user, the operation will fail.
+
++---------------+-----------------------+----------------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+=======================+==========================================================+
+| ``409`` | BucketAlreadyExists | Bucket already exists under different user's ownership. |
++---------------+-----------------------+----------------------------------------------------------+
+
+DELETE Bucket
+-------------
+
+Deletes a bucket. You can reuse bucket names following a successful bucket removal.
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{bucket} HTTP/1.1
+ Host: cname.domain.com
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+HTTP Response
+~~~~~~~~~~~~~
+
++---------------+---------------+------------------+
+| HTTP Status | Status Code | Description |
++===============+===============+==================+
+| ``204`` | No Content | Bucket removed. |
++---------------+---------------+------------------+
+
+GET Bucket
+----------
+Returns a list of the objects in a bucket.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}?max-keys=25 HTTP/1.1
+ Host: cname.domain.com
+
+Parameters
+~~~~~~~~~~
+
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++=====================+===========+=================================================================================================+
+| ``prefix`` | String | Only returns objects that contain the specified prefix. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``delimiter`` | String | The delimiter between the prefix and the rest of the object name. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``marker`` | String | A beginning index for the list of objects returned. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``max-keys`` | Integer | The maximum number of keys to return. Default is 1000. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+| ``allow-unordered`` | Boolean | Non-standard extension. Allows results to be returned unordered. Cannot be used with delimiter. |
++---------------------+-----------+-------------------------------------------------------------------------------------------------+
+
+HTTP Response
+~~~~~~~~~~~~~
+
++---------------+---------------+--------------------+
+| HTTP Status | Status Code | Description |
++===============+===============+====================+
+| ``200`` | OK | Buckets retrieved |
++---------------+---------------+--------------------+
+
+Bucket Response Entities
+~~~~~~~~~~~~~~~~~~~~~~~~
+``GET /{bucket}`` returns a ``ListBucketResult`` container for the bucket's objects, with the following fields.
+
++------------------------+-----------+----------------------------------------------------------------------------------+
+| Name | Type | Description |
++========================+===========+==================================================================================+
+| ``ListBucketResult`` | Entity | The container for the list of objects. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``Name`` | String | The name of the bucket whose contents will be returned. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``Prefix`` | String | A prefix for the object keys. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``Marker`` | String | A beginning index for the list of objects returned. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``MaxKeys`` | Integer | The maximum number of keys returned. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``Delimiter`` | String | If set, objects with the same prefix will appear in the ``CommonPrefixes`` list. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``IsTruncated`` | Boolean | If ``true``, only a subset of the bucket's contents were returned. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+| ``CommonPrefixes`` | Container | If multiple objects contain the same prefix, they will appear in this list. |
++------------------------+-----------+----------------------------------------------------------------------------------+
+
+Object Response Entities
+~~~~~~~~~~~~~~~~~~~~~~~~
+The ``ListBucketResult`` contains objects, where each object is within a ``Contents`` container.
+
++------------------------+-----------+------------------------------------------+
+| Name | Type | Description |
++========================+===========+==========================================+
+| ``Contents`` | Object | A container for the object. |
++------------------------+-----------+------------------------------------------+
+| ``Key`` | String | The object's key. |
++------------------------+-----------+------------------------------------------+
+| ``LastModified`` | Date | The object's last-modified date/time. |
++------------------------+-----------+------------------------------------------+
+| ``ETag``               | String    | An MD5 hash of the object (entity tag).  |
++------------------------+-----------+------------------------------------------+
+| ``Size`` | Integer | The object's size. |
++------------------------+-----------+------------------------------------------+
+| ``StorageClass`` | String | Should always return ``STANDARD``. |
++------------------------+-----------+------------------------------------------+
+| ``Type`` | String | ``Appendable`` or ``Normal``. |
++------------------------+-----------+------------------------------------------+
+
+Get Bucket Location
+-------------------
+Retrieves the bucket's region. The user needs to be the bucket owner
+to call this. A bucket can be constrained to a region by providing
+``LocationConstraint`` during a PUT request.
+
+Syntax
+~~~~~~
+Add the ``location`` subresource to the bucket resource as shown below.
+
+::
+
+ GET /{bucket}?location HTTP/1.1
+ Host: cname.domain.com
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++------------------------+-----------+------------------------------------------+
+| Name | Type | Description |
++========================+===========+==========================================+
+| ``LocationConstraint`` | String    | The region where the bucket resides;     |
+|                        |           | empty string for the default region      |
++------------------------+-----------+------------------------------------------+
+
+
+
+Get Bucket ACL
+--------------
+Retrieves the bucket access control list. The user needs to be the bucket
+owner or to have been granted ``READ_ACP`` permission on the bucket.
+
+Syntax
+~~~~~~
+Add the ``acl`` subresource to the bucket request as shown below.
+
+::
+
+ GET /{bucket}?acl HTTP/1.1
+ Host: cname.domain.com
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++===========================+=============+==============================================================================================+
+| ``AccessControlPolicy`` | Container | A container for the response. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``AccessControlList`` | Container | A container for the ACL information. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Owner`` | Container | A container for the bucket owner's ``ID`` and ``DisplayName``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``ID`` | String | The bucket owner's ID. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``DisplayName`` | String | The bucket owner's display name. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grant`` | Container | A container for ``Grantee`` and ``Permission``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grantee`` | Container | A container for the ``DisplayName`` and ``ID`` of the user receiving a grant of permission. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Permission``            | String      | The permission given to the ``Grantee``.                                                    |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+
+PUT Bucket ACL
+--------------
+Sets an access control list on an existing bucket. The user needs to be the bucket
+owner or to have been granted ``WRITE_ACP`` permission on the bucket.
+
+Syntax
+~~~~~~
+Add the ``acl`` subresource to the bucket request as shown below.
+
+::
+
+ PUT /{bucket}?acl HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++===========================+=============+==============================================================================================+
+| ``AccessControlPolicy`` | Container | A container for the request. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``AccessControlList`` | Container | A container for the ACL information. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Owner`` | Container | A container for the bucket owner's ``ID`` and ``DisplayName``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``ID`` | String | The bucket owner's ID. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``DisplayName`` | String | The bucket owner's display name. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grant`` | Container | A container for ``Grantee`` and ``Permission``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grantee`` | Container | A container for the ``DisplayName`` and ``ID`` of the user receiving a grant of permission. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Permission``            | String      | The permission given to the ``Grantee``.                                                    |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+
+List Bucket Multipart Uploads
+-----------------------------
+
+``GET /?uploads`` returns a list of the current in-progress multipart uploads, that is, uploads that the application has initiated but that
+the service has not yet completed.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}?uploads HTTP/1.1
+
+Parameters
+~~~~~~~~~~
+
+You may specify parameters for ``GET /{bucket}?uploads``, but none of them are required.
+
++------------------------+-----------+--------------------------------------------------------------------------------------+
+| Name | Type | Description |
++========================+===========+======================================================================================+
+| ``prefix``             | String    | Returns in-progress uploads whose keys contain the specified prefix.                |
++------------------------+-----------+--------------------------------------------------------------------------------------+
+| ``delimiter`` | String | The delimiter between the prefix and the rest of the object name. |
++------------------------+-----------+--------------------------------------------------------------------------------------+
+| ``key-marker`` | String | The beginning marker for the list of uploads. |
++------------------------+-----------+--------------------------------------------------------------------------------------+
+| ``max-keys`` | Integer | The maximum number of in-progress uploads. The default is 1000. |
++------------------------+-----------+--------------------------------------------------------------------------------------+
+| ``max-uploads``        | Integer   | The maximum number of multipart uploads. The range is 1-1000. The default is 1000.  |
++------------------------+-----------+--------------------------------------------------------------------------------------+
+| ``upload-id-marker``   | String    | Ignored if ``key-marker`` is not specified. Specifies the ``ID`` of the first       |
+|                        |           | upload to list in lexicographical order at or following the ``ID``.                 |
++------------------------+-----------+--------------------------------------------------------------------------------------+
+
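+A hypothetical request that lists up to 100 in-progress uploads whose keys
+begin with ``photos/`` might look like this (the bucket and host names are
+placeholders)::
+
+    GET /mybucket?uploads&prefix=photos/&max-uploads=100 HTTP/1.1
+    Host: cname.domain.com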
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++=========================================+=============+==========================================================================================================+
+| ``ListMultipartUploadsResult`` | Container | A container for the results. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``ListMultipartUploadsResult.Prefix`` | String | The prefix specified by the ``prefix`` request parameter (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Bucket``                               | String      | The bucket whose in-progress multipart uploads are listed.                                               |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``KeyMarker`` | String | The key marker specified by the ``key-marker`` request parameter (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``UploadIdMarker`` | String | The marker specified by the ``upload-id-marker`` request parameter (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``NextKeyMarker`` | String | The key marker to use in a subsequent request if ``IsTruncated`` is ``true``. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``NextUploadIdMarker`` | String | The upload ID marker to use in a subsequent request if ``IsTruncated`` is ``true``. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``MaxUploads`` | Integer | The max uploads specified by the ``max-uploads`` request parameter. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Delimiter`` | String | If set, objects with the same prefix will appear in the ``CommonPrefixes`` list. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``IsTruncated`` | Boolean | If ``true``, only a subset of the bucket's upload contents were returned. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Upload``                               | Container   | A container for ``Key``, ``UploadId``, ``Initiator``, ``Owner``, ``StorageClass``, and ``Initiated``.    |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Key`` | String | The key of the object once the multipart upload is complete. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``UploadId`` | String | The ``ID`` that identifies the multipart upload. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Initiator`` | Container | Contains the ``ID`` and ``DisplayName`` of the user who initiated the upload. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``DisplayName`` | String | The initiator's display name. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``ID`` | String | The initiator's ID. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Owner`` | Container | A container for the ``ID`` and ``DisplayName`` of the user who owns the uploaded object. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``StorageClass`` | String | The method used to store the resulting object. ``STANDARD`` or ``REDUCED_REDUNDANCY`` |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Initiated`` | Date | The date and time the user initiated the upload. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``CommonPrefixes`` | Container | If multiple objects contain the same prefix, they will appear in this list. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``CommonPrefixes.Prefix`` | String | The substring of the key after the prefix as defined by the ``prefix`` request parameter. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+
+Enable/Suspend Bucket Versioning
+--------------------------------
+
+``PUT /?versioning`` sets the versioning state of an existing bucket. To set the versioning state, you must be the bucket owner.
+
+You can set the versioning state with one of the following values:
+
+- ``Enabled``: Enables versioning for the objects in the bucket. All objects added to the bucket receive a unique version ID.
+- ``Suspended``: Disables versioning for the objects in the bucket. All objects added to the bucket receive the version ID ``null``.
+
+If the versioning state has never been set on a bucket, it has no versioning state; a GET versioning request does not return a versioning state value.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}?versioning HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++-----------------------------+-----------+---------------------------------------------------------------------------+
+| Name | Type | Description |
++=============================+===========+===========================================================================+
+| ``VersioningConfiguration`` | Container | A container for the request. |
++-----------------------------+-----------+---------------------------------------------------------------------------+
+| ``Status`` | String | Sets the versioning state of the bucket. Valid Values: Suspended/Enabled |
++-----------------------------+-----------+---------------------------------------------------------------------------+
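+
+For example, the following request enables versioning on a bucket (the bucket
+name is a placeholder)::
+
+    PUT /mybucket?versioning HTTP/1.1
+
+    <VersioningConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+      <Status>Enabled</Status>
+    </VersioningConfiguration>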
+
+Put Bucket Object Lock
+----------------------
+
+Places an Object Lock configuration on the specified bucket. The rule specified in the Object Lock configuration will be
+applied by default to every new object placed in the specified bucket.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}?object-lock HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| Name | Type | Description | Required |
++=============================+=============+========================================================================================+==========+
+| ``ObjectLockConfiguration`` | Container | A container for the request. | Yes |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``ObjectLockEnabled`` | String | Indicates whether this bucket has an Object Lock configuration enabled. | Yes |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Rule`` | Container | The Object Lock rule in place for the specified bucket. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``DefaultRetention`` | Container | The default retention period applied to new objects placed in the specified bucket. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Mode`` | String | The default Object Lock retention mode. Valid Values: GOVERNANCE/COMPLIANCE | Yes |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Days`` | Integer | The number of days specified for the default retention period. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Years`` | Integer | The number of years specified for the default retention period. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
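+
+For example, the following request configures Object Lock on the bucket with
+a default 30-day ``GOVERNANCE`` retention period (the bucket name is a
+placeholder)::
+
+    PUT /mybucket?object-lock HTTP/1.1
+
+    <ObjectLockConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+      <ObjectLockEnabled>Enabled</ObjectLockEnabled>
+      <Rule>
+        <DefaultRetention>
+          <Mode>GOVERNANCE</Mode>
+          <Days>30</Days>
+        </DefaultRetention>
+      </Rule>
+    </ObjectLockConfiguration>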
+
+HTTP Response
+~~~~~~~~~~~~~
+
+If Object Lock was not enabled when the bucket was created, the operation fails.
+
++---------------+-----------------------+----------------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+=======================+==========================================================+
+| ``400`` | MalformedXML | The XML is not well-formed |
++---------------+-----------------------+----------------------------------------------------------+
+| ``409`` | InvalidBucketState | The bucket object lock is not enabled |
++---------------+-----------------------+----------------------------------------------------------+
+
+Get Bucket Object Lock
+----------------------
+
+Gets the Object Lock configuration for a bucket. The rule specified in the Object Lock configuration will be applied by
+default to every new object placed in the specified bucket.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}?object-lock HTTP/1.1
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| Name | Type | Description | Required |
++=============================+=============+========================================================================================+==========+
+| ``ObjectLockConfiguration`` | Container   | A container for the response.                                                          | Yes      |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``ObjectLockEnabled`` | String | Indicates whether this bucket has an Object Lock configuration enabled. | Yes |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Rule`` | Container | The Object Lock rule in place for the specified bucket. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``DefaultRetention`` | Container | The default retention period applied to new objects placed in the specified bucket. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Mode`` | String | The default Object Lock retention mode. Valid Values: GOVERNANCE/COMPLIANCE | Yes |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Days`` | Integer | The number of days specified for the default retention period. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+| ``Years`` | Integer | The number of years specified for the default retention period. | No |
++-----------------------------+-------------+----------------------------------------------------------------------------------------+----------+
+
+Create Notification
+-------------------
+
+Create a notification for a specific bucket, publishing the bucket's events to a topic.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}?notification HTTP/1.1
+
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
+Parameters are XML encoded in the body of the request, in the following format:
+
+::
+
+ <NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+ <TopicConfiguration>
+ <Id></Id>
+ <Topic></Topic>
+ <Event></Event>
+ <Filter>
+ <S3Key>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Key>
+ <S3Metadata>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Metadata>
+ <S3Tags>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Tags>
+ </Filter>
+ </TopicConfiguration>
+ </NotificationConfiguration>
+
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| Name | Type | Description | Required |
++===============================+===========+======================================================================================+==========+
+| ``NotificationConfiguration`` | Container | Holding list of ``TopicConfiguration`` entities | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``TopicConfiguration`` | Container | Holding ``Id``, ``Topic`` and list of ``Event`` entities | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Id`` | String | Name of the notification | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Topic`` | String | Topic ARN. Topic must be created beforehand | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Event``                     | String    | For a list of supported events, see `S3 Notification Compatibility`_.               | No       |
+|                               |           | Multiple ``Event`` entities can be used. If omitted, all events are handled.        |          |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Filter`` | Container | Holding ``S3Key``, ``S3Metadata`` and ``S3Tags`` entities | No |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``S3Key`` | Container | Holding a list of ``FilterRule`` entities, for filtering based on object key. | No |
+|                               |           | At most 3 entities may be in the list, with ``Name`` being ``prefix``, ``suffix`` or |          |
+| | | ``regex``. All filter rules in the list must match for the filter to match. | |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``S3Metadata`` | Container | Holding a list of ``FilterRule`` entities, for filtering based on object metadata. | No |
+| | | All filter rules in the list must match the metadata defined on the object. However, | |
+|                               |           | the object still matches if it has other metadata entries not listed in the filter. |          |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``S3Tags`` | Container | Holding a list of ``FilterRule`` entities, for filtering based on object tags. | No |
+| | | All filter rules in the list must match the tags defined on the object. However, | |
+|                               |           | the object still matches if it has other tags not listed in the filter.             |          |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``S3Key.FilterRule`` | Container | Holding ``Name`` and ``Value`` entities. ``Name`` would be: ``prefix``, ``suffix`` | Yes |
+| | | or ``regex``. The ``Value`` would hold the key prefix, key suffix or a regular | |
+| | | expression for matching the key, accordingly. | |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``S3Metadata.FilterRule`` | Container | Holding ``Name`` and ``Value`` entities. ``Name`` would be the name of the metadata | Yes |
+| | | attribute (e.g. ``x-amz-meta-xxx``). The ``Value`` would be the expected value for | |
+| | | this attribute. | |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``S3Tags.FilterRule`` | Container | Holding ``Name`` and ``Value`` entities. ``Name`` would be the tag key, | Yes |
+| | | and ``Value`` would be the tag value. | |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+
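+As an illustration, a filled-in configuration might look like the following.
+The notification name, topic ARN, and prefix value are placeholders; the
+topic itself must already exist::
+
+    <NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+      <TopicConfiguration>
+        <Id>mynotif1</Id>
+        <Topic>arn:aws:sns:default::mytopic</Topic>
+        <Event>s3:ObjectCreated:*</Event>
+        <Filter>
+          <S3Key>
+            <FilterRule>
+              <Name>prefix</Name>
+              <Value>images/</Value>
+            </FilterRule>
+          </S3Key>
+        </Filter>
+      </TopicConfiguration>
+    </NotificationConfiguration>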
+
+HTTP Response
+~~~~~~~~~~~~~
+
++---------------+-----------------------+----------------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+=======================+==========================================================+
+| ``400`` | MalformedXML | The XML is not well-formed |
++---------------+-----------------------+----------------------------------------------------------+
+| ``400`` | InvalidArgument | Missing Id; Missing/Invalid Topic ARN; Invalid Event |
++---------------+-----------------------+----------------------------------------------------------+
+| ``404`` | NoSuchBucket | The bucket does not exist |
++---------------+-----------------------+----------------------------------------------------------+
+| ``404`` | NoSuchKey | The topic does not exist |
++---------------+-----------------------+----------------------------------------------------------+
+
+
+Delete Notification
+-------------------
+
+Delete a specific, or all, notifications from a bucket.
+
+.. note::
+
+ - Notification deletion is an extension to the S3 notification API
+ - When the bucket is deleted, any notification defined on it is also deleted
+ - Deleting an unknown notification (e.g. double delete) is not considered an error
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{bucket}?notification[=<notification-id>] HTTP/1.1
+
+
+Parameters
+~~~~~~~~~~
+
++------------------------+-----------+----------------------------------------------------------------------------------------+
+| Name | Type | Description |
++========================+===========+========================================================================================+
+| ``notification-id`` | String | Name of the notification. If not provided, all notifications on the bucket are deleted |
++------------------------+-----------+----------------------------------------------------------------------------------------+
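+
+For example, a request deleting only the notification named ``mynotif1``
+(names are placeholders)::
+
+    DELETE /mybucket?notification=mynotif1 HTTP/1.1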
+
+HTTP Response
+~~~~~~~~~~~~~
+
++---------------+-----------------------+----------------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+=======================+==========================================================+
+| ``404`` | NoSuchBucket | The bucket does not exist |
++---------------+-----------------------+----------------------------------------------------------+
+
+Get/List Notification
+---------------------
+
+Get a specific notification, or list all notifications configured on a bucket.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}?notification[=<notification-id>] HTTP/1.1
+
+
+Parameters
+~~~~~~~~~~
+
++------------------------+-----------+----------------------------------------------------------------------------------------+
+| Name | Type | Description |
++========================+===========+========================================================================================+
+| ``notification-id`` | String | Name of the notification. If not provided, all notifications on the bucket are listed |
++------------------------+-----------+----------------------------------------------------------------------------------------+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+The response is XML encoded in the body, in the following format:
+
+::
+
+ <NotificationConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+ <TopicConfiguration>
+ <Id></Id>
+ <Topic></Topic>
+ <Event></Event>
+ <Filter>
+ <S3Key>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Key>
+ <S3Metadata>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Metadata>
+ <S3Tags>
+ <FilterRule>
+ <Name></Name>
+ <Value></Value>
+ </FilterRule>
+ </S3Tags>
+ </Filter>
+ </TopicConfiguration>
+ </NotificationConfiguration>
+
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| Name | Type | Description | Required |
++===============================+===========+======================================================================================+==========+
+| ``NotificationConfiguration`` | Container | Holding list of ``TopicConfiguration`` entities | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``TopicConfiguration`` | Container | Holding ``Id``, ``Topic`` and list of ``Event`` entities | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Id`` | String | Name of the notification | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Topic`` | String | Topic ARN | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Event`` | String | Handled event. Multiple ``Event`` entities may exist | Yes |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+| ``Filter`` | Container | Holding the filters configured for this notification | No |
++-------------------------------+-----------+--------------------------------------------------------------------------------------+----------+
+
+HTTP Response
+~~~~~~~~~~~~~
+
++---------------+-----------------------+----------------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+=======================+==========================================================+
+| ``404`` | NoSuchBucket | The bucket does not exist |
++---------------+-----------------------+----------------------------------------------------------+
+| ``404`` | NoSuchKey | The notification does not exist (if provided) |
++---------------+-----------------------+----------------------------------------------------------+
+
+.. _S3 Notification Compatibility: ../../s3-notification-compatibility
diff --git a/doc/radosgw/s3/commons.rst b/doc/radosgw/s3/commons.rst
new file mode 100644
index 000000000..4b9b4a040
--- /dev/null
+++ b/doc/radosgw/s3/commons.rst
@@ -0,0 +1,113 @@
+=================
+ Common Entities
+=================
+
+.. toctree::
+ :maxdepth: -1
+
+Bucket and Host Name
+--------------------
+There are two different modes of accessing the buckets. The first (preferred) method
+identifies the bucket as the top-level directory in the URI. ::
+
+ GET /mybucket HTTP/1.1
+ Host: cname.domain.com
+
+The second method identifies the bucket via a virtual bucket host name. For example::
+
+ GET / HTTP/1.1
+ Host: mybucket.cname.domain.com
+
+To configure virtual hosted buckets, you can either set ``rgw_dns_name = cname.domain.com`` in ceph.conf, or add ``cname.domain.com`` to the list of ``hostnames`` in your zonegroup configuration. See `Ceph Object Gateway - Multisite Configuration`_ for more on zonegroups.
+
+.. tip:: We prefer the first method, because the second method requires expensive domain certification and DNS wildcards.
+
+.. tip:: You can define multiple hostnames directly with the :confval:`rgw_dns_name` parameter.
+
+Common Request Headers
+----------------------
+
++--------------------+------------------------------------------+
+| Request Header | Description |
++====================+==========================================+
+| ``CONTENT_LENGTH`` | Length of the request body. |
++--------------------+------------------------------------------+
+| ``DATE`` | Request time and date (in UTC). |
++--------------------+------------------------------------------+
+| ``HOST`` | The name of the host server. |
++--------------------+------------------------------------------+
+| ``AUTHORIZATION`` | Authorization token. |
++--------------------+------------------------------------------+
+
+Common Response Status
+----------------------
+
++---------------+-----------------------------------+
+| HTTP Status | Response Code |
++===============+===================================+
+| ``100`` | Continue |
++---------------+-----------------------------------+
+| ``200`` | Success |
++---------------+-----------------------------------+
+| ``201`` | Created |
++---------------+-----------------------------------+
+| ``202`` | Accepted |
++---------------+-----------------------------------+
+| ``204`` | NoContent |
++---------------+-----------------------------------+
+| ``206`` | Partial content |
++---------------+-----------------------------------+
+| ``304`` | NotModified |
++---------------+-----------------------------------+
+| ``400`` | InvalidArgument |
++---------------+-----------------------------------+
+| ``400`` | InvalidDigest |
++---------------+-----------------------------------+
+| ``400`` | BadDigest |
++---------------+-----------------------------------+
+| ``400`` | InvalidBucketName |
++---------------+-----------------------------------+
+| ``400`` | InvalidObjectName |
++---------------+-----------------------------------+
+| ``400`` | UnresolvableGrantByEmailAddress |
++---------------+-----------------------------------+
+| ``400`` | InvalidPart |
++---------------+-----------------------------------+
+| ``400`` | InvalidPartOrder |
++---------------+-----------------------------------+
+| ``400`` | RequestTimeout |
++---------------+-----------------------------------+
+| ``400`` | EntityTooLarge |
++---------------+-----------------------------------+
+| ``403`` | AccessDenied |
++---------------+-----------------------------------+
+| ``403`` | UserSuspended |
++---------------+-----------------------------------+
+| ``403`` | RequestTimeTooSkewed |
++---------------+-----------------------------------+
+| ``404`` | NoSuchKey |
++---------------+-----------------------------------+
+| ``404`` | NoSuchBucket |
++---------------+-----------------------------------+
+| ``404`` | NoSuchUpload |
++---------------+-----------------------------------+
+| ``405`` | MethodNotAllowed |
++---------------+-----------------------------------+
+| ``408`` | RequestTimeout |
++---------------+-----------------------------------+
+| ``409`` | BucketAlreadyExists |
++---------------+-----------------------------------+
+| ``409`` | BucketNotEmpty |
++---------------+-----------------------------------+
+| ``411`` | MissingContentLength |
++---------------+-----------------------------------+
+| ``412`` | PreconditionFailed |
++---------------+-----------------------------------+
+| ``416`` | InvalidRange |
++---------------+-----------------------------------+
+| ``422`` | UnprocessableEntity |
++---------------+-----------------------------------+
+| ``500`` | InternalError |
++---------------+-----------------------------------+
+
+.. _`Ceph Object Gateway - Multisite Configuration`: ../../multisite
diff --git a/doc/radosgw/s3/cpp.rst b/doc/radosgw/s3/cpp.rst
new file mode 100644
index 000000000..089c9c53a
--- /dev/null
+++ b/doc/radosgw/s3/cpp.rst
@@ -0,0 +1,337 @@
+.. _cpp:
+
+C++ S3 Examples
+===============
+
+Setup
+-----
+
+The following contains includes and globals that will be used in later examples:
+
+.. code-block:: cpp
+
+ #include "libs3.h"
+ #include <stdlib.h>
+ #include <iostream>
+ #include <fstream>
+
+ const char access_key[] = "ACCESS_KEY";
+ const char secret_key[] = "SECRET_KEY";
+ const char host[] = "HOST";
+ const char sample_bucket[] = "sample_bucket";
+ const char sample_key[] = "hello.txt";
+ const char sample_file[] = "resource/hello.txt";
+ const char *security_token = NULL;
+ const char *auth_region = NULL;
+
+ S3BucketContext bucketContext =
+ {
+ host,
+ sample_bucket,
+ S3ProtocolHTTP,
+ S3UriStylePath,
+ access_key,
+ secret_key,
+ security_token,
+ auth_region
+ };
+
+ S3Status responsePropertiesCallback(
+ const S3ResponseProperties *properties,
+ void *callbackData)
+ {
+ return S3StatusOK;
+ }
+
+ static void responseCompleteCallback(
+ S3Status status,
+ const S3ErrorDetails *error,
+ void *callbackData)
+ {
+ return;
+ }
+
+ S3ResponseHandler responseHandler =
+ {
+ &responsePropertiesCallback,
+ &responseCompleteCallback
+ };
+
+
+Creating (and Closing) a Connection
+-----------------------------------
+
+This creates a connection so that you can interact with the server.
+
+.. code-block:: cpp
+
+ S3_initialize("s3", S3_INIT_ALL, host);
+ // Do stuff...
+ S3_deinitialize();
+
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of Buckets that you own.
+This also prints out the bucket name, owner ID, and display name
+for each bucket.
+
+.. code-block:: cpp
+
+ static S3Status listServiceCallback(
+ const char *ownerId,
+ const char *ownerDisplayName,
+ const char *bucketName,
+ int64_t creationDate, void *callbackData)
+ {
+ bool *header_printed = (bool*) callbackData;
+ if (!*header_printed) {
+ *header_printed = true;
+ printf("%-22s", " Bucket");
+ printf(" %-20s %-12s", " Owner ID", "Display Name");
+ printf("\n");
+ printf("----------------------");
+ printf(" --------------------" " ------------");
+ printf("\n");
+ }
+
+ printf("%-22s", bucketName);
+ printf(" %-20s %-12s", ownerId ? ownerId : "", ownerDisplayName ? ownerDisplayName : "");
+ printf("\n");
+
+ return S3StatusOK;
+ }
+
+ S3ListServiceHandler listServiceHandler =
+ {
+ responseHandler,
+ &listServiceCallback
+ };
+ bool header_printed = false;
+ S3_list_service(S3ProtocolHTTP, access_key, secret_key, security_token, host,
+ auth_region, NULL, 0, &listServiceHandler, &header_printed);
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket.
+
+.. code-block:: cpp
+
+ S3_create_bucket(S3ProtocolHTTP, access_key, secret_key, NULL, host, sample_bucket, S3CannedAclPrivate, NULL, NULL, &responseHandler, NULL);
+
+
+Listing a Bucket's Content
+--------------------------
+
+This gets a list of objects in the bucket.
+This also prints out each object's name, the file size, and
+last modified date.
+
+.. code-block:: cpp
+
+ static S3Status listBucketCallback(
+ int isTruncated,
+ const char *nextMarker,
+ int contentsCount,
+ const S3ListBucketContent *contents,
+ int commonPrefixesCount,
+ const char **commonPrefixes,
+ void *callbackData)
+ {
+ printf("%-22s", " Object Name");
+ printf(" %-5s %-20s", "Size", " Last Modified");
+ printf("\n");
+ printf("----------------------");
+ printf(" -----" " --------------------");
+ printf("\n");
+
+ for (int i = 0; i < contentsCount; i++) {
+ char timebuf[256];
+ char sizebuf[16];
+ const S3ListBucketContent *content = &(contents[i]);
+ time_t t = (time_t) content->lastModified;
+
+ strftime(timebuf, sizeof(timebuf), "%Y-%m-%dT%H:%M:%SZ", gmtime(&t));
+ sprintf(sizebuf, "%5llu", (unsigned long long) content->size);
+ printf("%-22s %s %s\n", content->key, sizebuf, timebuf);
+ }
+
+ return S3StatusOK;
+ }
+
+ S3ListBucketHandler listBucketHandler =
+ {
+ responseHandler,
+ &listBucketCallback
+ };
+ S3_list_bucket(&bucketContext, NULL, NULL, NULL, 0, NULL, 0, &listBucketHandler, NULL);
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+
+.. note::
+
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: cpp
+
+ S3_delete_bucket(S3ProtocolHTTP, S3UriStylePath, access_key, secret_key, 0, host, sample_bucket, NULL, NULL, 0, &responseHandler, NULL);
+
+
+Creating an Object (from a file)
+--------------------------------
+
+This uploads the contents of the local file ``resource/hello.txt`` to an object named ``hello.txt``.
+
+.. code-block:: cpp
+
+ #include <sys/stat.h>
+ typedef struct put_object_callback_data
+ {
+ FILE *infile;
+ uint64_t contentLength;
+ } put_object_callback_data;
+
+
+ static int putObjectDataCallback(int bufferSize, char *buffer, void *callbackData)
+ {
+ put_object_callback_data *data = (put_object_callback_data *) callbackData;
+
+ int ret = 0;
+
+ if (data->contentLength) {
+ int toRead = ((data->contentLength > (unsigned) bufferSize) ? (unsigned) bufferSize : data->contentLength);
+ ret = fread(buffer, 1, toRead, data->infile);
+ }
+ data->contentLength -= ret;
+ return ret;
+ }
+
+ put_object_callback_data data;
+ struct stat statbuf;
+ if (stat(sample_file, &statbuf) == -1) {
+ fprintf(stderr, "\nERROR: Failed to stat file %s: ", sample_file);
+ perror(0);
+ exit(-1);
+ }
+
+ int contentLength = statbuf.st_size;
+ data.contentLength = contentLength;
+
+ if (!(data.infile = fopen(sample_file, "r"))) {
+ fprintf(stderr, "\nERROR: Failed to open input file %s: ", sample_file);
+ perror(0);
+ exit(-1);
+ }
+
+ S3PutObjectHandler putObjectHandler =
+ {
+ responseHandler,
+ &putObjectDataCallback
+ };
+
+ S3_put_object(&bucketContext, sample_key, contentLength, NULL, NULL, 0, &putObjectHandler, &data);
+ fclose(data.infile);
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads a file and prints the contents.
+
+.. code-block:: cpp
+
+ static S3Status getObjectDataCallback(int bufferSize, const char *buffer, void *callbackData)
+ {
+ FILE *outfile = (FILE *) callbackData;
+ size_t wrote = fwrite(buffer, 1, bufferSize, outfile);
+ return ((wrote < (size_t) bufferSize) ? S3StatusAbortedByCallback : S3StatusOK);
+ }
+
+ S3GetObjectHandler getObjectHandler =
+ {
+ responseHandler,
+ &getObjectDataCallback
+ };
+ FILE *outfile = stdout;
+ S3_get_object(&bucketContext, sample_key, NULL, 0, 0, NULL, 0, &getObjectHandler, outfile);
+
+
+Delete an Object
+----------------
+
+This deletes an object.
+
+.. code-block:: cpp
+
+ S3ResponseHandler deleteResponseHandler =
+ {
+ 0,
+ &responseCompleteCallback
+ };
+ S3_delete_object(&bucketContext, sample_key, 0, 0, &deleteResponseHandler, 0);
+
+
+Change an Object's ACL
+----------------------
+
+This sets an object's ACL, granting full control to the owner, ``READ_ACP`` to another user, and read access to all users.
+
+
+.. code-block:: cpp
+
+ #include <string.h>
+ char ownerId[] = "owner";
+ char ownerDisplayName[] = "owner";
+ char granteeId[] = "grantee";
+ char granteeDisplayName[] = "grantee";
+
+ S3AclGrant grants[] = {
+ {
+ S3GranteeTypeCanonicalUser,
+ {{}},
+ S3PermissionFullControl
+ },
+ {
+ S3GranteeTypeCanonicalUser,
+ {{}},
+ S3PermissionReadACP
+ },
+ {
+ S3GranteeTypeAllUsers,
+ {{}},
+ S3PermissionRead
+ }
+ };
+
+ strncpy(grants[0].grantee.canonicalUser.id, ownerId, S3_MAX_GRANTEE_USER_ID_SIZE);
+ strncpy(grants[0].grantee.canonicalUser.displayName, ownerDisplayName, S3_MAX_GRANTEE_DISPLAY_NAME_SIZE);
+
+ strncpy(grants[1].grantee.canonicalUser.id, granteeId, S3_MAX_GRANTEE_USER_ID_SIZE);
+ strncpy(grants[1].grantee.canonicalUser.displayName, granteeDisplayName, S3_MAX_GRANTEE_DISPLAY_NAME_SIZE);
+
+ S3_set_acl(&bucketContext, sample_key, ownerId, ownerDisplayName, 3, grants, 0, &responseHandler, 0);
+
+
+Generate Object Download URL (signed)
+-------------------------------------
+
+This generates a signed download URL that will be valid for 5 minutes.
+
+.. code-block:: cpp
+
+ #include <time.h>
+ char buffer[S3_MAX_AUTHENTICATED_QUERY_STRING_SIZE];
+ int64_t expires = time(NULL) + 60 * 5; // Current time + 5 minutes
+
+ S3_generate_authenticated_query_string(buffer, &bucketContext, sample_key, expires, NULL, "GET");
+
diff --git a/doc/radosgw/s3/csharp.rst b/doc/radosgw/s3/csharp.rst
new file mode 100644
index 000000000..af1c6e4b5
--- /dev/null
+++ b/doc/radosgw/s3/csharp.rst
@@ -0,0 +1,199 @@
+.. _csharp:
+
+C# S3 Examples
+==============
+
+Creating a Connection
+---------------------
+
+This creates a connection so that you can interact with the server.
+
+.. code-block:: csharp
+
+    using System;
+    using Amazon;
+    using Amazon.S3;
+    using Amazon.S3.Model;
+
+    string accessKey = "put your access key here!";
+    string secretKey = "put your secret key here!";
+
+    AmazonS3Config config = new AmazonS3Config();
+    config.ServiceURL = "objects.dreamhost.com";
+
+    AmazonS3Client client = new AmazonS3Client(
+        accessKey,
+        secretKey,
+        config
+    );
+
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of Buckets that you own.
+This also prints out the bucket name and creation date of each bucket.
+
+.. code-block:: csharp
+
+ ListBucketsResponse response = client.ListBuckets();
+ foreach (S3Bucket b in response.Buckets)
+ {
+ Console.WriteLine("{0}\t{1}", b.BucketName, b.CreationDate);
+ }
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+This creates a new bucket called ``my-new-bucket``
+
+.. code-block:: csharp
+
+ PutBucketRequest request = new PutBucketRequest();
+ request.BucketName = "my-new-bucket";
+ client.PutBucket(request);
+
+Listing a Bucket's Content
+--------------------------
+
+This gets a list of objects in the bucket.
+This also prints out each object's name, the file size, and last
+modified date.
+
+.. code-block:: csharp
+
+ ListObjectsRequest request = new ListObjectsRequest();
+ request.BucketName = "my-new-bucket";
+ ListObjectsResponse response = client.ListObjects(request);
+ foreach (S3Object o in response.S3Objects)
+ {
+ Console.WriteLine("{0}\t{1}\t{2}", o.Key, o.Size, o.LastModified);
+ }
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+
+.. note::
+
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: csharp
+
+ DeleteBucketRequest request = new DeleteBucketRequest();
+ request.BucketName = "my-new-bucket";
+ client.DeleteBucket(request);
+
+
+Forced Delete for Non-empty Buckets
+-----------------------------------
+
+.. attention::
+
+ not available
+
+
+Creating an Object
+------------------
+
+This creates a file ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: csharp
+
+ PutObjectRequest request = new PutObjectRequest();
+ request.BucketName = "my-new-bucket";
+ request.Key = "hello.txt";
+ request.ContentType = "text/plain";
+ request.ContentBody = "Hello World!";
+ client.PutObject(request);
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable and
+``secret_plans.txt`` private.
+
+.. code-block:: csharp
+
+ PutACLRequest request = new PutACLRequest();
+ request.BucketName = "my-new-bucket";
+ request.Key = "hello.txt";
+ request.CannedACL = S3CannedACL.PublicRead;
+ client.PutACL(request);
+
+ PutACLRequest request2 = new PutACLRequest();
+ request2.BucketName = "my-new-bucket";
+ request2.Key = "secret_plans.txt";
+ request2.CannedACL = S3CannedACL.Private;
+ client.PutACL(request2);
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``perl_poetry.pdf`` and saves it in
+``C:\Users\larry\Documents``
+
+.. code-block:: csharp
+
+ GetObjectRequest request = new GetObjectRequest();
+ request.BucketName = "my-new-bucket";
+ request.Key = "perl_poetry.pdf";
+ GetObjectResponse response = client.GetObject(request);
+ response.WriteResponseStreamToFile("C:\\Users\\larry\\Documents\\perl_poetry.pdf");
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: csharp
+
+ DeleteObjectRequest request = new DeleteObjectRequest();
+ request.BucketName = "my-new-bucket";
+ request.Key = "goodbye.txt";
+ client.DeleteObject(request);
+
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+
+This generates an unsigned download URL for ``hello.txt``. This works
+because we made ``hello.txt`` public by setting the ACL above.
+This then generates a signed download URL for ``secret_plans.txt`` that
+will work for 1 hour. Signed download URLs will work for the time
+period even if the object is private (when the time period is up, the
+URL will stop working).
+
+.. note::
+
+ The C# S3 Library does not have a method for generating unsigned
+ URLs, so the following example only shows generating signed URLs.
+
+.. code-block:: csharp
+
+ GetPreSignedUrlRequest request = new GetPreSignedUrlRequest();
+ request.BucketName = "my-bucket-name";
+ request.Key = "secret_plans.txt";
+ request.Expires = DateTime.Now.AddHours(1);
+ request.Protocol = Protocol.HTTP;
+ string url = client.GetPreSignedURL(request);
+ Console.WriteLine(url);
+
+The output of this will look something like::
+
+ http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX
+
diff --git a/doc/radosgw/s3/java.rst b/doc/radosgw/s3/java.rst
new file mode 100644
index 000000000..057c09c2c
--- /dev/null
+++ b/doc/radosgw/s3/java.rst
@@ -0,0 +1,212 @@
+.. _java:
+
+Java S3 Examples
+================
+
+Setup
+-----
+
+The following examples may require some or all of the following java
+classes to be imported:
+
+.. code-block:: java
+
+ import java.io.ByteArrayInputStream;
+ import java.io.File;
+ import java.util.List;
+ import com.amazonaws.auth.AWSCredentials;
+ import com.amazonaws.auth.BasicAWSCredentials;
+ import com.amazonaws.util.StringUtils;
+ import com.amazonaws.services.s3.AmazonS3;
+ import com.amazonaws.services.s3.AmazonS3Client;
+ import com.amazonaws.services.s3.model.Bucket;
+ import com.amazonaws.services.s3.model.CannedAccessControlList;
+ import com.amazonaws.services.s3.model.GeneratePresignedUrlRequest;
+ import com.amazonaws.services.s3.model.GetObjectRequest;
+ import com.amazonaws.services.s3.model.ObjectListing;
+ import com.amazonaws.services.s3.model.ObjectMetadata;
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
+
+
+If you are just testing the Ceph Object Storage services, consider
+using HTTP protocol instead of HTTPS protocol.
+
+First, import the ``ClientConfiguration`` and ``Protocol`` classes.
+
+.. code-block:: java
+
+ import com.amazonaws.ClientConfiguration;
+ import com.amazonaws.Protocol;
+
+
+Then, define the client configuration, and add the client configuration
+as an argument for the S3 client.
+
+.. code-block:: java
+
+ AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey);
+
+ ClientConfiguration clientConfig = new ClientConfiguration();
+ clientConfig.setProtocol(Protocol.HTTP);
+
+ AmazonS3 conn = new AmazonS3Client(credentials, clientConfig);
+ conn.setEndpoint("endpoint.com");
+
+
+Creating a Connection
+---------------------
+
+This creates a connection so that you can interact with the server.
+
+.. code-block:: java
+
+ String accessKey = "insert your access key here!";
+ String secretKey = "insert your secret key here!";
+
+ AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey);
+ AmazonS3 conn = new AmazonS3Client(credentials);
+ conn.setEndpoint("objects.dreamhost.com");
+
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of Buckets that you own.
+This also prints out the bucket name and creation date of each bucket.
+
+.. code-block:: java
+
+ List<Bucket> buckets = conn.listBuckets();
+ for (Bucket bucket : buckets) {
+ System.out.println(bucket.getName() + "\t" +
+ StringUtils.fromDate(bucket.getCreationDate()));
+ }
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket called ``my-new-bucket``
+
+.. code-block:: java
+
+ Bucket bucket = conn.createBucket("my-new-bucket");
+
+
+Listing a Bucket's Content
+--------------------------
+This gets a list of objects in the bucket.
+This also prints out each object's name, the file size, and last
+modified date.
+
+.. code-block:: java
+
+ ObjectListing objects = conn.listObjects(bucket.getName());
+ do {
+ for (S3ObjectSummary objectSummary : objects.getObjectSummaries()) {
+ System.out.println(objectSummary.getKey() + "\t" +
+ objectSummary.getSize() + "\t" +
+ StringUtils.fromDate(objectSummary.getLastModified()));
+ }
+ objects = conn.listNextBatchOfObjects(objects);
+ } while (objects.isTruncated());
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+
+.. note::
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: java
+
+ conn.deleteBucket(bucket.getName());
+
+
+Forced Delete for Non-empty Buckets
+-----------------------------------
+.. attention::
+ not available
+
+
+Creating an Object
+------------------
+
+This creates a file ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: java
+
+ ByteArrayInputStream input = new ByteArrayInputStream("Hello World!".getBytes());
+ conn.putObject(bucket.getName(), "hello.txt", input, new ObjectMetadata());
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable and
+``secret_plans.txt`` private.
+
+.. code-block:: java
+
+ conn.setObjectAcl(bucket.getName(), "hello.txt", CannedAccessControlList.PublicRead);
+ conn.setObjectAcl(bucket.getName(), "secret_plans.txt", CannedAccessControlList.Private);
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``perl_poetry.pdf`` and saves it in
+``/home/larry/documents``
+
+.. code-block:: java
+
+ conn.getObject(
+ new GetObjectRequest(bucket.getName(), "perl_poetry.pdf"),
+ new File("/home/larry/documents/perl_poetry.pdf")
+ );
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: java
+
+ conn.deleteObject(bucket.getName(), "goodbye.txt");
+
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+
+This generates an unsigned download URL for ``hello.txt``. This works
+because we made ``hello.txt`` public by setting the ACL above.
+This then generates a signed download URL for ``secret_plans.txt`` that
+will work for 1 hour. Signed download URLs will work for the time
+period even if the object is private (when the time period is up, the
+URL will stop working).
+
+.. note::
+ The java library does not have a method for generating unsigned
+ URLs, so the example below just generates a signed URL.
+
+.. code-block:: java
+
+ GeneratePresignedUrlRequest request = new GeneratePresignedUrlRequest(bucket.getName(), "secret_plans.txt");
+ System.out.println(conn.generatePresignedUrl(request));
+
+The output will look something like this::
+
+ https://my-bucket-name.objects.dreamhost.com/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX
+
diff --git a/doc/radosgw/s3/objectops.rst b/doc/radosgw/s3/objectops.rst
new file mode 100644
index 000000000..2ac52607f
--- /dev/null
+++ b/doc/radosgw/s3/objectops.rst
@@ -0,0 +1,558 @@
+Object Operations
+=================
+
+Put Object
+----------
+Adds an object to a bucket. You must have write permissions on the bucket to perform this operation.
+
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}/{object} HTTP/1.1
+
+Request Headers
+~~~~~~~~~~~~~~~
+
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| Name | Description | Valid Values | Required |
++======================+============================================+===============================================================================+============+
+| **content-md5**      | A base64-encoded MD5 hash of the message.  | A string. No defaults or constraints.                                           | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **content-type** | A standard MIME type. | Any MIME type. Default: ``binary/octet-stream`` | No |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **x-amz-meta-<...>** | User metadata. Stored with the object.     | A string up to 8 KB. No defaults.                                               | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **x-amz-acl** | A canned ACL. | ``private``, ``public-read``, ``public-read-write``, ``authenticated-read`` | No |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+
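+A hypothetical request that stores a short text object with one user metadata
+entry (the bucket, key, and header values are placeholders)::
+
+    PUT /mybucket/hello.txt HTTP/1.1
+    Content-Type: text/plain
+    Content-Length: 12
+    x-amz-meta-color: blue
+
+    Hello World!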
+
+Copy Object
+-----------
+To copy an object, use ``PUT`` and specify a destination bucket and the object name.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{dest-bucket}/{dest-object} HTTP/1.1
+ x-amz-copy-source: {source-bucket}/{source-object}
+
+Request Headers
+~~~~~~~~~~~~~~~
+
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+| Name | Description | Valid Values | Required |
++======================================+=================================================+========================+============+
+| **x-amz-copy-source** | The source bucket name + object name. | {bucket}/{obj} | Yes |
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+| **x-amz-acl** | A canned ACL. | ``private``, | No |
+| | | ``public-read``, | |
+| | | ``public-read-write``, | |
+| | | ``authenticated-read`` | |
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+| **x-amz-copy-if-modified-since** | Copies only if modified since the timestamp. | Timestamp | No |
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+| **x-amz-copy-if-unmodified-since** | Copies only if unmodified since the timestamp. | Timestamp | No |
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+| **x-amz-copy-if-match** | Copies only if object ETag matches ETag. | Entity Tag | No |
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+| **x-amz-copy-if-none-match** | Copies only if object ETag doesn't match. | Entity Tag | No |
++--------------------------------------+-------------------------------------------------+------------------------+------------+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++------------------------+-------------+-----------------------------------------------+
+| Name | Type | Description |
++========================+=============+===============================================+
+| **CopyObjectResult** | Container | A container for the response elements. |
++------------------------+-------------+-----------------------------------------------+
+| **LastModified** | Date | The last modified date of the source object. |
++------------------------+-------------+-----------------------------------------------+
+| **ETag**               | String      | The ETag of the new object.                   |
++------------------------+-------------+-----------------------------------------------+
+
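+A hypothetical conditional copy, which succeeds only if the source object's
+ETag still matches the supplied value (bucket, object, and ETag values are
+placeholders)::
+
+    PUT /mybucket/hello-copy.txt HTTP/1.1
+    x-amz-copy-source: mybucket/hello.txt
+    x-amz-copy-if-match: "etag-of-source-object"
+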
+Remove Object
+-------------
+
+Removes an object. Requires WRITE permission set on the containing bucket.
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{bucket}/{object} HTTP/1.1
+
+
+
+Get Object
+----------
+Retrieves an object from a bucket within RADOS.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}/{object} HTTP/1.1
+
+Request Headers
+~~~~~~~~~~~~~~~
+
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| Name | Description | Valid Values | Required |
++===========================+================================================+================================+============+
+| **range** | The range of the object to retrieve. | Range: bytes=beginbyte-endbyte | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-modified-since** | Gets only if modified since the timestamp. | Timestamp | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-unmodified-since** | Gets only if not modified since the timestamp. | Timestamp | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-match** | Gets only if object ETag matches ETag. | Entity Tag | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-none-match**         | Gets only if object ETag doesn't match.        | Entity Tag                     | No         |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+
+Response Headers
+~~~~~~~~~~~~~~~~
+
++-------------------+--------------------------------------------------------------------------------------------+
+| Name | Description |
++===================+============================================================================================+
+| **Content-Range** | Data range, will only be returned if the range header field was specified in the request |
++-------------------+--------------------------------------------------------------------------------------------+
+
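+For instance, a hypothetical ranged read of the first five bytes of a
+twelve-byte object (names are placeholders), along with the relevant part of
+the response::
+
+    GET /mybucket/hello.txt HTTP/1.1
+    range: bytes=0-4
+
+    HTTP/1.1 206 Partial Content
+    Content-Range: bytes 0-4/12
+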
+Get Object Info
+---------------
+
+Returns information about an object. This request returns the same
+header information as the Get Object request, but includes only the
+metadata, not the object data payload.
+
+Syntax
+~~~~~~
+
+::
+
+ HEAD /{bucket}/{object} HTTP/1.1
+
+Request Headers
+~~~~~~~~~~~~~~~
+
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| Name | Description | Valid Values | Required |
++===========================+================================================+================================+============+
+| **range** | The range of the object to retrieve. | Range: bytes=beginbyte-endbyte | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-modified-since** | Gets only if modified since the timestamp. | Timestamp | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-unmodified-since** | Gets only if not modified since the timestamp. | Timestamp | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-match** | Gets only if object ETag matches ETag. | Entity Tag | No |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+| **if-none-match**         | Gets only if object ETag doesn't match ETag.   | Entity Tag                     | No         |
++---------------------------+------------------------------------------------+--------------------------------+------------+
+
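+For illustration, a minimal boto3 sketch of this request (placeholder names;
+credentials from the environment):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    # HEAD request: returns metadata only, no data payload
+    info = s3.head_object(Bucket='my-new-bucket', Key='hello.txt')
+    print(info['ContentLength'], info['LastModified'], info['ETag'])
+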
+Get Object ACL
+--------------
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}/{object}?acl HTTP/1.1
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++===========================+=============+==============================================================================================+
+| ``AccessControlPolicy`` | Container | A container for the response. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``AccessControlList`` | Container | A container for the ACL information. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Owner`` | Container | A container for the object owner's ``ID`` and ``DisplayName``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``ID`` | String | The object owner's ID. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``DisplayName`` | String | The object owner's display name. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grant`` | Container | A container for ``Grantee`` and ``Permission``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grantee`` | Container | A container for the ``DisplayName`` and ``ID`` of the user receiving a grant of permission. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Permission`` | String | The permission given to the ``Grantee`` object. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+
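+For illustration, a minimal boto3 sketch of this request (placeholder names;
+credentials from the environment):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    acl = s3.get_object_acl(Bucket='my-new-bucket', Key='hello.txt')
+    for grant in acl['Grants']:
+        print(grant['Grantee'], grant['Permission'])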
+
+
+Set Object ACL
+--------------
+
+Syntax
+~~~~~~
+
+::
+
+    PUT /{bucket}/{object}?acl HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++===========================+=============+==============================================================================================+
+| ``AccessControlPolicy``   | Container   | A container for the request.                                                                 |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``AccessControlList`` | Container | A container for the ACL information. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Owner`` | Container | A container for the object owner's ``ID`` and ``DisplayName``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``ID`` | String | The object owner's ID. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``DisplayName`` | String | The object owner's display name. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grant`` | Container | A container for ``Grantee`` and ``Permission``. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Grantee`` | Container | A container for the ``DisplayName`` and ``ID`` of the user receiving a grant of permission. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+| ``Permission`` | String | The permission given to the ``Grantee`` object. |
++---------------------------+-------------+----------------------------------------------------------------------------------------------+
+
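+For illustration, a minimal boto3 sketch that sets a canned ACL with the same
+request (placeholder names; credentials from the environment):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    # a canned ACL is a shorthand for the AccessControlPolicy document above
+    s3.put_object_acl(Bucket='my-new-bucket', Key='hello.txt',
+                      ACL='public-read')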
+
+
+Initiate Multi-part Upload
+--------------------------
+
+Initiates a multi-part upload process.
+
+Syntax
+~~~~~~
+
+::
+
+    POST /{bucket}/{object}?uploads HTTP/1.1
+
+Request Headers
+~~~~~~~~~~~~~~~
+
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| Name | Description | Valid Values | Required |
++======================+============================================+===============================================================================+============+
+| **content-md5**      | A base64-encoded MD5 hash of the message.  | A string. No defaults or constraints.                                         | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **content-type**     | A standard MIME type.                      | Any MIME type. Default: ``binary/octet-stream``                               | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **x-amz-meta-<...>** | User metadata. Stored with the object.     | A string up to 8 KB. No defaults.                                             | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **x-amz-acl** | A canned ACL. | ``private``, ``public-read``, ``public-read-write``, ``authenticated-read`` | No |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++=========================================+=============+==========================================================================================================+
+| ``InitiateMultipartUploadResult``       | Container   | A container for the results.                                                                             |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Bucket`` | String | The bucket that will receive the object contents. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Key`` | String | The key specified by the ``key`` request parameter (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``UploadId`` | String | The ID specified by the ``upload-id`` request parameter identifying the multipart upload (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+
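+For illustration, a minimal boto3 sketch of this request (placeholder names;
+credentials from the environment):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    resp = s3.create_multipart_upload(Bucket='my-new-bucket', Key='big.bin')
+    upload_id = resp['UploadId']  # needed for the part uploads that follow
+    print(upload_id)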
+
+Multipart Upload Part
+---------------------
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}/{object}?partNumber=&uploadId= HTTP/1.1
+
+HTTP Response
+~~~~~~~~~~~~~
+
+The following HTTP response may be returned:
+
++---------------+----------------+--------------------------------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+================+==========================================================================+
+| **404** | NoSuchUpload | Specified upload-id does not match any initiated upload on this object |
++---------------+----------------+--------------------------------------------------------------------------+
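+
+For illustration, a minimal boto3 sketch of this request, reusing an upload ID
+returned by Initiate Multi-part Upload (all names are placeholders):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    upload_id = 'UPLOAD_ID'  # as returned by create_multipart_upload
+    # each part except the last must be at least 5 MB
+    part = s3.upload_part(Bucket='my-new-bucket', Key='big.bin',
+                          PartNumber=1, UploadId=upload_id,
+                          Body=b'x' * (5 * 1024 * 1024))
+    print(part['ETag'])  # keep the ETag for Complete Multipart Upload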
+
+List Multipart Upload Parts
+---------------------------
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}/{object}?uploadId=123 HTTP/1.1
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| Name | Type | Description |
++=========================================+=============+==========================================================================================================+
+| ``ListPartsResult`` | Container | A container for the results. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Bucket`` | String | The bucket that will receive the object contents. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Key`` | String | The key specified by the ``key`` request parameter (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``UploadId`` | String | The ID specified by the ``upload-id`` request parameter identifying the multipart upload (if any). |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Initiator`` | Container | Contains the ``ID`` and ``DisplayName`` of the user who initiated the upload. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``ID`` | String | The initiator's ID. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``DisplayName`` | String | The initiator's display name. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Owner`` | Container | A container for the ``ID`` and ``DisplayName`` of the user who owns the uploaded object. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``StorageClass`` | String | The method used to store the resulting object. ``STANDARD`` or ``REDUCED_REDUNDANCY`` |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``PartNumberMarker`` | String | The part marker to use in a subsequent request if ``IsTruncated`` is ``true``. Precedes the list. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``NextPartNumberMarker`` | String | The next part marker to use in a subsequent request if ``IsTruncated`` is ``true``. The end of the list. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``MaxParts`` | Integer | The max parts allowed in the response as specified by the ``max-parts`` request parameter. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``IsTruncated`` | Boolean | If ``true``, only a subset of the object's upload contents were returned. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Part`` | Container | A container for ``LastModified``, ``PartNumber``, ``ETag`` and ``Size`` elements. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``LastModified`` | Date | Date and time at which the part was uploaded. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``PartNumber`` | Integer | The identification number of the part. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``ETag`` | String | The part's entity tag. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+| ``Size`` | Integer | The size of the uploaded part. |
++-----------------------------------------+-------------+----------------------------------------------------------------------------------------------------------+
+
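+For illustration, a minimal boto3 sketch of this request (placeholder names
+and upload ID):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    resp = s3.list_parts(Bucket='my-new-bucket', Key='big.bin',
+                         UploadId='UPLOAD_ID')
+    for part in resp['Parts']:
+        print(part['PartNumber'], part['ETag'], part['Size'])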
+
+
+Complete Multipart Upload
+-------------------------
+Assembles uploaded parts and creates a new object, thereby completing a multipart upload.
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{bucket}/{object}?uploadId= HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++----------------------------------+-------------+-----------------------------------------------------+----------+
+| Name | Type | Description | Required |
++==================================+=============+=====================================================+==========+
+| ``CompleteMultipartUpload`` | Container | A container consisting of one or more parts. | Yes |
++----------------------------------+-------------+-----------------------------------------------------+----------+
+| ``Part`` | Container | A container for the ``PartNumber`` and ``ETag``. | Yes |
++----------------------------------+-------------+-----------------------------------------------------+----------+
+| ``PartNumber`` | Integer | The identifier of the part. | Yes |
++----------------------------------+-------------+-----------------------------------------------------+----------+
+| ``ETag`` | String | The part's entity tag. | Yes |
++----------------------------------+-------------+-----------------------------------------------------+----------+
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++-------------------------------------+-------------+-------------------------------------------------------+
+| Name | Type | Description |
++=====================================+=============+=======================================================+
+| **CompleteMultipartUploadResult** | Container | A container for the response. |
++-------------------------------------+-------------+-------------------------------------------------------+
+| **Location** | URI | The resource identifier (path) of the new object. |
++-------------------------------------+-------------+-------------------------------------------------------+
+| **Bucket** | String | The name of the bucket that contains the new object. |
++-------------------------------------+-------------+-------------------------------------------------------+
+| **Key** | String | The object's key. |
++-------------------------------------+-------------+-------------------------------------------------------+
+| **ETag** | String | The entity tag of the new object. |
++-------------------------------------+-------------+-------------------------------------------------------+
+
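+For illustration, a minimal boto3 sketch, assuming the parts were uploaded as
+sketched above (the ETag value is a placeholder):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    s3.complete_multipart_upload(
+        Bucket='my-new-bucket', Key='big.bin', UploadId='UPLOAD_ID',
+        # one entry per uploaded part, in ascending PartNumber order
+        MultipartUpload={'Parts': [{'PartNumber': 1, 'ETag': '"..."'}]},
+    )
+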
+Abort Multipart Upload
+----------------------
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{bucket}/{object}?uploadId= HTTP/1.1
+
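+For illustration, a minimal boto3 sketch of this request (placeholder names
+and upload ID):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    # discards all uploaded parts and frees the storage they used
+    s3.abort_multipart_upload(Bucket='my-new-bucket', Key='big.bin',
+                              UploadId='UPLOAD_ID')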
+
+
+Append Object
+-------------
+Append data to an object. You must have write permission on the bucket to
+perform this operation. This operation uploads files in append mode. Objects
+created by the Append Object operation are of type Appendable Object, while
+objects uploaded with the Put Object operation are of type Normal Object.
+
+**Append Object can't be used if bucket versioning is enabled or suspended.**
+
+**In multisite, a synced object becomes a normal object, but you can still append to the original object.**
+
+**Compression and encryption features are disabled for Appendable objects.**
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}/{object}?append&position= HTTP/1.1
+
+Request Headers
+~~~~~~~~~~~~~~~
+
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| Name | Description | Valid Values | Required |
++======================+============================================+===============================================================================+============+
+| **content-md5**      | A base64-encoded MD5 hash of the message.  | A string. No defaults or constraints.                                         | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **content-type**     | A standard MIME type.                      | Any MIME type. Default: ``binary/octet-stream``                               | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **x-amz-meta-<...>** | User metadata. Stored with the object.     | A string up to 8 KB. No defaults.                                             | No         |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+| **x-amz-acl** | A canned ACL. | ``private``, ``public-read``, ``public-read-write``, ``authenticated-read`` | No |
++----------------------+--------------------------------------------+-------------------------------------------------------------------------------+------------+
+
+Response Headers
+~~~~~~~~~~~~~~~~
+
++--------------------------------+------------------------------------------------------------------+
+| Name | Description |
++================================+==================================================================+
+| **x-rgw-next-append-position** | The next position at which to append to the object              |
++--------------------------------+------------------------------------------------------------------+
+
+HTTP Response
+~~~~~~~~~~~~~
+
+The following HTTP response may be returned:
+
++---------------+----------------------------+---------------------------------------------------+
+| HTTP Status | Status Code | Description |
++===============+============================+===================================================+
+| **409**       | PositionNotEqualToLength   | Specified position does not match object length   |
++---------------+----------------------------+---------------------------------------------------+
+| **409**       | ObjectNotAppendable        | Specified object cannot be appended to            |
++---------------+----------------------------+---------------------------------------------------+
+| **409**       | InvalidBucketState         | Bucket versioning is enabled or suspended         |
++---------------+----------------------------+---------------------------------------------------+
+
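+Plain S3 SDKs do not expose this RGW extension, so a raw signed request is one
+way to exercise it. A sketch using the third-party ``requests`` and
+``requests-aws4auth`` packages (the endpoint, credentials, region, and names
+are all placeholders):
+
+.. code-block:: python
+
+    import requests
+    from requests_aws4auth import AWS4Auth
+
+    auth = AWS4Auth('ACCESS_KEY', 'SECRET_KEY', 'us-east-1', 's3')
+    resp = requests.put(
+        'http://objects.example.com/my-new-bucket/log.txt',
+        params={'append': '', 'position': '0'},  # ?append&position=0
+        data=b'first chunk of data',
+        auth=auth,
+    )
+    # position to use for the next append
+    print(resp.headers.get('x-rgw-next-append-position'))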
+
+Put Object Retention
+--------------------
+Places an Object Retention configuration on an object.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}/{object}?retention&versionId= HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+| Name                | Type        | Description                                                                   | Required   |
++=====================+=============+===============================================================================+============+
+| ``Retention``       | Container   | A container for the request.                                                  | Yes        |
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+| ``Mode``            | String      | Retention mode for the specified object. Valid Values: GOVERNANCE/COMPLIANCE | Yes        |
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+| ``RetainUntilDate`` | Timestamp   | Retention date. Format: 2020-01-05T00:00:00.000Z                              | Yes        |
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+
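+For illustration, a minimal boto3 sketch of this request (placeholder names;
+the bucket is assumed to have object lock enabled):
+
+.. code-block:: python
+
+    from datetime import datetime, timezone
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    s3.put_object_retention(
+        Bucket='my-new-bucket', Key='hello.txt',
+        Retention={
+            'Mode': 'GOVERNANCE',
+            'RetainUntilDate': datetime(2030, 1, 5, tzinfo=timezone.utc),
+        },
+    )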
+
+Get Object Retention
+--------------------
+Gets an Object Retention configuration on an object.
+
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}/{object}?retention&versionId= HTTP/1.1
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+| Name                | Type        | Description                                                                   | Required   |
++=====================+=============+===============================================================================+============+
+| ``Retention``       | Container   | A container for the response.                                                 | Yes        |
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+| ``Mode``            | String      | Retention mode for the specified object. Valid Values: GOVERNANCE/COMPLIANCE | Yes        |
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+| ``RetainUntilDate`` | Timestamp   | Retention date. Format: 2020-01-05T00:00:00.000Z                              | Yes        |
++---------------------+-------------+-------------------------------------------------------------------------------+------------+
+
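+For illustration, a minimal boto3 sketch of this request (placeholder names):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    resp = s3.get_object_retention(Bucket='my-new-bucket', Key='hello.txt')
+    print(resp['Retention']['Mode'], resp['Retention']['RetainUntilDate'])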
+
+Put Object Legal Hold
+---------------------
+Applies a Legal Hold configuration to the specified object.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{bucket}/{object}?legal-hold&versionId= HTTP/1.1
+
+Request Entities
+~~~~~~~~~~~~~~~~
+
++----------------+-------------+----------------------------------------------------------------------------------------+------------+
+| Name | Type | Description | Required |
++================+=============+========================================================================================+============+
+| ``LegalHold`` | Container | A container for the request. | Yes |
++----------------+-------------+----------------------------------------------------------------------------------------+------------+
+| ``Status`` | String | Indicates whether the specified object has a Legal Hold in place. Valid Values: ON/OFF | Yes |
++----------------+-------------+----------------------------------------------------------------------------------------+------------+
+
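+For illustration, a minimal boto3 sketch of this request (placeholder names;
+the bucket is assumed to have object lock enabled):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    s3.put_object_legal_hold(Bucket='my-new-bucket', Key='hello.txt',
+                             LegalHold={'Status': 'ON'})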
+
+Get Object Legal Hold
+---------------------
+Gets an object's current Legal Hold status.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{bucket}/{object}?legal-hold&versionId= HTTP/1.1
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++----------------+-------------+----------------------------------------------------------------------------------------+------------+
+| Name | Type | Description | Required |
++================+=============+========================================================================================+============+
+| ``LegalHold``  | Container   | A container for the response.                                                          | Yes        |
++----------------+-------------+----------------------------------------------------------------------------------------+------------+
+| ``Status`` | String | Indicates whether the specified object has a Legal Hold in place. Valid Values: ON/OFF | Yes |
++----------------+-------------+----------------------------------------------------------------------------------------+------------+
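+
+For illustration, a minimal boto3 sketch of this request (placeholder names):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    resp = s3.get_object_legal_hold(Bucket='my-new-bucket', Key='hello.txt')
+    print(resp['LegalHold']['Status'])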
+
diff --git a/doc/radosgw/s3/perl.rst b/doc/radosgw/s3/perl.rst
new file mode 100644
index 000000000..f12e5c698
--- /dev/null
+++ b/doc/radosgw/s3/perl.rst
@@ -0,0 +1,192 @@
+.. _perl:
+
+Perl S3 Examples
+================
+
+Creating a Connection
+---------------------
+
+This creates a connection so that you can interact with the server.
+
+.. code-block:: perl
+
+ use Amazon::S3;
+ my $access_key = 'put your access key here!';
+ my $secret_key = 'put your secret key here!';
+
+ my $conn = Amazon::S3->new({
+ aws_access_key_id => $access_key,
+ aws_secret_access_key => $secret_key,
+ host => 'objects.dreamhost.com',
+ secure => 1,
+ retry => 1,
+ });
+
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of `Amazon::S3::Bucket`_ objects that you own.
+We'll also print out the bucket name and creation date of each bucket.
+
+.. code-block:: perl
+
+ my @buckets = @{$conn->buckets->{buckets} || []};
+ foreach my $bucket (@buckets) {
+ print $bucket->bucket . "\t" . $bucket->creation_date . "\n";
+ }
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket called ``my-new-bucket``
+
+.. code-block:: perl
+
+ my $bucket = $conn->add_bucket({ bucket => 'my-new-bucket' });
+
+
+Listing a Bucket's Content
+--------------------------
+
+This gets a list of hashes with info about each object in the bucket.
+We'll also print out each object's name, the file size, and last
+modified date.
+
+.. code-block:: perl
+
+ my @keys = @{$bucket->list_all->{keys} || []};
+ foreach my $key (@keys) {
+ print "$key->{key}\t$key->{size}\t$key->{last_modified}\n";
+ }
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+
+.. note::
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: perl
+
+ $conn->delete_bucket($bucket);
+
+
+Forced Delete for Non-empty Buckets
+-----------------------------------
+
+.. attention::
+
+   Not available in the `Amazon::S3`_ Perl module.
+
+
+Creating an Object
+------------------
+
+This creates a file ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: perl
+
+ $bucket->add_key(
+ 'hello.txt', 'Hello World!',
+ { content_type => 'text/plain' },
+ );
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable and
+``secret_plans.txt`` private.
+
+.. code-block:: perl
+
+ $bucket->set_acl({
+ key => 'hello.txt',
+ acl_short => 'public-read',
+ });
+ $bucket->set_acl({
+ key => 'secret_plans.txt',
+ acl_short => 'private',
+ });
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``perl_poetry.pdf`` and saves it in
+``/home/larry/documents/``
+
+.. code-block:: perl
+
+ $bucket->get_key_filename('perl_poetry.pdf', undef,
+ '/home/larry/documents/perl_poetry.pdf');
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: perl
+
+ $bucket->delete_key('goodbye.txt');
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+This generates an unsigned download URL for ``hello.txt``. This works
+because we made ``hello.txt`` public by setting the ACL above.
+Then this generates a signed download URL for ``secret_plans.txt`` that
+will work for 1 hour. Signed download URLs will work for the time
+period even if the object is private (when the time period is up, the
+URL will stop working).
+
+.. note::
+   The `Amazon::S3`_ module does not have a way to generate download
+   URLs, so we use another module instead. Unfortunately, most modules
+   for generating these URLs assume that you are using Amazon, so we have
+   gone with the more obscure `Muck::FS::S3`_ module. It should match
+   Amazon's sample S3 Perl module, but that sample module is not on CPAN.
+   You can either use CPAN to install `Muck::FS::S3`_, or install Amazon's
+   sample S3 module manually. If you go the manual route, remove
+   ``Muck::FS::`` from the example below.
+
+.. code-block:: perl
+
+ use Muck::FS::S3::QueryStringAuthGenerator;
+ my $generator = Muck::FS::S3::QueryStringAuthGenerator->new(
+ $access_key,
+ $secret_key,
+ 0, # 0 means use 'http'. set this to 1 for 'https'
+ 'objects.dreamhost.com',
+ );
+
+ my $hello_url = $generator->make_bare_url($bucket->bucket, 'hello.txt');
+ print $hello_url . "\n";
+
+ $generator->expires_in(3600); # 1 hour = 3600 seconds
+ my $plans_url = $generator->get($bucket->bucket, 'secret_plans.txt');
+ print $plans_url . "\n";
+
+The output will look something like this::
+
+ http://objects.dreamhost.com:80/my-bucket-name/hello.txt
+ http://objects.dreamhost.com:80/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX
+
+
+.. _`Amazon::S3`: http://search.cpan.org/~tima/Amazon-S3-0.441/lib/Amazon/S3.pm
+.. _`Amazon::S3::Bucket`: http://search.cpan.org/~tima/Amazon-S3-0.441/lib/Amazon/S3/Bucket.pm
+.. _`Muck::FS::S3`: http://search.cpan.org/~mike/Muck-0.02/
+
diff --git a/doc/radosgw/s3/php.rst b/doc/radosgw/s3/php.rst
new file mode 100644
index 000000000..4878a3489
--- /dev/null
+++ b/doc/radosgw/s3/php.rst
@@ -0,0 +1,214 @@
+.. _php:
+
+PHP S3 Examples
+===============
+
+Installing AWS PHP SDK
+----------------------
+
+This installs the AWS PHP SDK using Composer (see here_ for how to install Composer).
+
+.. _here: https://getcomposer.org/download/
+
+.. code-block:: bash
+
+    $ composer require aws/aws-sdk-php
+
+Creating a Connection
+---------------------
+
+This creates a connection so that you can interact with the server.
+
+.. note::
+
+   The client initialization requires a region, so we use the empty string ``''``.
+
+.. code-block:: php
+
+ <?php
+
+ use Aws\S3\S3Client;
+
+ define('AWS_KEY', 'place access key here');
+ define('AWS_SECRET_KEY', 'place secret key here');
+ $ENDPOINT = 'http://objects.dreamhost.com';
+
+ // require the amazon sdk from your composer vendor dir
+ require __DIR__.'/vendor/autoload.php';
+
+ // Instantiate the S3 class and point it at the desired host
+ $client = new S3Client([
+ 'region' => '',
+ 'version' => '2006-03-01',
+ 'endpoint' => $ENDPOINT,
+ 'credentials' => [
+ 'key' => AWS_KEY,
+ 'secret' => AWS_SECRET_KEY
+ ],
+ // Set the S3 class to use objects.dreamhost.com/bucket
+ // instead of bucket.objects.dreamhost.com
+ 'use_path_style_endpoint' => true
+ ]);
+
+Listing Owned Buckets
+---------------------
+This gets an ``AWS\Result`` instance, which can conveniently be accessed like an array.
+This also prints out the bucket name and creation date of each bucket.
+
+.. code-block:: php
+
+ <?php
+ $listResponse = $client->listBuckets();
+ $buckets = $listResponse['Buckets'];
+ foreach ($buckets as $bucket) {
+ echo $bucket['Name'] . "\t" . $bucket['CreationDate'] . "\n";
+ }
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket called ``my-new-bucket`` and returns an
+``AWS\Result`` object.
+
+.. code-block:: php
+
+ <?php
+ $client->createBucket(['Bucket' => 'my-new-bucket']);
+
+
+List a Bucket's Content
+-----------------------
+
+This gets an ``AWS\Result`` instance, which can conveniently be accessed like an array.
+This then prints out each object's name, the file size, and last modified date.
+
+.. code-block:: php
+
+ <?php
+ $objectsListResponse = $client->listObjects(['Bucket' => $bucketname]);
+ $objects = $objectsListResponse['Contents'] ?? [];
+ foreach ($objects as $object) {
+ echo $object['Key'] . "\t" . $object['Size'] . "\t" . $object['LastModified'] . "\n";
+ }
+
+.. note::
+
+   If there are more than 1000 objects in this bucket, you need to check
+   ``$objectsListResponse['IsTruncated']`` and repeat the request, passing
+   the name of the last key listed as the ``Marker`` parameter. Keep doing
+   this until ``IsTruncated`` is no longer true.
+
+The output will look something like this if the bucket has some files::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+
+This deletes the bucket called ``my-old-bucket`` and returns an
+``AWS\Result`` object.
+
+.. note::
+
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: php
+
+ <?php
+ $client->deleteBucket(['Bucket' => 'my-old-bucket']);
+
+
+Creating an Object
+------------------
+
+This creates an object ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: php
+
+ <?php
+ $client->putObject([
+ 'Bucket' => 'my-bucket-name',
+ 'Key' => 'hello.txt',
+ 'Body' => "Hello World!"
+ ]);
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable and
+``secret_plans.txt`` private.
+
+.. code-block:: php
+
+ <?php
+ $client->putObjectAcl([
+ 'Bucket' => 'my-bucket-name',
+ 'Key' => 'hello.txt',
+ 'ACL' => 'public-read'
+ ]);
+ $client->putObjectAcl([
+ 'Bucket' => 'my-bucket-name',
+ 'Key' => 'secret_plans.txt',
+ 'ACL' => 'private'
+ ]);
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: php
+
+ <?php
+ $client->deleteObject(['Bucket' => 'my-bucket-name', 'Key' => 'goodbye.txt']);
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``poetry.pdf`` and saves it in
+``/home/larry/documents/``
+
+.. code-block:: php
+
+ <?php
+ $object = $client->getObject(['Bucket' => 'my-bucket-name', 'Key' => 'poetry.pdf']);
+ file_put_contents('/home/larry/documents/poetry.pdf', $object['Body']->getContents());
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+
+This generates an unsigned download URL for ``hello.txt``.
+This works because we made ``hello.txt`` public by setting
+the ACL above. This then generates a signed download URL
+for ``secret_plans.txt`` that will work for 1 hour.
+Signed download URLs will work for the time period even
+if the object is private (when the time period is up,
+the URL will stop working).
+
+.. code-block:: php
+
+ <?php
+ $hello_url = $client->getObjectUrl('my-bucket-name', 'hello.txt');
+ echo $hello_url."\n";
+
+ $secret_plans_cmd = $client->getCommand('GetObject', ['Bucket' => 'my-bucket-name', 'Key' => 'secret_plans.txt']);
+ $request = $client->createPresignedRequest($secret_plans_cmd, '+1 hour');
+ echo $request->getUri()."\n";
+
+The output of this will look something like::
+
+ http://objects.dreamhost.com/my-bucket-name/hello.txt
+ http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=sandboxAccessKey%2F20190116%2F%2Fs3%2Faws4_request&X-Amz-Date=20190116T125520Z&X-Amz-SignedHeaders=host&X-Amz-Expires=3600&X-Amz-Signature=61921f07c73d7695e47a2192cf55ae030f34c44c512b2160bb5a936b2b48d923
+
diff --git a/doc/radosgw/s3/python.rst b/doc/radosgw/s3/python.rst
new file mode 100644
index 000000000..35f682893
--- /dev/null
+++ b/doc/radosgw/s3/python.rst
@@ -0,0 +1,197 @@
+.. _python:
+
+Python S3 Examples
+==================
+
+Creating a Connection
+---------------------
+
+This creates a connection so that you can interact with the server.
+
+.. code-block:: python
+
+ import boto
+ import boto.s3.connection
+ access_key = 'put your access key here!'
+ secret_key = 'put your secret key here!'
+
+ conn = boto.connect_s3(
+ aws_access_key_id = access_key,
+ aws_secret_access_key = secret_key,
+ host = 'objects.dreamhost.com',
+ #is_secure=False, # uncomment if you are not using ssl
+ calling_format = boto.s3.connection.OrdinaryCallingFormat(),
+ )
+
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of Buckets that you own.
+This also prints out the bucket name and creation date of each bucket.
+
+.. code-block:: python
+
+ for bucket in conn.get_all_buckets():
+ print("{name}\t{created}".format(
+ name = bucket.name,
+ created = bucket.creation_date,
+ ))
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket called ``my-new-bucket``
+
+.. code-block:: python
+
+ bucket = conn.create_bucket('my-new-bucket')
+
+
+Listing a Bucket's Content
+--------------------------
+
+This gets a list of objects in the bucket.
+This also prints out each object's name, the file size, and last
+modified date.
+
+.. code-block:: python
+
+ for key in bucket.list():
+ print("{name}\t{size}\t{modified}".format(
+ name = key.name,
+ size = key.size,
+ modified = key.last_modified,
+ ))
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+
+.. note::
+
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: python
+
+ conn.delete_bucket(bucket.name)
+
+
+Forced Delete for Non-empty Buckets
+-----------------------------------
+
+.. attention::
+
+   Not available in Python.
+
+
+Creating an Object
+------------------
+
+This creates a file ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: python
+
+ key = bucket.new_key('hello.txt')
+ key.set_contents_from_string('Hello World!')
+
+
+Uploading an Object or a File
+-----------------------------
+
+This creates an object ``logo.png`` with the contents of the local file ``logo.png``
+
+.. code-block:: python
+
+ key = bucket.new_key('logo.png')
+ key.set_contents_from_filename('logo.png')
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable, and
+``secret_plans.txt`` private.
+
+.. code-block:: python
+
+ hello_key = bucket.get_key('hello.txt')
+ hello_key.set_canned_acl('public-read')
+ plans_key = bucket.get_key('secret_plans.txt')
+ plans_key.set_canned_acl('private')
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``perl_poetry.pdf`` and saves it in
+``/home/larry/documents/``
+
+.. code-block:: python
+
+ key = bucket.get_key('perl_poetry.pdf')
+ key.get_contents_to_filename('/home/larry/documents/perl_poetry.pdf')
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: python
+
+ bucket.delete_key('goodbye.txt')
+
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+
+This generates an unsigned download URL for ``hello.txt``. This works
+because we made ``hello.txt`` public by setting the ACL above.
+This then generates a signed download URL for ``secret_plans.txt`` that
+will work for 1 hour. Signed download URLs will work for the time
+period even if the object is private (when the time period is up, the
+URL will stop working).
+
+.. code-block:: python
+
+ hello_key = bucket.get_key('hello.txt')
+ hello_url = hello_key.generate_url(0, query_auth=False, force_http=True)
+ print(hello_url)
+
+ plans_key = bucket.get_key('secret_plans.txt')
+ plans_url = plans_key.generate_url(3600, query_auth=True, force_http=True)
+ print(plans_url)
+
+The output of this will look something like::
+
+ http://objects.dreamhost.com/my-bucket-name/hello.txt
+ http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX
+
+Using S3 API Extensions
+-----------------------
+
+To use the boto3 client to test the RadosGW extensions to the S3 API, the `extensions file`_ should be placed under the ``~/.aws/models/s3/2006-03-01/`` directory.
+For example, an unordered list of objects can be fetched using:
+
+.. code-block:: python
+
+ print(conn.list_objects(Bucket='my-new-bucket', AllowUnordered=True))
+
+
+Without the extensions file, in the above example, boto3 would complain that the ``AllowUnordered`` argument is invalid.
+
+
+.. _extensions file: https://github.com/ceph/ceph/blob/main/examples/rgw/boto3/service-2.sdk-extras.json
diff --git a/doc/radosgw/s3/ruby.rst b/doc/radosgw/s3/ruby.rst
new file mode 100644
index 000000000..435b3c630
--- /dev/null
+++ b/doc/radosgw/s3/ruby.rst
@@ -0,0 +1,364 @@
+.. _ruby:
+
+Ruby `AWS::SDK`_ Examples (aws-sdk gem ~>2)
+===========================================
+
+Settings
+---------------------
+
+You can set up the connection globally:
+
+.. code-block:: ruby
+
+ Aws.config.update(
+      endpoint: 'https://objects.dreamhost.com',
+ access_key_id: 'my-access-key',
+ secret_access_key: 'my-secret-key',
+ force_path_style: true,
+ region: 'us-east-1'
+ )
+
+
+and instantiate a client object:
+
+.. code-block:: ruby
+
+ s3_client = Aws::S3::Client.new
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of buckets that you own.
+This also prints out the bucket name and creation date of each bucket.
+
+.. code-block:: ruby
+
+ s3_client.list_buckets.buckets.each do |bucket|
+ puts "#{bucket.name}\t#{bucket.creation_date}"
+ end
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket called ``my-new-bucket``
+
+.. code-block:: ruby
+
+ s3_client.create_bucket(bucket: 'my-new-bucket')
+
+If you want a private bucket, use the ``acl`` option; it accepts
+``private``, ``public-read``, ``public-read-write``, and
+``authenticated-read``:
+
+.. code-block:: ruby
+
+ s3_client.create_bucket(bucket: 'my-new-bucket', acl: 'private')
+
+
+Listing a Bucket's Content
+--------------------------
+
+This gets a list of hashes with the contents of each object.
+This also prints out each object's name, the file size, and last
+modified date.
+
+.. code-block:: ruby
+
+    s3_client.list_objects(bucket: 'my-new-bucket').contents.each do |object|
+      puts "#{object.key}\t#{object.size}\t#{object.last_modified}"
+    end
+
+The output will look something like this if the bucket has some files::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+.. note::
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: ruby
+
+ s3_client.delete_bucket(bucket: 'my-new-bucket')
+
+
+Forced Delete for Non-empty Buckets
+-----------------------------------
+First, you need to clear the bucket:
+
+.. code-block:: ruby
+
+ Aws::S3::Bucket.new('my-new-bucket', client: s3_client).clear!
+
+Then you can delete the bucket:
+
+.. code-block:: ruby
+
+ s3_client.delete_bucket(bucket: 'my-new-bucket')
+
+
+Creating an Object
+------------------
+
+This creates a file ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: ruby
+
+ s3_client.put_object(
+ key: 'hello.txt',
+ body: 'Hello World!',
+ bucket: 'my-new-bucket',
+ content_type: 'text/plain'
+ )
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable, and
+``secret_plans.txt`` private.
+
+.. code-block:: ruby
+
+ s3_client.put_object_acl(bucket: 'my-new-bucket', key: 'hello.txt', acl: 'public-read')
+
+    s3_client.put_object_acl(bucket: 'my-new-bucket', key: 'secret_plans.txt', acl: 'private')
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``poetry.pdf`` and saves it in
+``/home/larry/documents/``
+
+.. code-block:: ruby
+
+ s3_client.get_object(bucket: 'my-new-bucket', key: 'poetry.pdf', response_target: '/home/larry/documents/poetry.pdf')
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: ruby
+
+ s3_client.delete_object(key: 'goodbye.txt', bucket: 'my-new-bucket')
+
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+
+This generates an unsigned download URL for ``hello.txt``. This works
+because we made ``hello.txt`` public by setting the ACL above.
+This then generates a signed download URL for ``secret_plans.txt`` that
+will work for 1 hour. Signed download URLs will work for the time
+period even if the object is private (when the time period is up, the
+URL will stop working).
+
+.. code-block:: ruby
+
+ puts Aws::S3::Object.new(
+ key: 'hello.txt',
+ bucket_name: 'my-new-bucket',
+ client: s3_client
+ ).public_url
+
+    puts Aws::S3::Object.new(
+      key: 'secret_plans.txt',
+      bucket_name: 'my-new-bucket',
+      client: s3_client
+    ).presigned_url(:get, expires_in: 60 * 60)
+
+The output of this will look something like::
+
+ http://objects.dreamhost.com/my-bucket-name/hello.txt
+ http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX
+
+.. _`AWS::SDK`: http://docs.aws.amazon.com/sdkforruby/api/Aws/S3/Client.html
+
+
+
+Ruby `AWS::S3`_ Examples (aws-s3 gem)
+=====================================
+
+Creating a Connection
+---------------------
+
+This creates a connection so that you can interact with the server.
+
+.. code-block:: ruby
+
+ AWS::S3::Base.establish_connection!(
+ :server => 'objects.dreamhost.com',
+ :use_ssl => true,
+ :access_key_id => 'my-access-key',
+ :secret_access_key => 'my-secret-key'
+ )
+
+
+Listing Owned Buckets
+---------------------
+
+This gets a list of `AWS::S3::Bucket`_ objects that you own.
+This also prints out the bucket name and creation date of each bucket.
+
+.. code-block:: ruby
+
+ AWS::S3::Service.buckets.each do |bucket|
+ puts "#{bucket.name}\t#{bucket.creation_date}"
+ end
+
+The output will look something like this::
+
+ mahbuckat1 2011-04-21T18:05:39.000Z
+ mahbuckat2 2011-04-21T18:05:48.000Z
+ mahbuckat3 2011-04-21T18:07:18.000Z
+
+
+Creating a Bucket
+-----------------
+
+This creates a new bucket called ``my-new-bucket``
+
+.. code-block:: ruby
+
+ AWS::S3::Bucket.create('my-new-bucket')
+
+
+Listing a Bucket's Content
+--------------------------
+
+This gets a list of hashes with the contents of each object.
+This also prints out each object's name, the file size, and last
+modified date.
+
+.. code-block:: ruby
+
+ new_bucket = AWS::S3::Bucket.find('my-new-bucket')
+ new_bucket.each do |object|
+ puts "#{object.key}\t#{object.about['content-length']}\t#{object.about['last-modified']}"
+ end
+
+The output will look something like this if the bucket has some files::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Deleting a Bucket
+-----------------
+.. note::
+ The Bucket must be empty! Otherwise it won't work!
+
+.. code-block:: ruby
+
+ AWS::S3::Bucket.delete('my-new-bucket')
+
+
+Forced Delete for Non-empty Buckets
+-----------------------------------
+
+.. code-block:: ruby
+
+ AWS::S3::Bucket.delete('my-new-bucket', :force => true)
+
+
+Creating an Object
+------------------
+
+This creates a file ``hello.txt`` with the string ``"Hello World!"``
+
+.. code-block:: ruby
+
+ AWS::S3::S3Object.store(
+ 'hello.txt',
+ 'Hello World!',
+ 'my-new-bucket',
+ :content_type => 'text/plain'
+ )
+
+
+Change an Object's ACL
+----------------------
+
+This makes the object ``hello.txt`` publicly readable, and
+``secret_plans.txt`` private.
+
+.. code-block:: ruby
+
+ policy = AWS::S3::S3Object.acl('hello.txt', 'my-new-bucket')
+ policy.grants = [ AWS::S3::ACL::Grant.grant(:public_read) ]
+ AWS::S3::S3Object.acl('hello.txt', 'my-new-bucket', policy)
+
+ policy = AWS::S3::S3Object.acl('secret_plans.txt', 'my-new-bucket')
+ policy.grants = []
+ AWS::S3::S3Object.acl('secret_plans.txt', 'my-new-bucket', policy)
+
+
+Download an Object (to a file)
+------------------------------
+
+This downloads the object ``poetry.pdf`` and saves it in
+``/home/larry/documents/``
+
+.. code-block:: ruby
+
+ open('/home/larry/documents/poetry.pdf', 'w') do |file|
+ AWS::S3::S3Object.stream('poetry.pdf', 'my-new-bucket') do |chunk|
+ file.write(chunk)
+ end
+ end
+
+
+Delete an Object
+----------------
+
+This deletes the object ``goodbye.txt``
+
+.. code-block:: ruby
+
+ AWS::S3::S3Object.delete('goodbye.txt', 'my-new-bucket')
+
+
+Generate Object Download URLs (signed and unsigned)
+---------------------------------------------------
+
+This generates an unsigned download URL for ``hello.txt``. This works
+because we made ``hello.txt`` public by setting the ACL above.
+This then generates a signed download URL for ``secret_plans.txt`` that
+will work for 1 hour. Signed download URLs will work for the time
+period even if the object is private (when the time period is up, the
+URL will stop working).
+
+.. code-block:: ruby
+
+ puts AWS::S3::S3Object.url_for(
+ 'hello.txt',
+ 'my-new-bucket',
+ :authenticated => false
+ )
+
+ puts AWS::S3::S3Object.url_for(
+ 'secret_plans.txt',
+ 'my-new-bucket',
+ :expires_in => 60 * 60
+ )
+
+The output of this will look something like::
+
+ http://objects.dreamhost.com/my-bucket-name/hello.txt
+ http://objects.dreamhost.com/my-bucket-name/secret_plans.txt?Signature=XXXXXXXXXXXXXXXXXXXXXXXXXXX&Expires=1316027075&AWSAccessKeyId=XXXXXXXXXXXXXXXXXXX
+
+.. _`AWS::S3`: http://amazon.rubyforge.org/
+.. _`AWS::S3::Bucket`: http://amazon.rubyforge.org/doc/
+
diff --git a/doc/radosgw/s3/serviceops.rst b/doc/radosgw/s3/serviceops.rst
new file mode 100644
index 000000000..54b6ca375
--- /dev/null
+++ b/doc/radosgw/s3/serviceops.rst
@@ -0,0 +1,69 @@
+Service Operations
+==================
+
+List Buckets
+------------
+``GET /`` returns a list of buckets created by the user making the request. ``GET /`` only
+returns buckets created by an authenticated user. You cannot make an anonymous request.
+
+Syntax
+~~~~~~
+::
+
+ GET / HTTP/1.1
+ Host: cname.domain.com
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++----------------------------+-------------+-----------------------------------------------------------------+
+| Name | Type | Description |
++============================+=============+=================================================================+
+| ``Buckets`` | Container | Container for list of buckets. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``Bucket`` | Container | Container for bucket information. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``Name`` | String | Bucket name. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``CreationDate`` | Date | UTC time when the bucket was created. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``ListAllMyBucketsResult`` | Container | A container for the result. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``Owner`` | Container | A container for the bucket owner's ``ID`` and ``DisplayName``. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``ID`` | String | The bucket owner's ID. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``DisplayName`` | String | The bucket owner's display name. |
++----------------------------+-------------+-----------------------------------------------------------------+
+
+
+Get Usage Stats
+---------------
+
+Gets usage stats per user, similar to the admin command :ref:`rgw_user_usage_stats`.
+
+Syntax
+~~~~~~
+::
+
+ GET /?usage HTTP/1.1
+ Host: cname.domain.com
+
+ Authorization: AWS {access-key}:{hash-of-header-and-secret}
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
++----------------------------+-------------+-----------------------------------------------------------------+
+| Name | Type | Description |
++============================+=============+=================================================================+
+| ``Summary`` | Container | Summary of total stats by user. |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``TotalBytes``             | Integer     | Bytes used by the user                                          |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``TotalBytesRounded`` | Integer | Bytes rounded to the nearest 4k boundary |
++----------------------------+-------------+-----------------------------------------------------------------+
+| ``TotalEntries`` | Integer | Total object entries |
++----------------------------+-------------+-----------------------------------------------------------------+
diff --git a/doc/radosgw/s3select.rst b/doc/radosgw/s3select.rst
new file mode 100644
index 000000000..d46d4e96f
--- /dev/null
+++ b/doc/radosgw/s3select.rst
@@ -0,0 +1,796 @@
+===============
+ Ceph s3 select
+===============
+
+.. contents::
+
+Overview
+--------
+
+The **S3 Select** engine creates an efficient pipe between clients and Ceph
+back end nodes. The S3 Select engine works best when implemented as closely as
+possible to back end storage.
+
+The S3 Select engine makes it possible to use an SQL-like syntax to select a
+restricted subset of data stored in an S3 object. The S3 Select engine
+facilitates the use of higher level, analytic applications (for example:
+SPARK-SQL). The ability of the S3 Select engine to target a proper subset of
+structured data within an S3 object decreases latency and increases throughput.
+
+For example: assume that a user needs to extract a single column that is
+filtered by another column, and that these columns are stored in a CSV file in
+an S3 object that is several GB in size. The following query performs this
+extraction: ``select customer-id from s3Object where age>30 and age<65;``
+
+Without the use of S3 Select, the whole S3 object must be retrieved from an OSD
+via RGW before the data is filtered and extracted. Significant network and CPU
+overhead is saved by "pushing down" the query into radosgw.
+
+**The bigger the object and the more selective the query,
+the better the performance of s3select**.
+
+Basic Workflow
+--------------
+
+S3 Select queries are sent to RGW via `AWS-CLI
+<https://docs.aws.amazon.com/cli/latest/reference/s3api/select-object-content.html>`_
+
+S3 Select passes the authentication and permission parameters as an incoming
+message (POST). ``RGWSelectObj_ObjStore_S3::send_response_data`` is the entry
+point and handles each fetched chunk according to the object key that was
+input. ``send_response_data`` is the first to handle the input query: it
+extracts the query and other CLI parameters.
+
+RGW executes an S3 Select query on each newly fetched chunk (up to 4 MB). The
+current implementation supports CSV objects. CSV rows are sometimes "cut" in
+the middle by the limits of the chunks, and those broken-lines (the first or
+last per chunk) are skipped while processing the query. Such broken lines are
+stored and later merged with the next broken line (which belongs to the next
+chunk), and only then processed.
+
+For each processed chunk, an output message is formatted according to `aws
+specification
+<https://docs.aws.amazon.com/amazons3/latest/api/archive-restobjectselectcontent.html#archive-restobjectselectcontent-responses>`_
+and sent back to the client. RGW supports the following response:
+``{:event-type,records} {:content-type,application/octet-stream}
+{:message-type,event}``. For aggregation queries, the last chunk should be
+identified as the end of input.
+
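+The same query can also be sent from Python. A minimal boto3 sketch of the
+example query above (the endpoint and names are placeholders):
+
+.. code-block:: python
+
+    import boto3
+
+    s3 = boto3.client('s3', endpoint_url='http://objects.example.com')
+    resp = s3.select_object_content(
+        Bucket='my-new-bucket', Key='customers.csv',
+        ExpressionType='SQL',
+        Expression='select customer-id from s3Object where age>30 and age<65;',
+        InputSerialization={'CSV': {}},
+        OutputSerialization={'CSV': {}},
+    )
+    # the response payload is an event stream; Records events carry the rows
+    for event in resp['Payload']:
+        if 'Records' in event:
+            print(event['Records']['Payload'].decode())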
+
+Basic Functionalities
+~~~~~~~~~~~~~~~~~~~~~
+
+**S3select** has a definite set of functionalities compliant with AWS.
+
+The implemented software architecture supports basic arithmetic expressions,
+logical and compare expressions, including nested function calls and casting
+operators, which gives the user great flexibility.
+
+review the below s3-select-feature-table_.
+
+
+Error Handling
+~~~~~~~~~~~~~~
+
+Upon detecting an error, RGW returns 400-Bad-Request, and a specific error message is sent back to the client.
+Currently, there are two main types of errors.
+
+**Syntax error**: the s3select parser rejects user requests that are not aligned with the parser syntax definitions, as
+described in this documentation.
+Upon a syntax error, the engine creates an error message that points to the location of the error.
+RGW sends back the error message in a specific error response.
+
+**Processing-time error**: the runtime engine may detect errors that occur only at processing time. For this type of
+error, a different error message describes the problem.
+RGW sends back the error message in a specific error response.
+
+.. _s3-select-feature-table:
+
+Features Support
+----------------
+
+Currently only part of the `AWS select command
+<https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-select.html>`_
+is implemented. The following table describes the currently supported s3-select
+functionalities:
+
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Feature | Detailed | Example / Description |
++=================================+=================+=======================================================================+
+| Arithmetic operators | ^ * % / + - ( ) | select (int(_1)+int(_2))*int(_9) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| | ``%`` modulo | select count(*) from s3object where cast(_1 as int)%2 = 0; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| | ``^`` power-of | select cast(2^10 as int) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Compare operators | > < >= <= = != | select _1,_2 from s3object where (int(_1)+int(_3))>int(_5); |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | AND OR NOT | select count(*) from s3object where not (int(_1)>123 and int(_5)<200);|
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | is null | return true/false for null indication in expression |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | is not null | return true/false for null indication in expression |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator and NULL | unknown state | review null-handle_ to see how logical operators behave with null. |
+| | | the following query returns **0**. |
+| | | |
+| | | select count(*) from s3object where null and (3>2); |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Arithmetic operator with NULL | unknown state | review null-handle_ to see the results of binary operations with NULL.|
+| | | the following query returns **0**. |
+| | | |
+| | | select count(*) from s3object where (null+1) and (3>2); |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| compare with NULL | unknown state | review null-handle_ to see the results of compare operations with NULL|
+| | | the following query returns **0**. |
+| | | |
+| | | select count(*) from s3object where (null*1.5) != 3; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| missing column | unknown state | select count(*) from s3object where _1 is null; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| query is filtering rows where the predicate | select count(*) from s3object where (_1 > 12 and _2 = 0) is not null; |
+| returns non-null results. | |
+| this predicate returns null | |
+| when _1 or _2 is null | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| projection column | similar to | select case |
+| | switch/case | cast(_1 as int) + 1 |
+| | default | when 2 then "a" |
+| | | when 3 then "b" |
+| | | else "c" end from s3object; |
+| | | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| projection column | similar to | select case |
+| | if/then/else | when (1+1=(2+1)*3) then 'case_1' |
+| | | when ((4*3)=(12)) then 'case_2' |
+| | | else 'case_else' end, |
+| | | age*2 from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | ``coalesce {expression,expression ...} :: return first non-null argument`` |
+| | |
+| | select coalesce(nullif(5,5),nullif(1,1.0),age+12) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | ``nullif {expr1,expr2} ::return null in case both arguments are equal,`` |
+| | ``or else the first one`` |
+| | |
+| | select nullif(cast(_1 as int),cast(_2 as int)) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | ``{expression} in ( .. {expression} ..)`` |
+| | |
+| | select count(*) from s3object |
+| | where 'ben' in (trim(_5),substring(_1,char_length(_1)-3,3),last_name); |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | ``{expression} between {expression} and {expression}`` |
+| | |
+| | select count(*) from s3object |
+| | where substring(_3,char_length(_3),1) between "x" and trim(_1) |
+| | and substring(_3,char_length(_3)-1,1) = ":"; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| logical operator | ``{expression} like {match-pattern}`` |
+| | |
+| | select count(*) from s3object where first_name like '%de_'; |
+| | |
+| | select count(*) from s3object where _1 like "%a[r-s]"; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| | ``{expression} like {match-pattern} escape {char}`` |
+| | |
+| logical operator | select count(*) from s3object where "jok_ai" like "%#_ai" escape "#"; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| true / false | select (cast(_1 as int)>123 = true) from s3object |
+| predicate as a projection | where address like '%new-york%'; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| an alias to | select (_1 like "_3_") as *likealias*,_1 from s3object |
+| predicate as a projection | where *likealias* = true and cast(_1 as int) between 800 and 900; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| casting operator | select cast(123 as int)%2 from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| casting operator | select cast(123.456 as float)%2 from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| casting operator | select cast('ABC0-9' as string),cast(substr('ab12cd',3,2) as int)*4 from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| casting operator | select cast(5 as bool) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| casting operator | select cast(substring('publish on 2007-01-01',12,10) as timestamp) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| non AWS casting operator | select int(_1),int( 1.2 + 3.4) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| non AWS casting operator | select float(1.2) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| non AWS casting operator | select to_timestamp('1999-10-10T12:23:44Z') from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Aggregation Function | sum | select sum(int(_1)) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Aggregation Function | avg | select avg(cast(_1 as float) + cast(_2 as int)) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Aggregation Function | min | select min( int(_1) * int(_5) ) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Aggregation Function | max | select max(float(_1)),min(int(_5)) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Aggregation Function | count | select count(*) from s3object where (int(_1)+int(_3))>int(_5); |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Timestamp Functions | extract | select count(*) from s3object where |
+| | | extract(year from to_timestamp(_2)) > 1950 |
+| | | and extract(year from to_timestamp(_1)) < 1960; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Timestamp Functions | date_add | select count(0) from s3object where |
+| | | date_diff(year,to_timestamp(_1),date_add(day,366, |
+| | | to_timestamp(_1))) = 1; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Timestamp Functions | date_diff | select count(0) from s3object where |
+| | | date_diff(month,to_timestamp(_1),to_timestamp(_2)) = 2; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Timestamp Functions | utcnow | select count(0) from s3object where |
+| | | date_diff(hours,utcnow(),date_add(day,1,utcnow())) = 24; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Timestamp Functions | to_string | select to_string( |
+| | | to_timestamp("2009-09-17T17:56:06.234567Z"), |
+| | | "yyyyMMdd-H:m:s") from s3object; |
+| | | |
+| | | ``result: "20090917-17:56:6"`` |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | substring | select count(0) from s3object where |
+| | | int(substring(_1,1,4))>1950 and int(substring(_1,1,4))<1960; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| substring with a negative ``from`` number is valid; | select substring("123456789" from -4) from s3object; |
+| it is treated as the first position | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| substring with ``from`` zero and an out-of-bound ``for`` | select substring("123456789" from 0 for 100) from s3object; |
+| number is valid, behaving like (first,last) | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | trim | select trim(' foobar ') from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | trim | select trim(trailing from ' foobar ') from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | trim | select trim(leading from ' foobar ') from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | trim | select trim(both '12' from '1112211foobar22211122') from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | lower/upper | select lower('ABcD12#$e') from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| String Functions | char_length | select count(*) from s3object where char_length(_3)=3; |
+| | character_length| |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Complex queries | select sum(cast(_1 as int)), |
+| | max(cast(_3 as int)), |
+| | substring('abcdefghijklm',(2-1)*3+sum(cast(_1 as int))/sum(cast(_1 as int))+1, |
+| | (count() + count(0))/count(0)) from s3object; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| alias support | | select int(_1) as a1, int(_2) as a2 , (a1+a2) as a3 |
+| | | from s3object where a3>100 and a3<300; |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+
+.. _null-handle:
+
+NULL
+~~~~
+NULL is a legitimate value in ceph-s3select systems, as in other DB systems; that is, the engine must handle the case in which a value is NULL.
+
+In our context NULL means missing/unknown. In that sense, **NULL cannot produce a value in ANY arithmetic operation** (a + NULL produces NULL).
+
+The same applies to arithmetic comparison: **any comparison to NULL is NULL**, i.e. unknown.
+Below is a truth table covering the NULL use cases.
+
++---------------------------------+-----------------------------+
+| A is NULL | Result (NULL=UNKNOWN) |
++=================================+=============================+
+| NOT A | NULL |
++---------------------------------+-----------------------------+
+| A OR False | NULL |
++---------------------------------+-----------------------------+
+| A OR True | True |
++---------------------------------+-----------------------------+
+| A OR A | NULL |
++---------------------------------+-----------------------------+
+| A AND False | False |
++---------------------------------+-----------------------------+
+| A AND True | NULL |
++---------------------------------+-----------------------------+
+| A AND A | NULL |
++---------------------------------+-----------------------------+
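+
+The table can be modeled with SQL's three-valued logic. The following sketch
+(our own illustration, with Python's ``None`` standing in for NULL, not engine
+code) reproduces the table and shows why a NULL predicate filters a row out:
+
+::
+
+    # SQL three-valued logic: None stands for NULL/unknown.
+    def sql_not(a):
+        return None if a is None else (not a)
+
+    def sql_and(a, b):
+        if a is False or b is False:
+            return False  # False AND anything is False
+        if a is None or b is None:
+            return None   # otherwise NULL propagates
+        return True
+
+    def sql_or(a, b):
+        if a is True or b is True:
+            return True   # True OR anything is True
+        if a is None or b is None:
+            return None   # otherwise NULL propagates
+        return False
+
+    # A WHERE clause keeps a row only when the predicate is strictly True,
+    # which is why the NULL-predicate queries above return 0.
+    assert sql_and(None, True) is None
+    assert sql_or(None, False) is None
+    assert sql_or(None, True) is True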
+
+S3-select Function Interfaces
+-----------------------------
+
+Timestamp Functions
+~~~~~~~~~~~~~~~~~~~
+The timestamp functionalities as described in the `AWS-specs
+<https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-date.html>`_
+are fully implemented.
+
+ ``to_timestamp( string )`` : The casting operator converts a string to the
+ timestamp basic type. The to_timestamp operator is able to convert the
+ following string formats into timestamp:
+ ``YYYY-MM-DDTHH:mm:ss.SSSSSS+/-HH:mm`` , ``YYYY-MM-DDTHH:mm:ss.SSSSSSZ`` ,
+ ``YYYY-MM-DDTHH:mm:ss+/-HH:mm`` , ``YYYY-MM-DDTHH:mm:ssZ`` ,
+ ``YYYY-MM-DDTHH:mm+/-HH:mm`` , ``YYYY-MM-DDTHH:mmZ`` , ``YYYY-MM-DDT`` and
+ ``YYYYT``. Where the time (or part of it) is missing in the string, zeros
+ replace the missing parts; for a missing month or day, 1 is the default
+ value. The timezone part is in the format ``+/-HH:mm`` or ``Z`` , where the
+ letter "Z" indicates Coordinated Universal Time (UTC). The timezone value
+ can range between -12:00 and +14:00.
+
+ ``extract(date-part from timestamp)`` : The function extracts the date-part
+ from the input timestamp and returns it as an integer. Supported date-parts:
+ year, month, week, day, hour, minute, second, timezone_hour, timezone_minute.
+
+ ``date_add(date-part, quantity, timestamp)`` : The function adds the quantity
+ (integer) to the date-part of the timestamp and returns the result as a
+ timestamp. It also includes the timezone in the calculation. Supported
+ date-parts: year, month, day, hour, minute, second.
+
+ ``date_diff(date-part, timestamp, timestamp)`` : The function returns an
+ integer, the calculated difference between the two timestamps according to
+ the date-part. It includes the timezone in the calculation. Supported
+ date-parts: year, month, day, hour, minute, second.
+
+ ``utcnow()`` : returns the timestamp of the current time.
+
+ ``to_string(timestamp, format_pattern)`` : returns a string representation of
+ the input timestamp in the given format.
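+
+For example, given the defaulting rules above, the following conversions would
+be expected (the comments show illustrative values; the engine's exact textual
+rendering of a timestamp may differ):
+
+::
+
+    select to_timestamp('2007T') from s3object;
+    -- 2007-01-01T00:00:00Z (missing month and day default to 1, time to zeros)
+
+    select to_timestamp('2007-09-17T17:56:06.234567Z') from s3object;
+    -- 2007-09-17T17:56:06.234567Z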
+
+to_string parameters
+~~~~~~~~~~~~~~~~~~~~
+
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| Format | Example | Description |
++==============+=================+===================================================================================+
+| yy | 69 | 2-digit year |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| y | 1969 | 4-digit year |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| yyyy | 1969 | Zero-padded 4-digit year |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| M | 1 | Month of year |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| MM | 01 | Zero-padded month of year |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| MMM | Jan | Abbreviated month of year name |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| MMMM | January | Full month of year name |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| MMMMM | J | Month of year first letter (NOTE: not valid for use with to_timestamp function) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| d | 2 | Day of month (1-31) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| dd | 02 | Zero-padded day of month (01-31) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| a | AM | AM or PM of day |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| h | 3 | Hour of day (1-12) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| hh | 03 | Zero-padded hour of day (01-12) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| H | 3 | Hour of day (0-23) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| HH | 03 | Zero-padded hour of day (00-23) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| m | 4 | Minute of hour (0-59) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| mm | 04 | Zero-padded minute of hour (00-59) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| s | 5 | Second of minute (0-59) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| ss | 05 | Zero-padded second of minute (00-59) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| S | 0 | Fraction of second (precision: 0.1, range: 0.0-0.9) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| SS | 6 | Fraction of second (precision: 0.01, range: 0.0-0.99) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| SSS | 60 | Fraction of second (precision: 0.001, range: 0.0-0.999) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| SSSSSS | 60000000 | Fraction of second (maximum precision: 1 nanosecond, range: 0.0-0.999999999) |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| n | 60000000 | Nano of second |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| X | +07 or Z | Offset in hours or "Z" if the offset is 0 |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| XX or XXXX | +0700 or Z | Offset in hours and minutes or "Z" if the offset is 0 |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| XXX or XXXXX | +07:00 or Z | Offset in hours and minutes or "Z" if the offset is 0 |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| x | 7 | Offset in hours |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| xx or xxxx | 700 | Offset in hours and minutes |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+| xxx or xxxxx | +07:00 | Offset in hours and minutes |
++--------------+-----------------+-----------------------------------------------------------------------------------+
+
+
+Aggregation Functions
+~~~~~~~~~~~~~~~~~~~~~
+
+``count()`` : returns an integer based on the number of rows that match the condition (if one exists).
+
+``sum(expression)`` : returns the sum of the expression over all rows that match the condition (if one exists).
+
+``avg(expression)`` : returns the average of the expression over all rows that match the condition (if one exists).
+
+``max(expression)`` : returns the maximal result of the expression over all rows that match the condition (if one exists).
+
+``min(expression)`` : returns the minimal result of the expression over all rows that match the condition (if one exists).
+
+String Functions
+~~~~~~~~~~~~~~~~
+
+``substring(string, from, for)`` : substring( string ``from`` start [ ``for`` length ] )
+returns the part of the input string defined by the ``from`` and ``for`` inputs;
+both ``substring(string from start)`` and ``substring(string from start for length)``
+forms are supported.
+
+``char_length`` : returns the number of characters in a string (``character_length`` does the same).
+
+``trim`` : trim ( [[``leading`` | ``trailing`` | ``both`` remove_chars] ``from``] string )
+trims leading/trailing (or both) characters from the target string; the default is the blank character.
+
+``upper`` / ``lower`` : convert characters into uppercase/lowercase.
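+
+A sketch of the substring coordinate mapping described above (our own
+illustration of the behavior, not the engine's code):
+
+::
+
+    def sql_substring(s, start, length=None):
+        # SQL positions are 1-based; the selected window is
+        # [start, start + length) clipped to [1, len(s)].
+        end = len(s) + 1 if length is None else start + length
+        begin = max(start, 1)
+        end = min(end, len(s) + 1)
+        return s[begin - 1:end - 1] if end > begin else ""
+
+    assert sql_substring("123456789", -4) == "123456789"      # negative from
+    assert sql_substring("123456789", 0, 100) == "123456789"  # zero from, out-of-bound for
+    assert sql_substring("123456789", 2, 3) == "234"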
+
+SQL Limit Operator
+~~~~~~~~~~~~~~~~~~
+
+The SQL LIMIT operator is used to limit the number of rows processed by the query.
+Upon reaching the limit set by the user, RGW stops fetching additional chunks.
+TODO : add examples, for aggregation and non-aggregation queries.
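+
+Pending those examples, the queries below illustrate the operator (these are
+our own examples, assuming the standard SQL LIMIT syntax):
+
+::
+
+    select _1 from s3object limit 10;        -- stop after 10 projected rows
+    select count(0) from s3object limit 100; -- presumably aggregates over the first 100 rows only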
+
+Alias
+~~~~~
+The **alias** programming construct is an essential part of the s3-select language. It enables much better programming, especially with objects containing many columns or with complex queries.
+
+Upon parsing a statement that contains an alias construct, the engine replaces the alias with a reference to the correct projection column; at query execution time the reference is evaluated like any other expression.
+
+There is a risk that a self (or cyclic) reference may occur, causing a stack overflow (endless loop); to address this concern, an alias is validated for cyclic references when it is evaluated.
+
+An alias also maintains a result cache, meaning that successive uses of a given alias do not evaluate the expression again; the result is instead returned from the cache.
+
+With each new row the cache is invalidated, as the results may then differ.
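+
+A sketch of this evaluation strategy (our own illustration; the class and
+method names are hypothetical, not the engine's):
+
+::
+
+    class AliasRef:
+        """A reference to an aliased projection; caches its value per row."""
+        def __init__(self, expression):
+            self.expression = expression  # callable that evaluates the aliased expression
+            self.cached = None
+            self.valid = False
+
+        def evaluate(self, row):
+            if not self.valid:  # first use on this row: compute and cache
+                self.cached = self.expression(row)
+                self.valid = True
+            return self.cached  # successive uses return the cached result
+
+        def new_row(self):
+            self.valid = False  # each new row invalidates the cache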
+
+Testing
+~~~~~~~
+
+``s3select`` contains several testing frameworks which provide wide coverage of its functionalities.
+
+(1) Comparison of results against a trusted engine. Because the syntax of arithmetic and logical expressions is identical in s3select and C, the C/C++ compiler can serve as a trusted expression evaluator:
+the framework runs equal expressions in both and validates that their results match.
+A dedicated expression generator produces different sets of expressions for each new test session.
+
+(2) Comparison of results of queries whose syntax is different but which are semantically equivalent.
+This kind of test validates that different runtime flows produce an identical result
+on each run with a different, random dataset.
+
+For example, on a dataset that contains random numbers (1-1000),
+the following queries will produce identical results.
+``select count(*) from s3object where char_length(_3)=3;``
+``select count(*) from s3object where cast(_3 as int)>99 and cast(_3 as int)<1000;``
+
+(3) A constant dataset, the conventional way of testing: a query processes a constant dataset, and its result is validated against constant results.
+
+Additional Syntax Support
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+S3select syntax supports table aliases: ``select s._1 from s3object s where s._2 = '4';``
+
+S3select syntax is case insensitive: ``Select SUM(Cast(_1 as int)) FROM S3Object;``
+
+S3select syntax supports statements without a closing semicolon: ``select count(*) from s3object``
+
+
+Sending Query to RGW
+--------------------
+
+Any HTTP client can send an ``s3-select`` request to RGW; the request must be compliant with the `AWS Request syntax <https://docs.aws.amazon.com/AmazonS3/latest/API/API_SelectObjectContent.html#API_SelectObjectContent_RequestSyntax>`_.
+
+When sending an ``s3-select`` request to RGW using the AWS CLI, clients must follow the `AWS command reference <https://docs.aws.amazon.com/cli/latest/reference/s3api/select-object-content.html>`_.
+Below is an example:
+
+::
+
+ aws --endpoint-url http://localhost:8000 s3api select-object-content \
+  --bucket {BUCKET-NAME} \
+  --expression-type 'SQL' \
+  --scan-range '{"Start" : 1000, "End" : 1000000}' \
+  --input-serialization \
+  '{"CSV": {"FieldDelimiter": "," , "QuoteCharacter": "\"" , "RecordDelimiter" : "\n" , "QuoteEscapeCharacter" : "\\" , "FileHeaderInfo": "USE" }, "CompressionType": "NONE"}' \
+  --output-serialization '{"CSV": {"FieldDelimiter": ":", "RecordDelimiter":"\t", "QuoteFields": "ALWAYS"}}' \
+  --key {OBJECT-NAME} \
+  --request-progress '{"Enabled": true}' \
+  --expression "select count(0) from s3object where int(_1)<10;" output.csv
+
+Input Serialization
+~~~~~~~~~~~~~~~~~~~
+
+**FileHeaderInfo** -> (string)
+Describes the first line of input. Valid values are:
+
+**NONE**: The first line is not a header.
+
+**IGNORE**: The first line is a header, but you can't use the header values to indicate a column in an expression.
+It is still possible to use column positions (such as _1, _2, …) to indicate columns (``SELECT s._1 FROM S3OBJECT s``).
+
+**USE**: The first line is a header, and you can use the header values to identify a column in an expression (``SELECT column_name FROM S3OBJECT``).
+
+**QuoteEscapeCharacter** -> (string)
+A single character used for escaping the quotation-mark character inside an already escaped value.
+
+**RecordDelimiter** -> (string)
+A single character used to separate individual records in the input. You can specify an arbitrary delimiter instead of the default value.
+
+**FieldDelimiter** -> (string)
+A single character used to separate individual fields in a record. You can specify an arbitrary delimiter.
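+
+For example, with ``"FileHeaderInfo": "USE"`` a query can refer to columns by
+the names in the CSV header line (the bucket, object, and column names below
+are placeholders):
+
+::
+
+    aws --endpoint-url http://localhost:8000 s3api select-object-content \
+     --bucket {BUCKET-NAME} --key {OBJECT-NAME} \
+     --expression-type 'SQL' \
+     --input-serialization '{"CSV": {"FileHeaderInfo": "USE"}, "CompressionType": "NONE"}' \
+     --output-serialization '{"CSV": {}}' \
+     --expression "select customer_id from s3object where cast(age as int) > 30;" /dev/stdout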
+
+Output Serialization
+~~~~~~~~~~~~~~~~~~~~
+
+**AWS CLI example**::
+
+    aws s3api select-object-content \
+    --bucket "mybucket" \
+    --key keyfile1 \
+    --expression "SELECT * FROM s3object s" \
+    --expression-type 'SQL' \
+    --request-progress '{"Enabled": false}' \
+    --input-serialization '{"CSV": {"FieldDelimiter": ","}, "CompressionType": "NONE"}' \
+    --output-serialization '{"CSV": {"FieldDelimiter": ":", "RecordDelimiter":"\\t", "QuoteFields": "ALWAYS"}}' /dev/stdout
+
+**QuoteFields** -> (string)
+Indicates whether to use quotation marks around output fields.
+
+**ALWAYS**: Always use quotation marks for output fields.
+
+**ASNEEDED** (not implemented): Use quotation marks for output fields when needed.
+
+**RecordDelimiter** -> (string)
+A single character used to separate individual records in the output. You can specify an arbitrary delimiter instead of the default value.
+
+**FieldDelimiter** -> (string)
+The value used to separate individual fields in a record. You can specify an arbitrary delimiter.
+
+Scan Range Option
+~~~~~~~~~~~~~~~~~
+
+ The scan range option to AWS-CLI enables the client to scan and process only a selected part of the object.
+ This option reduces input/output operations and bandwidth by skipping parts of the object that are not of interest.
+ TODO : different data-sources (CSV, JSON, Parquet)
+
+CSV Parsing Behavior
+--------------------
+
+ The ``s3-select`` engine contains a CSV parser, which parses s3-objects as follows.
+
+ - Each row ends with ``row-delimiter``.
+ - ``field-separator`` separates adjacent columns; successive instances of ``field-separator`` define a NULL column.
+ - ``quote-character`` overrides ``field-separator``, meaning that ``field-separator`` is treated like any other character between quotes.
+ - ``escape-character`` disables the interpretation of special characters, except for ``row-delimiter``.
+
+ Below are examples of CSV parsing rules.
+
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Feature | Description | input ==> tokens |
++=================================+=================+=======================================================================+
+| NULL | successive | ,,1,,2, ==> {null}{null}{1}{null}{2}{null} |
+| | field delimiter | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| QUOTE | quote character | 11,22,"a,b,c,d",last ==> {11}{22}{"a,b,c,d"}{last} |
+| | overrides | |
+| | field delimiter | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| Escape | escape char | 11,22,str=\\"abcd\\"\\,str2=\\"123\\",last |
+| | overrides | ==> {11}{22}{str="abcd",str2="123"}{last} |
+| | meta-character. | |
+| | escape removed | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| row delimiter | no close quote, | 11,22,a="str,44,55,66 |
+| | row delimiter is| ==> {11}{22}{a="str,44,55,66} |
+| | closing line | |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+| csv header info | FileHeaderInfo | "**USE**" value means each token on first line is column-name, |
+| | tag | "**IGNORE**" value means to skip the first line |
++---------------------------------+-----------------+-----------------------------------------------------------------------+
+
+JSON
+--------------------
+
+A JSON reader has been integrated with the ``s3select-engine``, which allows the client to use SQL statements to scan and extract information from JSON documents.
+It should be noted that the data readers and parsers for CSV, Parquet, and JSON documents are separated from the SQL engine itself, so all of these readers use the same SQL engine.
+
+It's important to note that values in a JSON document can be nested in various ways, such as within objects or arrays.
+These objects and arrays can be nested within each other without any limitations.
+When using SQL to query a specific value in a JSON document, the client must specify the location of the value
+via a path in the SELECT statement.
+
+The SQL engine processes the SELECT statement in a row-based fashion.
+It uses the columns specified in the statement to perform its projection calculation, and each row contains values for these columns.
+In other words, the SQL engine processes each row one at a time (and aggregates results), using the values in the columns to perform SQL calculations.
+However, the generic structure of a JSON document does not have a row-and-column structure like CSV or Parquet.
+Instead, it is the SQL statement itself that defines the rows and columns when querying a JSON document.
+
+When querying JSON documents using SQL, the FROM clause in the SELECT statement defines the row boundaries.
+This is analogous to the way the row delimiter defines rows when querying CSV objects, and the way row groups define rows when querying Parquet objects.
+The statement "SELECT ... FROM s3object[*].aaa.bb.cc" instructs the reader to search for the path "aaa.bb.cc" and defines the row boundaries based on the occurrence of this path.
+A row begins when the reader encounters the path, and it ends when the reader exits the innermost part of the path, which in this case is the object "cc".
+
+NOTE : The semantics of querying JSON documents may change and may not remain the same as the methodology currently described.
+
+TODO : relevant example for object and array values.
+
+A JSON Query Example
+--------------------
+
+::
+
+ {
+ "firstName": "Joe",
+ "lastName": "Jackson",
+ "gender": "male",
+ "age": "twenty",
+ "address": {
+ "streetAddress": "101",
+ "city": "San Diego",
+ "state": "CA"
+ },
+
+ "firstName": "Joe_2",
+ "lastName": "Jackson_2",
+ "gender": "male",
+ "age": 21,
+ "address": {
+ "streetAddress": "101",
+ "city": "San Diego",
+ "state": "CA"
+ },
+
+ "phoneNumbers": [
+ { "type": "home1", "number": "734928_1","addr": 11 },
+ { "type": "home2", "number": "734928_2","addr": 22 },
+ { "type": "home3", "number": "734928_3","addr": 33 },
+ { "type": "home4", "number": "734928_4","addr": 44 },
+ { "type": "home5", "number": "734928_5","addr": 55 },
+ { "type": "home6", "number": "734928_6","addr": 66 },
+ { "type": "home7", "number": "734928_7","addr": 77 },
+ { "type": "home8", "number": "734928_8","addr": 88 },
+ { "type": "home9", "number": "734928_9","addr": 99 },
+ { "type": "home10", "number": "734928_10","addr": 100 }
+ ],
+
+ "key_after_array": "XXX",
+
+ "description" : {
+ "main_desc" : "value_1",
+ "second_desc" : "value_2"
+ }
+ }
+
+ # the from-clause defines a single row.
+ # _1 points to the root object level.
+ # _1.age appears twice in the document-row; the last value is used for the operation.
+ query = "select _1.firstname,_1.key_after_array,_1.age+4,_1.description.main_desc,_1.description.second_desc from s3object[*];";
+ expected_result = Joe_2,XXX,25,value_1,value_2
+
+
+ # the from-clause points to the phonenumbers array (it defines _1).
+ # each element in the phoneNumbers array defines a row.
+ # in this case each element is an object containing 3 keys/values.
+ # the query cannot access values outside the phonenumbers array; it can access only values that appear on the _1.phonenumbers path.
+ query = "select cast(substring(_1.number,1,6) as int) *10 from s3object[*].phonenumbers where _1.type='home2';";
+ expected_result = 7349280
+
+
+BOTO3
+-----
+
+Using BOTO3 is "natural" and easy due to its AWS-CLI support.
+
+::
+
+    import boto3
+    import pprint
+    from botocore.exceptions import ClientError
+
+    # Connection parameters: fill these in for your environment.
+    endpoint = 'http://localhost:8000'
+    access_key = 'TESTER'
+    secret_key = 'test123'
+    region_name = ''
+    progress = False  # set to True to request progress messages
+
+    def run_s3select(bucket, key, query, column_delim=",", row_delim="\n",
+                     quot_char='"', esc_char='\\', csv_header_info="NONE"):
+
+        s3 = boto3.client('s3',
+                          endpoint_url=endpoint,
+                          aws_access_key_id=access_key,
+                          region_name=region_name,
+                          aws_secret_access_key=secret_key)
+
+        result = ""
+        try:
+            r = s3.select_object_content(
+                Bucket=bucket,
+                Key=key,
+                ExpressionType='SQL',
+                InputSerialization={"CSV": {"RecordDelimiter": row_delim,
+                                            "FieldDelimiter": column_delim,
+                                            "QuoteEscapeCharacter": esc_char,
+                                            "QuoteCharacter": quot_char,
+                                            "FileHeaderInfo": csv_header_info},
+                                    "CompressionType": "NONE"},
+                OutputSerialization={"CSV": {}},
+                Expression=query,
+                RequestProgress={"Enabled": progress})
+
+        except ClientError as c:
+            result += str(c)
+            return result
+
+        for event in r['Payload']:
+            if 'Records' in event:
+                records = event['Records']['Payload'].decode('utf-8')
+                result += records
+            if 'Progress' in event:
+                print("progress")
+                pprint.pprint(event['Progress'], width=1)
+            if 'Stats' in event:
+                print("Stats")
+                pprint.pprint(event['Stats'], width=1)
+            if 'End' in event:
+                print("End")
+                pprint.pprint(event['End'], width=1)
+
+        return result
+
+    run_s3select(
+        "my_bucket",
+        "my_csv_object",
+        "select int(_1) as a1, int(_2) as a2 , (a1+a2) as a3 from s3object where a3>100 and a3<300;")
+
+
+S3 SELECT Responses
+-------------------
+
+Error Response
+~~~~~~~~~~~~~~
+
+::
+
+ <?xml version="1.0" encoding="UTF-8"?>
+ <Error>
+ <Code>NoSuchKey</Code>
+ <Message>The resource you requested does not exist</Message>
+ <Resource>/mybucket/myfoto.jpg</Resource>
+ <RequestId>4442587FB7D0A2F9</RequestId>
+ </Error>
+
+Report Response
+~~~~~~~~~~~~~~~
+::
+
+ HTTP/1.1 200
+ <?xml version="1.0" encoding="UTF-8"?>
+ <Payload>
+ <Records>
+ <Payload>blob</Payload>
+ </Records>
+ <Stats>
+ <Details>
+ <BytesProcessed>long</BytesProcessed>
+ <BytesReturned>long</BytesReturned>
+ <BytesScanned>long</BytesScanned>
+ </Details>
+ </Stats>
+ <Progress>
+ <Details>
+ <BytesProcessed>long</BytesProcessed>
+ <BytesReturned>long</BytesReturned>
+ <BytesScanned>long</BytesScanned>
+ </Details>
+ </Progress>
+ <Cont>
+ </Cont>
+ <End>
+ </End>
+ </Payload>
+
+Response Description
+~~~~~~~~~~~~~~~~~~~~
+
+For Ceph S3 Select, responses can be messages of the following types:
+
+**Records message**: Can contain a single record, partial records, or multiple records. Depending on the size of the result, a response can contain one or more of these messages.
+
+**Error message**: Upon an error being detected, RGW returns 400 Bad Request, and a specific error message is sent back to the client, according to its type.
+
+**Continuation message**: Ceph S3 periodically sends this message to keep the TCP connection open.
+These messages may appear at any point in the response. The client must detect the message type and process it accordingly.
+
+**Progress message**: Ceph S3 periodically sends this message if requested. It contains information about the progress of a query that has started but has not yet been completed.
+
+**Stats message**: Ceph S3 sends this message at the end of the request. It contains statistics about the query.
+
+**End message**: Indicates that the request is complete, and no more messages will be sent. You should not assume that a request is complete until the client receives an ``End`` message.
diff --git a/doc/radosgw/session-tags.rst b/doc/radosgw/session-tags.rst
new file mode 100644
index 000000000..46722c382
--- /dev/null
+++ b/doc/radosgw/session-tags.rst
@@ -0,0 +1,427 @@
+=======================================================
+Session tags for Attribute Based Access Control in STS
+=======================================================
+
+Session tags are key-value pairs that can be passed while federating a user (currently
+they are supported only as part of the web token passed to AssumeRoleWithWebIdentity). The session
+tags are passed along as aws:PrincipalTag in the session credentials (temporary credentials)
+that are returned by STS. These principal tags consist of the session tags that come in
+as part of the web token and the tags that are attached to the role being assumed. Note
+that the tags always have to be specified in the following namespace: https://aws.amazon.com/tags.
+
+An example of the session tags that are passed in by the IDP in the web token is as follows:
+
+.. code-block:: python
+
+ {
+ "jti": "947960a3-7e91-4027-99f6-da719b0d4059",
+ "exp": 1627438044,
+ "nbf": 0,
+ "iat": 1627402044,
+ "iss": "http://localhost:8080/auth/realms/quickstart",
+ "aud": "app-profile-jsp",
+ "sub": "test",
+ "typ": "ID",
+ "azp": "app-profile-jsp",
+ "auth_time": 0,
+ "session_state": "3a46e3e7-d198-4a64-8b51-69682bcfc670",
+ "preferred_username": "test",
+ "email_verified": false,
+ "acr": "1",
+ "https://aws.amazon.com/tags": [
+ {
+ "principal_tags": {
+ "Department": [
+ "Engineering",
+ "Marketing"
+ ]
+ }
+ }
+ ],
+ "client_id": "app-profile-jsp",
+ "username": "test",
+ "active": true
+ }
+
+Steps to configure Keycloak to pass tags in the web token are described here:
+:ref:`radosgw_keycloak`.
+
+The trust policy must have 'sts:TagSession' permission if the web token passed
+in by the federated user contains session tags; otherwise the
+AssumeRoleWithWebIdentity action will fail. An example of a trust policy with
+sts:TagSession is as follows:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["sts:AssumeRoleWithWebIdentity","sts:TagSession"],
+ "Principal":{"Federated":["arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart"]},
+ "Condition":{"StringEquals":{"localhost:8080/auth/realms/quickstart:sub":"test"}}
+ }]
+ }
+
+Tag Keys
+========
+
+The following are the tag keys that can be used in the role's trust policy or the role's permission policy:
+
+1. aws:RequestTag: This key is used to compare the key-value pair passed in the request with the key-value pair
+in the role's trust policy. In the case of AssumeRoleWithWebIdentity, the session tags passed by the IDP
+in the web token can be used as aws:RequestTag in the role's trust policy, based on which a federated user can be
+allowed to assume a role.
+
+An example of a role trust policy that uses aws:RequestTag is as follows:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["sts:AssumeRoleWithWebIdentity","sts:TagSession"],
+ "Principal":{"Federated":["arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart"]},
+ "Condition":{"StringEquals":{"aws:RequestTag/Department":"Engineering"}}
+ }]
+ }
+
+2. aws:PrincipalTag: This key is used to compare the key-value pair attached to the principal with the key-value pair
+in the policy. In the case of AssumeRoleWithWebIdentity, the session tags passed by the IDP in the web token appear
+as principal tags in the temporary credentials once a user has been authenticated, and these tags can be used as
+aws:PrincipalTag in the role's permission policy.
+
+An example of a role permission policy that uses aws:PrincipalTag is as follows:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["s3:*"],
+ "Resource":["arn:aws:s3::t1tenant:my-test-bucket","arn:aws:s3::t1tenant:my-test-bucket/*],"+
+ "Condition":{"StringEquals":{"aws:PrincipalTag/Department":"Engineering"}}
+ }]
+ }
+
+3. iam:ResourceTag: This key is used to compare the key-value pair attached to the resource with the key-value pair
+in the policy. In the case of AssumeRoleWithWebIdentity, tags attached to the role can be compared with those in
+the trust policy to allow a user to assume a role.
+RGW now supports REST APIs for tagging, listing tags, and untagging actions on a role. More information related to
+role tagging can be found here: :doc:`role`.
+
+An example of a role's trust policy that uses iam:ResourceTag is as follows:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["sts:AssumeRoleWithWebIdentity","sts:TagSession"],
+ "Principal":{"Federated":["arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart"]},
+ "Condition":{"StringEquals":{"iam:ResourceTag/Department":"Engineering"}}
+ }]
+ }
+
+For the above to work, you need to attach the 'Department=Engineering' tag to the role.
+
+4. aws:TagKeys: This key is used to compare tags in the request with the tags in the policy. In the case of
+AssumeRoleWithWebIdentity, this can be used to check the tag keys in a role's trust policy before a user
+is allowed to assume a role.
+This can also be used in the role's permission policy.
+
+An example of a role's trust policy that uses aws:TagKeys is as follows:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["sts:AssumeRoleWithWebIdentity","sts:TagSession"],
+ "Principal":{"Federated":["arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart"]},
+ "Condition":{"ForAllValues:StringEquals":{"aws:TagKeys":["Department"]}}
+ }]
+ }
+
+'ForAllValues:StringEquals' tests whether the set of tag keys in the request is a subset of the tag keys in the policy. The
+condition above therefore restricts which tag keys may be passed in the request.
+
+5. s3:ResourceTag: This key is used to compare tags present on the s3 resource (bucket or object) with the tags in
+the role's permission policy.
+
+An example of a role's permission policy that uses s3:ResourceTag is as follows:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["s3:PutBucketTagging"],
+ "Resource":["arn:aws:s3::t1tenant:my-test-bucket\","arn:aws:s3::t1tenant:my-test-bucket/*"]
+ },
+ {
+ "Effect":"Allow",
+ "Action":["s3:*"],
+ "Resource":["*"],
+ "Condition":{"StringEquals":{"s3:ResourceTag/Department":\"Engineering"}}
+ }
+ }
+
+For the above to work, you need to attach the 'Department=Engineering' tag to the bucket (and to the object too) on which you want this policy
+to be applied.
+
+More examples of policies using tags
+====================================
+
+1. To assume a role by matching the tags in the incoming request with the tag attached to the role.
+aws:RequestTag is the incoming tag in the JWT (access token) and iam:ResourceTag is the tag attached to the role being assumed:
+
+.. code-block:: python
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["sts:AssumeRoleWithWebIdentity","sts:TagSession"],
+ "Principal":{"Federated":["arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart"]},
+ "Condition":{"StringEquals":{"aws:RequestTag/Department":"${iam:ResourceTag/Department}"}}
+ }]
+ }
+
+2. To evaluate a role's permission policy by matching principal tags with s3 resource tags.
+aws:PrincipalTag is the tag passed in along with the temporary credentials and s3:ResourceTag is the tag attached to
+the s3 resource (object/bucket):
+
+.. code-block:: python
+
+
+ {
+ "Version":"2012-10-17",
+ "Statement":[
+ {
+ "Effect":"Allow",
+ "Action":["s3:PutBucketTagging"],
+ "Resource":["arn:aws:s3::t1tenant:my-test-bucket\","arn:aws:s3::t1tenant:my-test-bucket/*"]
+ },
+ {
+ "Effect":"Allow",
+ "Action":["s3:*"],
+ "Resource":["*"],
+ "Condition":{"StringEquals":{"s3:ResourceTag/Department":"${aws:PrincipalTag/Department}"}}
+ }]
+ }
+
+Properties of Session Tags
+==========================
+
+1. Session tags can be multi-valued. (Multi-valued session tags are not supported in AWS.)
+2. A maximum of 50 session tags is allowed to be passed in by the IDP.
+3. The maximum allowed size of a key is 128 characters.
+4. The maximum allowed size of a value is 256 characters.
+5. Neither the tag key nor the value can start with "aws:".
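+
+A quick sketch of these limits as validation logic (our own illustration, not
+RGW code):
+
+.. code-block:: python
+
+    def validate_session_tags(tags):
+        """tags: dict mapping a tag key to a list of values (multi-valued allowed)."""
+        assert len(tags) <= 50, "at most 50 session tags"
+        for key, values in tags.items():
+            assert len(key) <= 128, "a key may have at most 128 characters"
+            assert not key.startswith("aws:"), 'keys must not start with "aws:"'
+            for value in values:
+                assert len(value) <= 256, "a value may have at most 256 characters"
+                assert not value.startswith("aws:"), 'values must not start with "aws:"'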
+
+s3 Resource Tags
+================
+
+As stated above, the 's3:ResourceTag' key can be used for authorizing an s3 operation in RGW (this is not allowed in AWS).
+
+s3:ResourceTag is a key used to refer to tags that have been attached to an object or a bucket. Tags can be attached to an object or
+a bucket using the REST APIs available for that purpose.
+
+The following table shows which s3 resource tag type (bucket or object tags) is used for authorizing a particular operation.
+
++-----------------------------------+-------------------+
+| Operation | Tag type |
++===================================+===================+
+| **GetObject** | Object tags |
+| **GetObjectTags** | |
+| **DeleteObjectTags** | |
+| **DeleteObject** | |
+| **PutACLs** | |
+| **InitMultipart** | |
+| **AbortMultipart** | |
+| **ListMultipart** | |
+| **GetAttrs** | |
+| **PutObjectRetention** | |
+| **GetObjectRetention** | |
+| **PutObjectLegalHold** | |
+| **GetObjectLegalHold** | |
++-----------------------------------+-------------------+
+| **PutObjectTags** | Bucket tags |
+| **GetBucketTags** | |
+| **PutBucketTags** | |
+| **DeleteBucketTags** | |
+| **GetBucketReplication** | |
+| **DeleteBucketReplication** | |
+| **GetBucketVersioning** | |
+| **SetBucketVersioning** | |
+| **GetBucketWebsite** | |
+| **SetBucketWebsite** | |
+| **DeleteBucketWebsite** | |
+| **StatBucket** | |
+| **ListBucket** | |
+| **GetBucketLogging** | |
+| **GetBucketLocation** | |
+| **DeleteBucket** | |
+| **GetLC** | |
+| **PutLC** | |
+| **DeleteLC** | |
+| **GetCORS** | |
+| **PutCORS** | |
+| **GetRequestPayment** | |
+| **SetRequestPayment** | |
+| **PutBucketPolicy** | |
+| **GetBucketPolicy** | |
+| **DeleteBucketPolicy** | |
+| **PutBucketObjectLock** | |
+| **GetBucketObjectLock** | |
+| **GetBucketPolicyStatus** | |
+| **PutBucketPublicAccessBlock** | |
+| **GetBucketPublicAccessBlock** | |
+| **DeleteBucketPublicAccessBlock** | |
++-----------------------------------+-------------------+
+| **GetACLs** | Bucket tags for |
+| **PutACLs** | bucket ACLs |
+| | Object tags for |
+| | object ACLs |
++-----------------------------------+-------------------+
+| **PutObject** | Object tags of |
+| **CopyObject** | source object |
+| | Bucket tags of |
+| | destination bucket|
++-----------------------------------+-------------------+
+
+
+Sample code demonstrating usage of session tags
+===============================================
+
+The following is sample code for tagging a role, a bucket, and an object in it, and for using tag keys in a role's
+trust policy and its permission policy, assuming that a tag 'Department=Engineering' is passed in by the IDP in the
+JWT (access token).
+
+.. code-block:: python
+
+ # -*- coding: utf-8 -*-
+
+    import boto3
+    import json
+    from botocore.exceptions import ClientError
+    from nose.tools import eq_ as eq
+
+ access_key = 'TESTER'
+ secret_key = 'test123'
+ endpoint = 'http://s3.us-east.localhost:8000'
+
+ s3client = boto3.client('s3',
+ aws_access_key_id = access_key,
+ aws_secret_access_key = secret_key,
+ endpoint_url = endpoint,
+ region_name='',)
+
+ s3res = boto3.resource('s3',
+ aws_access_key_id = access_key,
+ aws_secret_access_key = secret_key,
+ endpoint_url = endpoint,
+ region_name='',)
+
+ iam_client = boto3.client('iam',
+ aws_access_key_id=access_key,
+ aws_secret_access_key=secret_key,
+ endpoint_url=endpoint,
+ region_name=''
+ )
+
+ bucket_name = 'test-bucket'
+ s3bucket = s3client.create_bucket(Bucket=bucket_name)
+
+ bucket_tagging = s3res.BucketTagging(bucket_name)
+ Set_Tag = bucket_tagging.put(Tagging={'TagSet':[{'Key':'Department', 'Value': 'Engineering'}]})
+ try:
+ response = iam_client.create_open_id_connect_provider(
+ Url='http://localhost:8080/auth/realms/quickstart',
+ ClientIDList=[
+ 'app-profile-jsp',
+ 'app-jee-jsp'
+ ],
+ ThumbprintList=[
+ 'F7D7B3515DD0D319DD219A43A9EA727AD6065287'
+ ]
+ )
+ except ClientError as e:
+ print ("Provider already exists")
+
+ policy_document = "{\"Version\":\"2012-10-17\",\"Statement\":[{\"Effect\":\"Allow\",\"Principal\":{\"Federated\":[\"arn:aws:iam:::oidc-provider/localhost:8080/auth/realms/quickstart\"]},\"Action\":[\"sts:AssumeRoleWithWebIdentity\",\"sts:TagSession\"],\"Condition\":{\"StringEquals\":{\"aws:RequestTag/Department\":\"${iam:ResourceTag/Department}\"}}}]}"
+ role_response = ""
+
+ print ("\n Getting Role \n")
+
+ try:
+ role_response = iam_client.get_role(
+ RoleName='S3Access'
+ )
+ print (role_response)
+ except ClientError as e:
+        if e.response['Error']['Code'] == 'NoSuchEntity':
+ print ("\n Creating Role \n")
+ tags_list = [
+ {'Key':'Department','Value':'Engineering'},
+ ]
+ role_response = iam_client.create_role(
+ AssumeRolePolicyDocument=policy_document,
+ Path='/',
+ RoleName='S3Access',
+ Tags=tags_list,
+ )
+ print (role_response)
+ else:
+ print("Unexpected error: %s" % e)
+
+ role_policy = "{\"Version\":\"2012-10-17\",\"Statement\":{\"Effect\":\"Allow\",\"Action\":\"s3:*\",\"Resource\":\"arn:aws:s3:::*\",\"Condition\":{\"StringEquals\":{\"s3:ResourceTag/Department\":[\"${aws:PrincipalTag/Department}\"]}}}}"
+
+ response = iam_client.put_role_policy(
+ RoleName='S3Access',
+ PolicyName='Policy1',
+ PolicyDocument=role_policy
+ )
+
+ sts_client = boto3.client('sts',
+ aws_access_key_id='abc',
+ aws_secret_access_key='def',
+ endpoint_url = endpoint,
+ region_name = '',
+ )
+
+
+ print ("\n Assuming Role with Web Identity\n")
+ response = sts_client.assume_role_with_web_identity(
+ RoleArn=role_response['Role']['Arn'],
+ RoleSessionName='Bob',
+ DurationSeconds=900,
+ WebIdentityToken='<web-token>')
+
+ s3client2 = boto3.client('s3',
+ aws_access_key_id = response['Credentials']['AccessKeyId'],
+ aws_secret_access_key = response['Credentials']['SecretAccessKey'],
+ aws_session_token = response['Credentials']['SessionToken'],
+ endpoint_url='http://s3.us-east.localhost:8000',
+ region_name='',)
+
+ bucket_body = 'this is a test file'
+ tags = 'Department=Engineering'
+ key = "test-1.txt"
+ s3_put_obj = s3client2.put_object(Body=bucket_body, Bucket=bucket_name, Key=key, Tagging=tags)
+ eq(s3_put_obj['ResponseMetadata']['HTTPStatusCode'],200)
+
+ s3_get_obj = s3client2.get_object(Bucket=bucket_name, Key=key)
+ eq(s3_get_obj['ResponseMetadata']['HTTPStatusCode'],200)
diff --git a/doc/radosgw/swift.rst b/doc/radosgw/swift.rst
new file mode 100644
index 000000000..24abd7728
--- /dev/null
+++ b/doc/radosgw/swift.rst
@@ -0,0 +1,79 @@
+.. _radosgw swift:
+
+===============================
+ Ceph Object Gateway Swift API
+===============================
+
+Ceph supports a RESTful API that is compatible with the basic data access model of the `Swift API`_.
+
+API
+---
+
+.. toctree::
+ :maxdepth: 1
+
+ Authentication <swift/auth>
+ Service Ops <swift/serviceops>
+ Container Ops <swift/containerops>
+ Object Ops <swift/objectops>
+ Temp URL Ops <swift/tempurl>
+ Tutorial <swift/tutorial>
+ Java <swift/java>
+ Python <swift/python>
+ Ruby <swift/ruby>
+
+
+Features Support
+----------------
+
+The following table describes the support status for current Swift functional features:
+
++---------------------------------+-----------------+----------------------------------------+
+| Feature | Status | Remarks |
++=================================+=================+========================================+
+| **Authentication** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Account Metadata** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Swift ACLs** | Supported | Supports a subset of Swift ACLs |
++---------------------------------+-----------------+----------------------------------------+
+| **List Containers** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Delete Container** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Create Container** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Container Metadata** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Update Container Metadata** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Delete Container Metadata** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **List Objects** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Static Website** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Create Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Create Large Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Delete Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Copy Object** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Get Object Metadata** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Update Object Metadata** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Expiring Objects** | Supported | |
++---------------------------------+-----------------+----------------------------------------+
+| **Temporary URLs** | Partial Support | No support for container-level keys |
++---------------------------------+-----------------+----------------------------------------+
+| **Object Versioning** | Partial Support | No support for ``X-History-Location`` |
++---------------------------------+-----------------+----------------------------------------+
+| **CORS** | Not Supported | |
++---------------------------------+-----------------+----------------------------------------+
+
+.. _Swift API: https://developer.openstack.org/api-ref/object-store/index.html
diff --git a/doc/radosgw/swift/auth.rst b/doc/radosgw/swift/auth.rst
new file mode 100644
index 000000000..12d6b23ff
--- /dev/null
+++ b/doc/radosgw/swift/auth.rst
@@ -0,0 +1,82 @@
+================
+ Authentication
+================
+
+Swift API requests that require authentication must contain an
+``X-Storage-Token`` authentication token in the request header.
+The token may be retrieved from RADOS Gateway, or from another authenticator.
+To obtain a token from RADOS Gateway, you must create a user. For example::
+
+ sudo radosgw-admin user create --subuser="{username}:{subusername}" --uid="{username}"
+ --display-name="{Display Name}" --key-type=swift --secret="{password}" --access=full
+
+For details on RADOS Gateway administration, see `radosgw-admin`_.
+
+.. _radosgw-admin: ../../../man/8/radosgw-admin/
+
+.. note::
+   For those familiar with the Swift API: this implements the Swift auth v1.0 API. As
+   such, ``{username}`` above is generally equivalent to a Swift `account` and
+   ``{subusername}`` is a user under that account.
+
+Auth Get
+--------
+
+To authenticate a user, make a request containing an ``X-Auth-User`` and a
+``X-Auth-Key`` in the header.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /auth HTTP/1.1
+ Host: swift.radosgwhost.com
+ X-Auth-User: johndoe
+ X-Auth-Key: R7UUOLFDI2ZI9PRCQ53K
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Auth-User``
+
+:Description: The RADOS GW username to authenticate.
+:Type: String
+:Required: Yes
+
+``X-Auth-Key``
+
+:Description: The key associated with the RADOS GW username.
+:Type: String
+:Required: Yes
+
+
+Response Headers
+~~~~~~~~~~~~~~~~
+
+The response from the server should include an ``X-Auth-Token`` value. The
+response may also contain an ``X-Storage-Url`` value that provides the
+``{api version}/{account}`` prefix that is specified in other requests
+throughout the API documentation.
+
+
+``X-Storage-Token``
+
+:Description: The authorization token for the ``X-Auth-User`` specified in the request.
+:Type: String
+
+
+``X-Storage-Url``
+
+:Description: The URL and ``{api version}/{account}`` path for the user.
+:Type: String
+
+A typical response looks like this::
+
+ HTTP/1.1 204 No Content
+ Date: Mon, 16 Jul 2012 11:05:33 GMT
+ Server: swift
+ X-Storage-Url: https://swift.radosgwhost.com/v1/ACCT-12345
+ X-Auth-Token: UOlCCC8TahFKlWuv9DB09TWHF0nDjpPElha0kAa
+ Content-Length: 0
+ Content-Type: text/plain; charset=UTF-8
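+
+As a sketch, the same exchange can be driven from Python with the third-party
+``requests`` library (the hostname and credentials below reuse the values from
+the example above):
+
+.. code-block:: python
+
+    import requests
+
+    resp = requests.get(
+        'https://swift.radosgwhost.com/auth',
+        headers={
+            'X-Auth-User': 'johndoe',
+            'X-Auth-Key': 'R7UUOLFDI2ZI9PRCQ53K',
+        },
+    )
+    resp.raise_for_status()
+
+    # The token and storage URL are returned as response headers.
+    token = resp.headers['X-Auth-Token']
+    storage_url = resp.headers['X-Storage-Url']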
diff --git a/doc/radosgw/swift/containerops.rst b/doc/radosgw/swift/containerops.rst
new file mode 100644
index 000000000..434b90ef5
--- /dev/null
+++ b/doc/radosgw/swift/containerops.rst
@@ -0,0 +1,341 @@
+======================
+ Container Operations
+======================
+
+A container is a mechanism for storing data objects. An account may
+have many containers, but container names must be unique. This API enables a
+client to create a container, set access controls and metadata,
+retrieve a container's contents, and delete a container. Since this API
+makes requests related to information in a particular user's account, all
+requests in this API must be authenticated unless a container's access control
+is deliberately made publicly accessible (i.e., allows anonymous requests).
+
+.. note:: The Amazon S3 API uses the term 'bucket' to describe a data container.
+ When you hear someone refer to a 'bucket' within the Swift API, the term
+ 'bucket' may be construed as the equivalent of the term 'container.'
+
+One facet of object storage is that it does not support hierarchical paths
+or directories. Instead, it supports one level consisting of one or more
+containers, where each container may have objects. The RADOS Gateway's
+Swift-compatible API supports the notion of 'pseudo-hierarchical containers,'
+which is a means of using object naming to emulate a container (or directory)
+hierarchy without actually implementing one in the storage system. You may
+name objects with pseudo-hierarchical names
+(e.g., photos/buildings/empire-state.jpg), but container names cannot
+contain a forward slash (``/``) character.
+
+
+Create a Container
+==================
+
+To create a new container, make a ``PUT`` request with the API version, account,
+and the name of the new container. The container name must be unique, must not
+contain a forward-slash (/) character, and should be less than 256 bytes. You
+may include access control headers and metadata headers in the request. The
+operation is idempotent; that is, if you make a request to create a container
+that already exists, it will return an HTTP 202 status code, but will not
+create another container.
+
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{api version}/{account}/{container} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+ X-Container-Read: {comma-separated-uids}
+ X-Container-Write: {comma-separated-uids}
+ X-Container-Meta-{key}: {value}
+
+
+Headers
+~~~~~~~
+
+``X-Container-Read``
+
+:Description: The user IDs with read permissions for the container.
+:Type: Comma-separated string values of user IDs.
+:Required: No
+
+``X-Container-Write``
+
+:Description: The user IDs with write permissions for the container.
+:Type: Comma-separated string values of user IDs.
+:Required: No
+
+``X-Container-Meta-{key}``
+
+:Description: A user-defined meta data key that takes an arbitrary string value.
+:Type: String
+:Required: No
+
+
+HTTP Response
+~~~~~~~~~~~~~
+
+If a container with the same name already exists and the user is the
+container owner, then the operation will succeed. Otherwise, the operation
+will fail.
+
+``409``
+
+:Description: The container already exists under a different user's ownership.
+:Status Code: ``BucketAlreadyExists``
+
+
+
+
+List a Container's Objects
+==========================
+
+To list the objects within a container, make a ``GET`` request with the
+API version, account, and the name of the container. You can specify query
+parameters to filter the full list, or leave out the parameters to return a list
+of the first 10,000 object names stored in the container.
+
+
+Syntax
+~~~~~~
+
+::
+
+    GET /{api version}/{account}/{container} HTTP/1.1
+    Host: {fqdn}
+    X-Auth-Token: {auth-token}
+
+
+Parameters
+~~~~~~~~~~
+
+``format``
+
+:Description: Defines the format of the result.
+:Type: String
+:Valid Values: ``json`` | ``xml``
+:Required: No
+
+``prefix``
+
+:Description: Limits the result set to objects beginning with the specified prefix.
+:Type: String
+:Required: No
+
+``marker``
+
+:Description: Returns a list of results greater than the marker value.
+:Type: String
+:Required: No
+
+``limit``
+
+:Description: Limits the number of results to the specified value.
+:Type: Integer
+:Valid Range: 0 - 10,000
+:Required: No
+
+``delimiter``
+
+:Description: The delimiter between the prefix and the rest of the object name.
+:Type: String
+:Required: No
+
+``path``
+
+:Description: The pseudo-hierarchical path of the objects.
+:Type: String
+:Required: No
+
+``allow_unordered``
+
+:Description: Allows the results to be returned unordered to reduce computation overhead. Cannot be used with ``delimiter``.
+:Type: Boolean
+:Required: No
+:Non-Standard Extension: Yes
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+``container``
+
+:Description: The container.
+:Type: Container
+
+``object``
+
+:Description: An object within the container.
+:Type: Container
+
+``name``
+
+:Description: The name of an object within the container.
+:Type: String
+
+``hash``
+
+:Description: A hash code of the object's contents.
+:Type: String
+
+``last_modified``
+
+:Description: The last time the object's contents were modified.
+:Type: Date
+
+``content_type``
+
+:Description: The type of content within the object.
+:Type: String
+
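+As a sketch of how these parameters map onto a client call, the following uses
+the third-party ``python-swiftclient`` library (the connection details are
+placeholders; see the Python examples for authentication):
+
+.. code-block:: python
+
+    import swiftclient
+
+    conn = swiftclient.Connection(user='account_name:username',
+                                  key='your_api_key',
+                                  authurl='https://radosgw.endpoint/auth/1.0')
+
+    # Return at most 100 objects whose names begin with the given prefix.
+    headers, objects = conn.get_container('my-new-container',
+                                          prefix='photos/buildings/',
+                                          limit=100)
+    for obj in objects:
+        print(obj['name'], obj['hash'], obj['last_modified'])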
+
+
+Update a Container's ACLs
+=========================
+
+When a user creates a container, the user has read and write access to the
+container by default. To allow other users to read a container's contents or
+write to a container, you must specifically enable the user.
+You may also specify ``*`` in the ``X-Container-Read`` or ``X-Container-Write``
+settings, which effectively enables all users to either read from or write
+to the container. Setting ``*`` makes the container public. That is, it
+enables anonymous users to either read from or write to the container.
+
+.. note:: If you are planning to expose public read ACL functionality
+ for the Swift API, it is strongly recommended to include the
+ Swift account name in the endpoint definition, so as to most
+ closely emulate the behavior of native OpenStack Swift. To
+ do so, set the ``ceph.conf`` configuration option ``rgw
+ swift account in url = true``, and update your Keystone
+ endpoint to the URL suffix ``/v1/AUTH_%(tenant_id)s``
+ (instead of just ``/v1``).
+
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{api version}/{account}/{container} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+ X-Container-Read: *
+ X-Container-Write: {uid1}, {uid2}, {uid3}
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Container-Read``
+
+:Description: The user IDs with read permissions for the container.
+:Type: Comma-separated string values of user IDs.
+:Required: No
+
+``X-Container-Write``
+
+:Description: The user IDs with write permissions for the container.
+:Type: Comma-separated string values of user IDs.
+:Required: No
+
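+For example, with an authenticated ``python-swiftclient`` connection ``conn``
+(created as in the Python examples), the same headers can be set as follows;
+the user IDs are placeholders:
+
+.. code-block:: python
+
+    # Make the container world-readable and grant write access to two users.
+    conn.post_container('my-new-container', headers={
+        'X-Container-Read': '*',
+        'X-Container-Write': 'uid1, uid2',
+    })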
+
+Add/Update Container Metadata
+=============================
+
+To add metadata to a container, make a ``POST`` request with the API version,
+account, and container name. You must have write permissions on the
+container to add or update metadata.
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{api version}/{account}/{container} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+ X-Container-Meta-Color: red
+ X-Container-Meta-Taste: salty
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Container-Meta-{key}``
+
+:Description: A user-defined meta data key that takes an arbitrary string value.
+:Type: String
+:Required: No
+
+
+Enable Object Versioning for a Container
+========================================
+
+To enable object versioning for a container, make a ``POST`` request with
+the API version, account, and container name. You must have write
+permissions on the container to add or update metadata.
+
+.. note:: Object versioning support is not enabled in radosgw by
+ default; you must set ``rgw swift versioning enabled =
+ true`` in ``ceph.conf`` to enable this feature.
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{api version}/{account}/{container} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+ X-Versions-Location: {archive-container}
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Versions-Location``
+
+:Description: The name of a container (the "archive container") that
+ will be used to store versions of the objects in the
+ container that the ``POST`` request is made on (the
+ "current container"). The archive container need not
+ exist at the time it is being referenced, but once
+ ``X-Versions-Location`` is set on the current container,
+ and object versioning is thus enabled, the archive
+ container must exist before any further objects are
+ updated or deleted in the current container.
+
+ .. note:: ``X-Versions-Location`` is the only
+ versioning-related header that radosgw
+ interprets. ``X-History-Location``, supported
+ by native OpenStack Swift, is currently not
+ supported by radosgw.
+:Type: String
+:Required: No (if this header is passed with an empty value, object
+ versioning on the current container is disabled, but the
+ archive container continues to exist.)
+
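+As a sketch with an authenticated ``python-swiftclient`` connection ``conn``,
+assuming an archive container named ``archive``:
+
+.. code-block:: python
+
+    # Old versions of overwritten or deleted objects will be kept in
+    # 'archive'; that container must exist before objects are updated
+    # or deleted in the current container.
+    conn.post_container('my-new-container',
+                        headers={'X-Versions-Location': 'archive'})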
+
+Delete a Container
+==================
+
+To delete a container, make a ``DELETE`` request with the API version, account,
+and the name of the container. The container must be empty. If you'd like to check
+if the container is empty, execute a ``HEAD`` request against the container. Once
+you have successfully removed the container, you will be able to reuse the container name.
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{api version}/{account}/{container} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+HTTP Response
+~~~~~~~~~~~~~
+
+``204``
+
+:Description: The container was removed.
+:Status Code: ``NoContent``
+
diff --git a/doc/radosgw/swift/java.rst b/doc/radosgw/swift/java.rst
new file mode 100644
index 000000000..8977a3b16
--- /dev/null
+++ b/doc/radosgw/swift/java.rst
@@ -0,0 +1,175 @@
+.. _java_swift:
+
+=====================
+ Java Swift Examples
+=====================
+
+Setup
+=====
+
+The following examples may require some or all of the following Java
+classes to be imported:
+
+.. code-block:: java
+
+ import org.javaswift.joss.client.factory.AccountConfig;
+ import org.javaswift.joss.client.factory.AccountFactory;
+ import org.javaswift.joss.client.factory.AuthenticationMethod;
+ import org.javaswift.joss.model.Account;
+ import org.javaswift.joss.model.Container;
+ import org.javaswift.joss.model.StoredObject;
+ import java.io.File;
+ import java.io.IOException;
+ import java.util.*;
+
+
+Create a Connection
+===================
+
+This creates a connection so that you can interact with the server:
+
+.. code-block:: java
+
+ String username = "USERNAME";
+ String password = "PASSWORD";
+ String authUrl = "https://radosgw.endpoint/auth/1.0";
+
+ AccountConfig config = new AccountConfig();
+ config.setUsername(username);
+ config.setPassword(password);
+ config.setAuthUrl(authUrl);
+ config.setAuthenticationMethod(AuthenticationMethod.BASIC);
+ Account account = new AccountFactory(config).createAccount();
+
+
+Create a Container
+==================
+
+This creates a new container called ``my-new-container``:
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ container.create();
+
+
+Create an Object
+================
+
+This creates an object ``foo.txt`` from the file named ``foo.txt`` in
+the container ``my-new-container``:
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ StoredObject object = container.getObject("foo.txt");
+ object.uploadObject(new File("foo.txt"));
+
+
+Add/Update Object Metadata
+==========================
+
+This adds the metadata key-value pair ``key``:``value`` to the object named
+``foo.txt`` in the container ``my-new-container``:
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ StoredObject object = container.getObject("foo.txt");
+ Map<String, Object> metadata = new TreeMap<String, Object>();
+ metadata.put("key", "value");
+ object.setMetadata(metadata);
+
+
+List Owned Containers
+=====================
+
+This gets a list of Containers that you own, and prints out each
+container's name:
+
+.. code-block:: java
+
+ Collection<Container> containers = account.list();
+ for (Container currentContainer : containers) {
+ System.out.println(currentContainer.getName());
+ }
+
+The output will look something like this::
+
+ mahbuckat1
+ mahbuckat2
+ mahbuckat3
+
+
+List a Container's Content
+==========================
+
+This gets a list of objects in the container ``my-new-container``, and
+prints out each object's name:
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ Collection<StoredObject> objects = container.list();
+ for (StoredObject currentObject : objects) {
+ System.out.println(currentObject.getName());
+ }
+
+The output will look something like this::
+
+ myphoto1.jpg
+ myphoto2.jpg
+
+
+Retrieve an Object's Metadata
+=============================
+
+This retrieves metadata and gets the MIME type for an object named ``foo.txt``
+in a container named ``my-new-container``:
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ StoredObject object = container.getObject("foo.txt");
+ Map<String, Object> returnedMetadata = object.getMetadata();
+ for (String name : returnedMetadata.keySet()) {
+ System.out.println("META / "+name+": "+returnedMetadata.get(name));
+ }
+
+
+Retrieve an Object
+==================
+
+This downloads the object ``foo.txt`` in the container ``my-new-container``
+and saves it in ``./outfile.txt``:
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ StoredObject object = container.getObject("foo.txt");
+ object.downloadObject(new File("outfile.txt"));
+
+
+Delete an Object
+================
+
+This deletes the object ``goodbye.txt`` in the container "my-new-container":
+
+.. code-block:: java
+
+    Container container = account.getContainer("my-new-container");
+    StoredObject object = container.getObject("goodbye.txt");
+    object.delete();
+
+
+Delete a Container
+==================
+
+This deletes a container named "my-new-container":
+
+.. code-block:: java
+
+ Container container = account.getContainer("my-new-container");
+ container.delete();
+
+.. note:: The container must be empty! Otherwise it won't work!
diff --git a/doc/radosgw/swift/objectops.rst b/doc/radosgw/swift/objectops.rst
new file mode 100644
index 000000000..fc8d21967
--- /dev/null
+++ b/doc/radosgw/swift/objectops.rst
@@ -0,0 +1,271 @@
+===================
+ Object Operations
+===================
+
+An object is a container for storing data and metadata. A container may
+have many objects, but the object names must be unique. This API enables a
+client to create an object, set access controls and metadata, retrieve an
+object's data and metadata, and delete an object. Since this API makes requests
+related to information in a particular user's account, all requests in this API
+must be authenticated unless the container or object's access control is
+deliberately made publicly accessible (i.e., allows anonymous requests).
+
+
+Create/Update an Object
+=======================
+
+To create a new object, make a ``PUT`` request with the API version, account,
+container name and the name of the new object. You must have write permission
+on the container to create or update an object. The object name must be
+unique within the container. The ``PUT`` request is not idempotent, so if you
+do not use a unique name, the request will update the object. However, you may
+use pseudo-hierarchical syntax in your object name to distinguish it from
+another object of the same name if it is under a different pseudo-hierarchical
+directory. You may include access control headers and metadata headers in the
+request.
+
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{api version}/{account}/{container}/{object} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``ETag``
+
+:Description: An MD5 hash of the object's contents. Recommended.
+:Type: String
+:Required: No
+
+
+``Content-Type``
+
+:Description: The type of content the object contains.
+:Type: String
+:Required: No
+
+
+``Transfer-Encoding``
+
+:Description: Indicates whether the object is part of a larger aggregate object.
+:Type: String
+:Valid Values: ``chunked``
+:Required: No
+
+
+Copy an Object
+==============
+
+Copying an object allows you to make a server-side copy of an object, so that
+you don't have to download it and upload it under another container/name.
+To copy the contents of one object to another object, you may make either a
+``PUT`` request or a ``COPY`` request with the API version, account, and the
+container name. For a ``PUT`` request, use the destination container and object
+name in the request, and the source container and object in the request header.
+For a ``COPY`` request, use the source container and object in the request, and
+the destination container and object in the request header. You must have write
+permission on the container to copy an object. The destination object name must be
+unique within the container. The request is not idempotent, so if you do not use
+a unique name, the request will update the destination object. However, you may
+use pseudo-hierarchical syntax in your object name to distinguish the destination
+object from the source object of the same name if it is under a different
+pseudo-hierarchical directory. You may include access control headers and metadata
+headers in the request.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{api version}/{account}/{dest-container}/{dest-object} HTTP/1.1
+ X-Copy-From: {source-container}/{source-object}
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+or alternatively:
+
+::
+
+ COPY /{api version}/{account}/{source-container}/{source-object} HTTP/1.1
+ Destination: {dest-container}/{dest-object}
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Copy-From``
+
+:Description: Used with a ``PUT`` request to define the source container/object path.
+:Type: String
+:Required: Yes, if using ``PUT``
+
+
+``Destination``
+
+:Description: Used with a ``COPY`` request to define the destination container/object path.
+:Type: String
+:Required: Yes, if using ``COPY``
+
+
+``If-Modified-Since``
+
+:Description: Only copies if modified since the date/time of the source object's ``last_modified`` attribute.
+:Type: Date
+:Required: No
+
+
+``If-Unmodified-Since``
+
+:Description: Only copies if not modified since the date/time of the source object's ``last_modified`` attribute.
+:Type: Date
+:Required: No
+
+``Copy-If-Match``
+
+:Description: Copies only if the ETag in the request matches the source object's ETag.
+:Type: ETag.
+:Required: No
+
+
+``Copy-If-None-Match``
+
+:Description: Copies only if the ETag in the request does not match the source object's ETag.
+:Type: ETag.
+:Required: No
+
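+Recent versions of the third-party ``python-swiftclient`` library expose a
+server-side copy helper; a sketch, assuming an authenticated connection
+``conn`` and placeholder container and object names:
+
+.. code-block:: python
+
+    # Issues a COPY request; the destination is given as /container/object.
+    conn.copy_object('my-new-container', 'photos/empire-state.jpg',
+                     destination='/backups/photos/empire-state.jpg')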
+
+Delete an Object
+================
+
+To delete an object, make a ``DELETE`` request with the API version, account,
+container and object name. You must have write permissions on the container to delete
+an object within it. Once you have successfully deleted the object, you will be able to
+reuse the object name.
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{api version}/{account}/{container}/{object} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+Get an Object
+=============
+
+To retrieve an object, make a ``GET`` request with the API version, account,
+container and object name. You must have read permissions on the container to
+retrieve an object within it.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{api version}/{account}/{container}/{object} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``range``
+
+:Description: To retrieve a subset of an object's contents, you may specify a byte range.
+:Type: String
+:Required: No
+
+
+``If-Modified-Since``
+
+:Description: Gets only if modified since the date/time of the object's ``last_modified`` attribute.
+:Type: Date
+:Required: No
+
+
+``If-Unmodified-Since``
+
+:Description: Gets only if not modified since the date/time of the object's ``last_modified`` attribute.
+:Type: Date
+:Required: No
+
+``Copy-If-Match``
+
+:Description: Gets only if the ETag in the request matches the object's ETag.
+:Type: ETag.
+:Required: No
+
+
+``Copy-If-None-Match``
+
+:Description: Gets only if the ETag in the request does not match the object's ETag.
+:Type: ETag.
+:Required: No
+
+
+
+Response Headers
+~~~~~~~~~~~~~~~~
+
+``Content-Range``
+
+:Description: The range of the subset of object contents. Returned only if the range header field was specified in the request.
+
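+For instance, a ranged read with ``python-swiftclient`` (authenticated
+connection ``conn`` assumed) passes the ``Range`` header through and surfaces
+``Content-Range`` in the response headers:
+
+.. code-block:: python
+
+    # Fetch only the first 100 bytes of the object.
+    headers, body = conn.get_object('my-new-container', 'foo.txt',
+                                    headers={'Range': 'bytes=0-99'})
+    print(headers.get('content-range'), len(body))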
+
+Get Object Metadata
+===================
+
+To retrieve an object's metadata, make a ``HEAD`` request with the API version,
+account, container and object name. You must have read permissions on the
+container to retrieve metadata from an object within the container. This request
+returns the same header information as the request for the object itself, but
+it does not return the object's data.
+
+Syntax
+~~~~~~
+
+::
+
+ HEAD /{api version}/{account}/{container}/{object} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+
+Add/Update Object Metadata
+==========================
+
+To add metadata to an object, make a ``POST`` request with the API version,
+account, container and object name. You must have write permissions on the
+parent container to add or update metadata.
+
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{api version}/{account}/{container}/{object} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Object-Meta-{key}``
+
+:Description: A user-defined meta data key that takes an arbitrary string value.
+:Type: String
+:Required: No
+
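+With an authenticated ``python-swiftclient`` connection ``conn``, a sketch of
+setting such a header on an existing object:
+
+.. code-block:: python
+
+    # Attach a user-defined metadata key to the object.
+    conn.post_object('my-new-container', 'foo.txt',
+                     headers={'X-Object-Meta-Color': 'red'})
+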
diff --git a/doc/radosgw/swift/python.rst b/doc/radosgw/swift/python.rst
new file mode 100644
index 000000000..0b1f8d0da
--- /dev/null
+++ b/doc/radosgw/swift/python.rst
@@ -0,0 +1,114 @@
+.. _python_swift:
+
+=====================
+Python Swift Examples
+=====================
+
+Create a Connection
+===================
+
+This creates a connection so that you can interact with the server:
+
+.. code-block:: python
+
+ import swiftclient
+ user = 'account_name:username'
+ key = 'your_api_key'
+
+ conn = swiftclient.Connection(
+ user=user,
+ key=key,
+ authurl='https://objects.dreamhost.com/auth',
+ )
+
+
+Create a Container
+==================
+
+This creates a new container called ``my-new-container``:
+
+.. code-block:: python
+
+ container_name = 'my-new-container'
+ conn.put_container(container_name)
+
+
+Create an Object
+================
+
+This creates a file ``hello.txt`` from the file named ``my_hello.txt``:
+
+.. code-block:: python
+
+    with open('my_hello.txt', 'r') as hello_file:
+        conn.put_object(container_name, 'hello.txt',
+                        contents=hello_file.read(),
+                        content_type='text/plain')
+
+
+List Owned Containers
+=====================
+
+This gets a list of containers that you own, and prints out the container name:
+
+.. code-block:: python
+
+ for container in conn.get_account()[1]:
+ print(container['name'])
+
+The output will look something like this::
+
+ mahbuckat1
+ mahbuckat2
+ mahbuckat3
+
+List a Container's Content
+==========================
+
+This gets a list of objects in the container, and prints out each
+object's name, the file size, and last modified date:
+
+.. code-block:: python
+
+ for data in conn.get_container(container_name)[1]:
+ print('{0}\t{1}\t{2}'.format(data['name'], data['bytes'], data['last_modified']))
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+Retrieve an Object
+==================
+
+This downloads the object ``hello.txt`` and saves it in
+``./my_hello.txt``:
+
+.. code-block:: python
+
+ obj_tuple = conn.get_object(container_name, 'hello.txt')
+ with open('my_hello.txt', 'w') as my_hello:
+ my_hello.write(obj_tuple[1])
+
+
+Delete an Object
+================
+
+This deletes the object ``hello.txt``:
+
+.. code-block:: python
+
+ conn.delete_object(container_name, 'hello.txt')
+
+Delete a Container
+==================
+
+.. note::
+
+ The container must be empty! Otherwise the request won't work!
+
+.. code-block:: python
+
+ conn.delete_container(container_name)
+
diff --git a/doc/radosgw/swift/ruby.rst b/doc/radosgw/swift/ruby.rst
new file mode 100644
index 000000000..a20b66d88
--- /dev/null
+++ b/doc/radosgw/swift/ruby.rst
@@ -0,0 +1,119 @@
+.. _ruby_swift:
+
+=====================
+ Ruby Swift Examples
+=====================
+
+Create a Connection
+===================
+
+This creates a connection so that you can interact with the server:
+
+.. code-block:: ruby
+
+ require 'cloudfiles'
+ username = 'account_name:user_name'
+ api_key = 'your_secret_key'
+
+ conn = CloudFiles::Connection.new(
+ :username => username,
+ :api_key => api_key,
+ :auth_url => 'http://objects.dreamhost.com/auth'
+ )
+
+
+Create a Container
+==================
+
+This creates a new container called ``my-new-container``:
+
+.. code-block:: ruby
+
+ container = conn.create_container('my-new-container')
+
+
+Create an Object
+================
+
+This creates a file ``hello.txt`` from the file named ``my_hello.txt``:
+
+.. code-block:: ruby
+
+ obj = container.create_object('hello.txt')
+ obj.load_from_filename('./my_hello.txt')
+ obj.content_type = 'text/plain'
+
+
+
+List Owned Containers
+=====================
+
+This gets a list of Containers that you own, and also prints out
+the container name:
+
+.. code-block:: ruby
+
+ conn.containers.each do |container|
+ puts container
+ end
+
+The output will look something like this::
+
+ mahbuckat1
+ mahbuckat2
+ mahbuckat3
+
+
+List a Container's Contents
+===========================
+
+This gets a list of objects in the container, and prints out each
+object's name, the file size, and last modified date:
+
+.. code-block:: ruby
+
+ require 'date' # not necessary in the next version
+
+ container.objects_detail.each do |name, data|
+ puts "#{name}\t#{data[:bytes]}\t#{data[:last_modified]}"
+ end
+
+The output will look something like this::
+
+ myphoto1.jpg 251262 2011-08-08T21:35:48.000Z
+ myphoto2.jpg 262518 2011-08-08T21:38:01.000Z
+
+
+
+Retrieve an Object
+==================
+
+This downloads the object ``hello.txt`` and saves it in
+``./my_hello.txt``:
+
+.. code-block:: ruby
+
+ obj = container.object('hello.txt')
+ obj.save_to_filename('./my_hello.txt')
+
+
+Delete an Object
+================
+
+This deletes the object ``goodbye.txt``:
+
+.. code-block:: ruby
+
+ container.delete_object('goodbye.txt')
+
+
+Delete a Container
+==================
+
+.. note::
+
+ The container must be empty! Otherwise the request won't work!
+
+.. code-block:: ruby
+
+    conn.delete_container('my-new-container')
diff --git a/doc/radosgw/swift/serviceops.rst b/doc/radosgw/swift/serviceops.rst
new file mode 100644
index 000000000..a00f3d807
--- /dev/null
+++ b/doc/radosgw/swift/serviceops.rst
@@ -0,0 +1,76 @@
+====================
+ Service Operations
+====================
+
+To retrieve data about the Swift-compatible service, you may execute ``GET``
+requests using the ``X-Storage-Url`` value retrieved during authentication.
+
+List Containers
+===============
+
+A ``GET`` request that specifies the API version and the account will return
+a list of containers for a particular user account. Since the request returns
+a particular user's containers, the request requires an authentication token.
+The request cannot be made anonymously.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{api version}/{account} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``limit``
+
+:Description: Limits the number of results to the specified value.
+:Type: Integer
+:Required: No
+
+``format``
+
+:Description: Defines the format of the result.
+:Type: String
+:Valid Values: ``json`` | ``xml``
+:Required: No
+
+
+``marker``
+
+:Description: Returns a list of results greater than the marker value.
+:Type: String
+:Required: No
+
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+The response contains a list of containers, or returns with an HTTP
+204 response code.
+
+``account``
+
+:Description: A list for account information.
+:Type: Container
+
+``container``
+
+:Description: The list of containers.
+:Type: Container
+
+``name``
+
+:Description: The name of a container.
+:Type: String
+
+``bytes``
+
+:Description: The size of the container.
+:Type: Integer
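+
+As a sketch, the same listing with ``limit`` and ``marker`` applied through
+the third-party ``python-swiftclient`` library (authenticated connection
+``conn`` assumed):
+
+.. code-block:: python
+
+    # List up to 5 containers, starting after 'mahbuckat1'.
+    headers, containers = conn.get_account(limit=5, marker='mahbuckat1')
+    for container in containers:
+        print(container['name'], container['bytes'])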
diff --git a/doc/radosgw/swift/tempurl.rst b/doc/radosgw/swift/tempurl.rst
new file mode 100644
index 000000000..41dbb0ccb
--- /dev/null
+++ b/doc/radosgw/swift/tempurl.rst
@@ -0,0 +1,102 @@
+====================
+ Temp URL Operations
+====================
+
+To allow temporary access (for example, via ``GET`` requests) to objects
+without the need to share credentials, the Swift endpoint of radosgw
+supports temp URL functionality. To use this functionality, initially the
+value of ``X-Account-Meta-Temp-URL-Key`` and optionally
+``X-Account-Meta-Temp-URL-Key-2`` should be set. The Temp URL
+functionality relies on an HMAC-SHA1 signature against these secret
+keys.
+
+.. note:: If you are planning to expose Temp URL functionality for the
+ Swift API, it is strongly recommended to include the Swift
+ account name in the endpoint definition, so as to most
+ closely emulate the behavior of native OpenStack Swift. To
+ do so, set the ``ceph.conf`` configuration option ``rgw
+ swift account in url = true``, and update your Keystone
+ endpoint to the URL suffix ``/v1/AUTH_%(tenant_id)s``
+ (instead of just ``/v1``).
+
+
+POST Temp-URL Keys
+==================
+
+A ``POST`` request to the Swift account with the required key will set
+the secret temp URL key for the account, against which temporary URL
+access can be provided. Up to two keys are supported, and
+signatures are checked against both keys, if present, so that keys
+can be rotated without invalidating the temporary URLs.
+
+.. note:: Native OpenStack Swift also supports the option to set
+ temporary URL keys at the container level, issuing a
+ ``POST`` or ``PUT`` request against a container that sets
+ ``X-Container-Meta-Temp-URL-Key`` or
+ ``X-Container-Meta-Temp-URL-Key-2``. This functionality is
+ not supported in radosgw; temporary URL keys can only be set
+ and used at the account level.
+
+Syntax
+~~~~~~
+
+::
+
+ POST /{api version}/{account} HTTP/1.1
+ Host: {fqdn}
+ X-Auth-Token: {auth-token}
+
+Request Headers
+~~~~~~~~~~~~~~~
+
+``X-Account-Meta-Temp-URL-Key``
+
+:Description: A user-defined key that takes an arbitrary string value.
+:Type: String
+:Required: Yes
+
+``X-Account-Meta-Temp-URL-Key-2``
+
+:Description: A user-defined key that takes an arbitrary string value.
+:Type: String
+:Required: No
+
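+With ``python-swiftclient``, the key can be set through an account-level
+``POST`` (authenticated connection ``conn`` assumed; the key value here is
+generated rather than taken from the examples):
+
+.. code-block:: python
+
+    import base64
+    import os
+
+    # Generate and set a random account-level temp URL key.
+    temp_url_key = base64.b64encode(os.urandom(32)).decode()
+    conn.post_account(headers={'X-Account-Meta-Temp-URL-Key': temp_url_key})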
+
+GET Temp-URL Objects
+====================
+
+Temporary URL uses a cryptographic HMAC-SHA1 signature, which includes
+the following elements:
+
+#. The value of the request method, e.g. "GET"
+#. The expiry time, expressed as seconds since the epoch (i.e. Unix time)
+#. The request path, starting from "v1" onwards
+
+The above items are normalized, with newlines appended between them,
+and an HMAC is generated using the SHA-1 hashing algorithm against one
+of the Temp URL Keys posted earlier.
+
+A sample Python script demonstrating the above is given below:
+
+
+.. code-block:: python
+
+    import hmac
+    from hashlib import sha1
+    from time import time
+
+    method = 'GET'
+    host = 'https://objectstore.example.com/swift'
+    duration_in_seconds = 300  # Duration for which the url is valid
+    expires = int(time() + duration_in_seconds)
+    path = '/v1/your-bucket/your-object'
+    key = 'secret'
+    hmac_body = '%s\n%s\n%s' % (method, expires, path)
+    # hmac.new() requires bytes, so encode the key and message first.
+    sig = hmac.new(key.encode('utf-8'), hmac_body.encode('utf-8'),
+                   sha1).hexdigest()
+    rest_uri = "{host}{path}?temp_url_sig={sig}&temp_url_expires={expires}".format(
+        host=host, path=path, sig=sig, expires=expires)
+    print(rest_uri)
+
+    # Example Output
+    # https://objectstore.example.com/swift/v1/your-bucket/your-object?temp_url_sig=ff4657876227fc6025f04fcf1e82818266d022c6&temp_url_expires=1423200992
+
diff --git a/doc/radosgw/swift/tutorial.rst b/doc/radosgw/swift/tutorial.rst
new file mode 100644
index 000000000..5d2889b19
--- /dev/null
+++ b/doc/radosgw/swift/tutorial.rst
@@ -0,0 +1,62 @@
+==========
+ Tutorial
+==========
+
+The Swift-compatible API tutorials follow a simple container-based object
+lifecycle. The first step requires you to set up a connection between your
+client and the RADOS Gateway server. Then, you may follow a natural
+container and object lifecycle, including adding and retrieving object
+metadata. See example code for the following languages:
+
+- `Java`_
+- `Python`_
+- `Ruby`_
+
+
+.. ditaa::
+
+ +----------------------------+ +-----------------------------+
+ | | | |
+ | Create a Connection |------->| Create a Container |
+ | | | |
+ +----------------------------+ +-----------------------------+
+ |
+ +--------------------------------------+
+ |
+ v
+ +----------------------------+ +-----------------------------+
+ | | | |
+ | Create an Object |------->| Add/Update Object Metadata |
+ | | | |
+ +----------------------------+ +-----------------------------+
+ |
+ +--------------------------------------+
+ |
+ v
+ +----------------------------+ +-----------------------------+
+ | | | |
+ | List Owned Containers |------->| List a Container's Contents |
+ | | | |
+ +----------------------------+ +-----------------------------+
+ |
+ +--------------------------------------+
+ |
+ v
+ +----------------------------+ +-----------------------------+
+ | | | |
+ | Get an Object's Metadata |------->| Retrieve an Object |
+ | | | |
+ +----------------------------+ +-----------------------------+
+ |
+ +--------------------------------------+
+ |
+ v
+ +----------------------------+ +-----------------------------+
+ | | | |
+ | Delete an Object |------->| Delete a Container |
+ | | | |
+ +----------------------------+ +-----------------------------+
+
+.. _Java: ../java
+.. _Python: ../python
+.. _Ruby: ../ruby
diff --git a/doc/radosgw/sync-modules.rst b/doc/radosgw/sync-modules.rst
new file mode 100644
index 000000000..61797edc8
--- /dev/null
+++ b/doc/radosgw/sync-modules.rst
@@ -0,0 +1,97 @@
+============
+Sync Modules
+============
+
+.. versionadded:: Kraken
+
+The :ref:`multisite` functionality of RGW introduced in Jewel made it possible to
+create multiple zones and mirror data and metadata between them. ``Sync Modules``
+are built atop the multisite framework and allow forwarding data and
+metadata to a different external tier. A sync module allows for a set of actions
+to be performed whenever a change in data occurs (metadata ops, such as bucket or
+user creation, are also regarded as changes in data). Because rgw multisite
+changes are eventually consistent at remote sites, they are propagated
+asynchronously. This unlocks use cases such as backing up
+object storage to an external cloud cluster, building a custom backup solution
+using tape drives, or indexing metadata in ElasticSearch.
+
+A sync module configuration is local to a zone. The sync module determines
+whether the zone exports data or can only consume data that was modified in
+another zone. As of Luminous, the supported sync plugins are `elasticsearch`_;
+``rgw``, the default sync plugin, which synchronizes data between
+zones; and ``log``, a trivial sync plugin that logs the metadata
+operations that happen in the remote zones. The following docs are written using
+the example of a zone with the `elasticsearch sync module`_; the process is
+similar for configuring any sync plugin.
+
+.. toctree::
+ :maxdepth: 1
+
+ ElasticSearch Sync Module <elastic-sync-module>
+ Cloud Sync Module <cloud-sync-module>
+ Archive Sync Module <archive-sync-module>
+
+.. note:: ``rgw`` is the default sync plugin and there is no need to explicitly
+   configure it.
+
+Requirements and Assumptions
+----------------------------
+
+Let us assume a simple multisite configuration, as described in the :ref:`multisite`
+docs, with two zones ``us-east`` and ``us-west``. To these we add a third zone
+``us-east-es``, which only processes metadata from the other
+sites. This zone can be in the same Ceph cluster as ``us-east`` or in a
+different one. It only consumes metadata from the other zones, and RGWs in
+this zone do not serve any end user requests directly.
+
+
+Configuring Sync Modules
+------------------------
+
+Create the third zone as described in the :ref:`multisite` docs, for example:
+
+::
+
+ # radosgw-admin zone create --rgw-zonegroup=us --rgw-zone=us-east-es \
+ --access-key={system-key} --secret={secret} --endpoints=http://rgw-es:80
+
+
+
+A sync module can be configured for this zone with the following command:
+
+::
+
+ # radosgw-admin zone modify --rgw-zone={zone-name} --tier-type={tier-type} --tier-config={set of key=value pairs}
+
+
+For example, for the ``elasticsearch`` sync module:
+
+::
+
+ # radosgw-admin zone modify --rgw-zone={zone-name} --tier-type=elasticsearch \
+ --tier-config=endpoint=http://localhost:9200,num_shards=10,num_replicas=1
+
+
+For the various supported tier-config options, refer to the `elasticsearch sync module`_ docs.
+
+Finally, update the period:
+
+
+::
+
+ # radosgw-admin period update --commit
+
+
+Now start radosgw in the zone:
+
+::
+
+ # systemctl start ceph-radosgw@rgw.`hostname -s`
+ # systemctl enable ceph-radosgw@rgw.`hostname -s`
+
+
+
+.. _`elasticsearch sync module`: ../elastic-sync-module
+.. _`elasticsearch`: ../elastic-sync-module
+.. _`cloud sync module`: ../cloud-sync-module
+.. _`archive sync module`: ../archive-sync-module
diff --git a/doc/radosgw/troubleshooting.rst b/doc/radosgw/troubleshooting.rst
new file mode 100644
index 000000000..4a084e82a
--- /dev/null
+++ b/doc/radosgw/troubleshooting.rst
@@ -0,0 +1,208 @@
+=================
+ Troubleshooting
+=================
+
+
+The Gateway Won't Start
+=======================
+
+If you cannot start the gateway (i.e., there is no existing ``pid``),
+check to see if there is an existing ``.asok`` file from another
+user. If an ``.asok`` file from another user exists and there is no
+running ``pid``, remove the ``.asok`` file and try to start the
+process again. This may occur when you start the process as a ``root`` user and
+the startup script is trying to start the process as a
+``www-data`` or ``apache`` user and an existing ``.asok`` is
+preventing the script from starting the daemon.
+
+The radosgw init script (/etc/init.d/radosgw) also has a verbose argument that
+can provide some insight as to what could be the issue::
+
+ /etc/init.d/radosgw start -v
+
+or ::
+
+    /etc/init.d/radosgw start --verbose
+
+HTTP Request Errors
+===================
+
+Examining the access and error logs for the web server itself is
+probably the first step in identifying what is going on. If there is
+a 500 error, that usually indicates a problem communicating with the
+``radosgw`` daemon. Ensure the daemon is running, its socket path is
+configured, and that the web server is looking for it in the proper
+location.
+
+
+Crashed ``radosgw`` process
+===========================
+
+If the ``radosgw`` process dies, you will normally see a 500 error
+from the web server (apache, nginx, etc.). In that situation, simply
+restarting radosgw will restore service.
+
+To diagnose the cause of the crash, check the log in ``/var/log/ceph``
+and/or the core file (if one was generated).
+
+
+Blocked ``radosgw`` Requests
+============================
+
+If some (or all) radosgw requests appear to be blocked, you can get
+some insight into the internal state of the ``radosgw`` daemon via
+its admin socket. By default, there will be a socket configured to
+reside in ``/var/run/ceph``, and the daemon can be queried with::
+
+ ceph daemon /var/run/ceph/client.rgw help
+
+ help list available commands
+ objecter_requests show in-progress osd requests
+ perfcounters_dump dump perfcounters value
+ perfcounters_schema dump perfcounters schema
+ version get protocol version
+
+Of particular interest::
+
+ ceph daemon /var/run/ceph/client.rgw objecter_requests
+ ...
+
+will dump information about current in-progress requests with the
+RADOS cluster. This allows one to identify if any requests are blocked
+by a non-responsive OSD. For example, one might see::
+
+ { "ops": [
+ { "tid": 1858,
+ "pg": "2.d2041a48",
+ "osd": 1,
+ "last_sent": "2012-03-08 14:56:37.949872",
+ "attempts": 1,
+ "object_id": "fatty_25647_object1857",
+ "object_locator": "@2",
+ "snapid": "head",
+ "snap_context": "0=[]",
+ "mtime": "2012-03-08 14:56:37.949813",
+ "osd_ops": [
+ "write 0~4096"]},
+ { "tid": 1873,
+ "pg": "2.695e9f8e",
+ "osd": 1,
+ "last_sent": "2012-03-08 14:56:37.970615",
+ "attempts": 1,
+ "object_id": "fatty_25647_object1872",
+ "object_locator": "@2",
+ "snapid": "head",
+ "snap_context": "0=[]",
+ "mtime": "2012-03-08 14:56:37.970555",
+ "osd_ops": [
+ "write 0~4096"]}],
+ "linger_ops": [],
+ "pool_ops": [],
+ "pool_stat_ops": [],
+ "statfs_ops": []}
+
+In this dump, two requests are in progress. The ``last_sent`` field is
+the time the RADOS request was sent. If this is a while ago, it suggests
+that the OSD is not responding. For example, for request 1858, you could
+check the OSD status with::
+
+ ceph pg map 2.d2041a48
+
+ osdmap e9 pg 2.d2041a48 (2.0) -> up [1,0] acting [1,0]
+
+This tells us to look at ``osd.1``, the primary copy for this PG::
+
+ ceph daemon osd.1 ops
+ { "num_ops": 651,
+ "ops": [
+ { "description": "osd_op(client.4124.0:1858 fatty_25647_object1857 [write 0~4096] 2.d2041a48)",
+ "received_at": "1331247573.344650",
+ "age": "25.606449",
+ "flag_point": "waiting for sub ops",
+ "client_info": { "client": "client.4124",
+ "tid": 1858}},
+ ...
+
+The ``flag_point`` field indicates that the OSD is currently waiting
+for replicas to respond, in this case ``osd.0``.
+
+
+Java S3 API Troubleshooting
+===========================
+
+
+Peer Not Authenticated
+----------------------
+
+You may receive an error that looks like this::
+
+ [java] INFO: Unable to execute HTTP request: peer not authenticated
+
+The Java SDK for S3 requires a valid certificate from a recognized certificate
+authority, because it uses HTTPS by default. If you are just testing the Ceph
+Object Storage services, you can resolve this problem in a few ways:
+
+#. Prepend the IP address or hostname with ``http://``. For example, change this::
+
+ conn.setEndpoint("myserver");
+
+ To::
+
+ conn.setEndpoint("http://myserver")
+
+#. After setting your credentials, add a client configuration and set the
+ protocol to ``Protocol.HTTP``. ::
+
+ AWSCredentials credentials = new BasicAWSCredentials(accessKey, secretKey);
+
+ ClientConfiguration clientConfig = new ClientConfiguration();
+ clientConfig.setProtocol(Protocol.HTTP);
+
+ AmazonS3 conn = new AmazonS3Client(credentials, clientConfig);
+
+
+
+405 MethodNotAllowed
+--------------------
+
+If you receive a 405 error, check to see if you have the S3 subdomain set up correctly.
+You will need to have a wild card setting in your DNS record for subdomain functionality
+to work properly.
+
+Also, check to ensure that the default site is disabled. ::
+
+ [java] Exception in thread "main" Status Code: 405, AWS Service: Amazon S3, AWS Request ID: null, AWS Error Code: MethodNotAllowed, AWS Error Message: null, S3 Extended Request ID: null
+
+
+
+Numerous objects in default.rgw.meta pool
+=========================================
+
+Clusters created prior to *jewel* have a metadata archival feature enabled by default, using the ``default.rgw.meta`` pool.
+This archive keeps all old versions of user and bucket metadata, resulting in large numbers of objects in the ``default.rgw.meta`` pool.
+
+Disabling the Metadata Heap
+---------------------------
+
+Users who want to disable this feature going forward should set the ``metadata_heap`` field to an empty string ``""``::
+
+ $ radosgw-admin zone get --rgw-zone=default > zone.json
+ [edit zone.json, setting "metadata_heap": ""]
+ $ radosgw-admin zone set --rgw-zone=default --infile=zone.json
+ $ radosgw-admin period update --commit
+
+This will stop new metadata from being written to the ``default.rgw.meta`` pool, but does not remove any existing objects or pool.
+
+Cleaning the Metadata Heap Pool
+-------------------------------
+
+Clusters created prior to *jewel* normally use ``default.rgw.meta`` only for the metadata archival feature.
+
+However, from *luminous* onwards, radosgw uses :ref:`Pool Namespaces <radosgw-pool-namespaces>` within ``default.rgw.meta`` for an entirely different purpose, that is, to store ``user_keys`` and other critical metadata.
+
+Users should check zone configuration before proceeding with any cleanup procedures::
+
+ $ radosgw-admin zone get --rgw-zone=default | grep default.rgw.meta
+ [should not match any strings]
+
+Having confirmed that the pool is not used for any purpose, users may safely delete all objects in the ``default.rgw.meta`` pool, or optionally, delete the entire pool itself.
diff --git a/doc/radosgw/vault.rst b/doc/radosgw/vault.rst
new file mode 100644
index 000000000..da34a3919
--- /dev/null
+++ b/doc/radosgw/vault.rst
@@ -0,0 +1,442 @@
+===========================
+HashiCorp Vault Integration
+===========================
+
+HashiCorp `Vault`_ can be used as a secure key management service for
+`Server-Side Encryption`_ (SSE-KMS).
+
+.. ditaa::
+
+ +---------+ +---------+ +-------+ +-------+
+ | Client | | RadosGW | | Vault | | OSD |
+ +---------+ +---------+ +-------+ +-------+
+ | create secret | | |
+ | key for key ID | | |
+ |-----------------+---------------->| |
+ | | | |
+ | upload object | | |
+ | with key ID | | |
+ |---------------->| request secret | |
+ | | key for key ID | |
+ | |---------------->| |
+ | |<----------------| |
+ | | return secret | |
+ | | key | |
+ | | | |
+ | | encrypt object | |
+ | | with secret key | |
+ | |--------------+ | |
+ | | | | |
+ | |<-------------+ | |
+ | | | |
+ | | store encrypted | |
+ | | object | |
+ | |------------------------------>|
+
+#. `Vault secrets engines`_
+#. `Vault authentication`_
+#. `Vault namespaces`_
+#. `Create a key in Vault`_
+#. `Configure the Ceph Object Gateway`_
+#. `Upload object`_
+
+Some examples below use the Vault command line utility to interact with
+Vault. You may need to set the following environment variable with the correct
+address of your Vault server to use this utility::
+
+ export VAULT_ADDR='https://vault-server-fqdn:8200'
+
+Vault secrets engines
+=====================
+
+Vault provides several secrets engines, which can store, generate, and encrypt
+data. Currently, the Object Gateway supports:
+
+- `KV secrets engine`_ version 2
+- `Transit engine`_
+
+KV secrets engine
+-----------------
+
+The KV secrets engine is used to store arbitrary key/value secrets in Vault. To
+enable the KV engine version 2 in Vault, use the following command::
+
+ vault secrets enable -path secret kv-v2
+
+The Object Gateway can be configured to use the KV engine version 2 with the
+following setting::
+
+ rgw crypt vault secret engine = kv
+
+Transit secrets engine
+----------------------
+
+The transit engine handles cryptographic functions on data in-transit. To enable
+it in Vault, use the following command::
+
+ vault secrets enable transit
+
+The Object Gateway can be configured to use the transit engine with the
+following setting::
+
+ rgw crypt vault secret engine = transit
+
+Vault authentication
+====================
+
+Vault supports several authentication mechanisms. Currently, the Object
+Gateway can be configured to authenticate to Vault using the
+`Token authentication method`_ or a `Vault agent`_.
+
+Most tokens in Vault have limited lifetimes and powers. The only
+sort of Vault token that does not have a lifetime is the root token.
+For all other tokens, it is necessary to periodically refresh them,
+either by performing initial authentication, or by renewing the token.
+Ceph does not have any logic to perform either operation.
+The simplest way to use Vault tokens with ceph is to
+also run the Vault agent and have it refresh the token file.
+When the Vault agent is used in this mode, file system permissions
+can be used to restrict who may use the tokens.
+
+Instead of having the Vault agent refresh a token file, it can be told
+to act as a proxy server. In this mode, the agent adds a token when
+necessary to requests passed to it, before forwarding them on
+to the real server. The Vault agent still handles token renewal just
+as it would when storing a token in the filesystem. In this mode, it
+is necessary to properly secure the network path rgw uses to reach the
+Vault agent, for example by having the Vault agent listen only on localhost.
+
+Token policies for the object gateway
+-------------------------------------
+
+All Vault tokens have powers as specified by the policies attached
+to that token. Multiple policies may be associated with one
+token. You should only use the policies necessary for your
+configuration.
+
+When using the kv secret engine with the object gateway::
+
+ vault policy write rgw-kv-policy -<<EOF
+ path "secret/data/*" {
+ capabilities = ["read"]
+ }
+ EOF
+
+When using the transit secret engine with the object gateway::
+
+ vault policy write rgw-transit-policy -<<EOF
+ path "transit/keys/*" {
+ capabilities = [ "create", "update" ]
+ denied_parameters = {"exportable" = [], "allow_plaintext_backup" = [] }
+ }
+
+ path "transit/keys/*" {
+ capabilities = ["read", "delete"]
+ }
+
+ path "transit/keys/" {
+ capabilities = ["list"]
+ }
+
+ path "transit/keys/+/rotate" {
+ capabilities = [ "update" ]
+ }
+
+ path "transit/*" {
+ capabilities = [ "update" ]
+ }
+ EOF
+
+If you had previously used an older version of ceph with the
+transit secret engine, you might need the following policy::
+
+ vault policy write old-rgw-transit-policy -<<EOF
+ path "transit/export/encryption-key/*" {
+ capabilities = ["read"]
+ }
+ EOF
+
+If you are using both sse-kms and sse-s3, then you should point
+each to separate containers. You could use separate
+vault instances, separately mounted
+transit instances, or different branches under a common transit
+point. If you are not using separate vault instances, you can
+use ``rgw_crypt_vault_prefix`` and/or
+``rgw_crypt_sse_s3_vault_prefix`` to point kms and sse-s3
+to separate containers.
+When granting vault permissions to sse-kms bucket owners, you should
+not give them permission to muck around with sse-s3 keys;
+only ceph itself should be doing that.
+
+Token authentication
+--------------------
+
+.. note:: Never use root tokens with ceph in production environments.
+
+The token authentication method expects a Vault token to be present in a
+plaintext file. The Object Gateway can be configured to use token authentication
+with the following settings::
+
+ rgw crypt vault auth = token
+ rgw crypt vault token file = /run/.rgw-vault-token
+ rgw crypt vault addr = https://vault-server-fqdn:8200
+
+Adjust these settings to match your configuration.
+For security reasons, the token file must be readable by the Object Gateway
+only.
+
+Vault agent
+-----------
+
+The Vault agent is a client daemon that provides authentication to Vault and
+manages token renewal and caching. It typically runs on the same host as the
+Object Gateway. With a Vault agent, it is possible to use other Vault
+authentication mechanisms such as AppRole, AWS, Certs, JWT, and Azure.
+
+The Object Gateway can be configured to use a Vault agent with the following
+settings::
+
+ rgw crypt vault auth = agent
+ rgw crypt vault addr = http://127.0.0.1:8100
+
+You might set up vault agent as follows::
+
+ vault write auth/approle/role/rgw-ap \
+ token_policies=rgw-transit-policy,default \
+ token_max_ttl=60m
+
+Change the policy here to match your configuration.
+
+Get the role-id::
+
+ vault read -format=json auth/approle/role/rgw-ap/role-id | \
+   jq -r .data.role_id
+
+Store the output in a file, such as ``/usr/local/etc/vault/.rgw-ap-role-id``.
+
+Get the secret-id::
+
+ vault write -f -format=json auth/approle/role/rgw-ap/secret-id | \
+   jq -r .data.secret_id
+
+Store the output in a file, such as ``/usr/local/etc/vault/.rgw-ap-secret-id``.
+
+Create configuration for the Vault agent, such as::
+
+ pid_file = "/run/rgw-vault-agent-pid"
+
+ auto_auth {
+   method "AppRole" {
+     mount_path = "auth/approle"
+     config = {
+       role_id_file_path = "/usr/local/etc/vault/.rgw-ap-role-id"
+       secret_id_file_path = "/usr/local/etc/vault/.rgw-ap-secret-id"
+       remove_secret_id_file_after_reading = "false"
+     }
+   }
+ }
+
+ cache {
+   use_auto_auth_token = true
+ }
+
+ listener "tcp" {
+   address = "127.0.0.1:8100"
+   tls_disable = true
+ }
+
+ vault {
+   address = "https://vault-server-fqdn:8200"
+ }
+
+Then use systemctl or another method of your choice to run the
+following command as a persistent daemon::
+
+ /usr/local/bin/vault agent -config=/usr/local/etc/vault/rgw-agent.hcl
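+
+For example, with systemd, a minimal unit file might look like this (a
+sketch; adjust paths and add hardening options to suit your deployment)::
+
+ [Unit]
+ Description=Vault agent for the Ceph Object Gateway
+ After=network-online.target
+
+ [Service]
+ ExecStart=/usr/local/bin/vault agent -config=/usr/local/etc/vault/rgw-agent.hcl
+ Restart=on-failure
+
+ [Install]
+ WantedBy=multi-user.target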
+
+Once the Vault agent is running, you should find it listening on
+port 8100 on localhost, and you should be able to interact with it
+using the ``vault`` command.
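+
+For example, pointing the ``vault`` client at the agent should return
+the server's status::
+
+ VAULT_ADDR=http://127.0.0.1:8100 vault status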
+
+Vault namespaces
+================
+
+In the Enterprise version, Vault supports the concept of `namespaces`_, which
+allows centralized management for teams within an organization while ensuring
+that those teams operate within isolated environments known as tenants.
+
+The Object Gateway can be configured to access Vault within a particular
+namespace using the following configuration setting::
+
+ rgw crypt vault namespace = tenant1
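+
+Any keys or policies you create for the Gateway must then live in the
+same namespace, e.g. by setting ``VAULT_NAMESPACE`` when using the
+command line utility::
+
+ VAULT_NAMESPACE=tenant1 vault secrets list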
+
+Create a key in Vault
+=====================
+
+.. note:: Keys for server-side encryption must be 256 bits long and base-64
+   encoded.
+
+Using the KV engine
+-------------------
+
+A key for server-side encryption can be created in the KV version 2 engine using
+the command line utility, as in the following example::
+
+ vault kv put secret/myproject/mybucketkey key=$(openssl rand -base64 32)
+
+Sample output::
+
+ ====== Metadata ======
+ Key              Value
+ ---              -----
+ created_time     2019-08-29T17:01:09.095824999Z
+ deletion_time    n/a
+ destroyed        false
+ version          1
+
+Note that in the KV secrets engine, secrets are stored as key-value pairs, and
+the Gateway expects the key name to be ``key``, i.e. the secret must be in the
+form ``key=<secret key>``.
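+
+You can verify that the secret was stored in the expected form by reading
+it back; for KV version 2 the secret data is nested under ``.data.data``
+in the JSON output::
+
+ vault kv get -format=json secret/myproject/mybucketkey | jq -r .data.data.key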
+
+Using the Transit engine
+------------------------
+
+Keys created for use with the Transit engine should no longer be marked
+exportable. They can be created with::
+
+ vault write -f transit/keys/mybucketkey
+
+The command above creates a keyring, which contains a key of type
+``aes256-gcm96`` by default. To verify that the key was correctly created, use
+the following command::
+
+ vault read transit/keys/mybucketkey
+
+Sample output::
+
+ Key                       Value
+ ---                       -----
+ derived                   false
+ exportable                false
+ name                      mybucketkey
+ type                      aes256-gcm96
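+
+Because the Gateway refers to the keyring by name, the key can later be
+rotated in place; transit retains older key versions, so existing
+objects remain decryptable. For example::
+
+ vault write -f transit/keys/mybucketkey/rotate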
+
+Configure the Ceph Object Gateway
+=================================
+
+Edit the Ceph configuration file to enable Vault as a KMS backend for
+server-side encryption::
+
+ rgw crypt s3 kms backend = vault
+
+Choose the Vault authentication method, e.g.::
+
+ rgw crypt vault auth = token
+ rgw crypt vault token file = /run/.rgw-vault-token
+ rgw crypt vault addr = https://vault-server-fqdn:8200
+
+Or::
+
+ rgw crypt vault auth = agent
+ rgw crypt vault addr = http://localhost:8100
+
+Choose the secrets engine::
+
+ rgw crypt vault secret engine = kv
+
+Or::
+
+ rgw crypt vault secret engine = transit
+
+Optionally, set the Vault namespace where encryption keys will be fetched from::
+
+ rgw crypt vault namespace = tenant1
+
+Finally, the URL path from which the Gateway retrieves encryption keys can
+be restricted by setting a path prefix. For instance, the Gateway can be
+restricted to fetching KV keys as follows::
+
+ rgw crypt vault prefix = /v1/secret/data
+
+Or, when using the transit secret engine::
+
+ rgw crypt vault prefix = /v1/transit
+
+In the example above, the Gateway would only fetch transit encryption keys under
+``https://vault-server:8200/v1/transit``.
+
+You can use custom SSL certificates to authenticate with Vault with the
+help of the following options::
+
+ rgw crypt vault verify ssl = true
+ rgw crypt vault ssl cacert = /etc/ceph/vault.ca
+ rgw crypt vault ssl clientcert = /etc/ceph/vault.crt
+ rgw crypt vault ssl clientkey = /etc/ceph/vault.key
+
+where ``vault.ca`` is the CA certificate, and ``vault.key``/``vault.crt`` are
+the private key and SSL certificate generated for RGW to access the Vault
+server. It is highly recommended to keep ``rgw crypt vault verify ssl`` set
+to ``true``; setting it to ``false`` is dangerous and should be avoided
+unless the connection to Vault is otherwise tightly secured.
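+
+To check that the CA file actually validates the Vault server's
+certificate, you might run (a sketch, assuming the host and port from
+the examples above)::
+
+ openssl s_client -connect vault-server-fqdn:8200 \
+   -CAfile /etc/ceph/vault.ca </dev/null
+
+and look for ``Verify return code: 0 (ok)`` in the output.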
+
+Transit engine compatibility support
+------------------------------------
+The transit engine has compatibility support for previous
+versions of Ceph, which used the transit engine as a simple key store.
+
+There is a ``compat`` option which can be given to the transit
+engine to configure the compatibility support.
+
+To entirely disable backwards support, use::
+
+ rgw crypt vault secret engine = transit compat=0
+
+This will be the default in future versions, and it is safe to use
+for new installs running the current version.
+
+This is the normal default with the current version::
+
+ rgw crypt vault secret engine = transit compat=1
+
+This enables the new engine for newly created objects,
+but still allows the old engine to be used for old objects.
+In order to access old and new objects, the vault token given
+to ceph must have both the old and new transit policies.
+
+To force use of only the old engine, use::
+
+ rgw crypt vault secret engine = transit compat=2
+
+This mode is automatically selected if the vault prefix
+ends in ``export/encryption-key``, which was the previously
+documented setting.
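+
+That is, a legacy configuration such as the following implies ``compat=2``::
+
+ rgw crypt vault prefix = /v1/transit/export/encryption-key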
+
+Upload object
+=============
+
+When uploading an object to the Gateway, provide the SSE key ID in the request.
+As an example, for the kv engine, using the AWS command-line client::
+
+ aws --endpoint=http://radosgw:8000 s3 cp plaintext.txt s3://mybucket/encrypted.txt --sse=aws:kms --sse-kms-key-id myproject/mybucketkey
+
+As an example, for the transit engine (new flavor), using the AWS command-line client::
+
+ aws --endpoint=http://radosgw:8000 s3 cp plaintext.txt s3://mybucket/encrypted.txt --sse=aws:kms --sse-kms-key-id mybucketkey
+
+The Object Gateway will fetch the key from Vault, encrypt the object, and
+store it in the bucket. Any request to download the object will make the
+Gateway automatically retrieve the corresponding key from Vault and decrypt
+the object.
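+
+For example, downloading the object requires no extra SSE parameters::
+
+ aws --endpoint=http://radosgw:8000 s3 cp s3://mybucket/encrypted.txt plaintext-copy.txt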
+
+Note that the secret will be fetched from Vault using a URL constructed by
+concatenating the base address (``rgw crypt vault addr``), the (optional)
+URL prefix (``rgw crypt vault prefix``), and finally the key ID.
+
+In the kv engine example above, the Gateway would fetch the secret from::
+
+ http://vaultserver:8200/v1/secret/data/myproject/mybucketkey
+
+In the transit engine example above, the Gateway would encrypt the secret using this key::
+
+ http://vaultserver:8200/v1/transit/mybucketkey
+
+.. _Server-Side Encryption: ../encryption
+.. _Vault: https://www.vaultproject.io/docs/
+.. _Token authentication method: https://www.vaultproject.io/docs/auth/token.html
+.. _Vault agent: https://www.vaultproject.io/docs/agent/index.html
+.. _KV Secrets engine: https://www.vaultproject.io/docs/secrets/kv/
+.. _Transit engine: https://www.vaultproject.io/docs/secrets/transit
+.. _namespaces: https://www.vaultproject.io/docs/enterprise/namespaces/index.html