summaryrefslogtreecommitdiffstats
path: root/doc/dev
diff options
context:
space:
mode:
Diffstat (limited to 'doc/dev')
-rw-r--r--doc/dev/PlanningImplementation.txt43
-rw-r--r--doc/dev/blkin.rst169
-rw-r--r--doc/dev/bluestore.rst85
-rw-r--r--doc/dev/cache-pool.rst200
-rw-r--r--doc/dev/ceph-volume/index.rst14
-rw-r--r--doc/dev/ceph-volume/lvm.rst179
-rw-r--r--doc/dev/ceph-volume/plugins.rst65
-rw-r--r--doc/dev/ceph-volume/systemd.rst37
-rw-r--r--doc/dev/ceph-volume/zfs.rst176
-rw-r--r--doc/dev/ceph_krb_auth.rst1094
-rw-r--r--doc/dev/cephadm/cephadm-exporter.rst306
-rw-r--r--doc/dev/cephadm/compliance-check.rst121
-rw-r--r--doc/dev/cephadm/developing-cephadm.rst263
-rw-r--r--doc/dev/cephadm/host-maintenance.rst104
-rw-r--r--doc/dev/cephadm/index.rst15
-rw-r--r--doc/dev/cephadm/scalability-notes.rst95
-rw-r--r--doc/dev/cephfs-mirroring.rst403
-rw-r--r--doc/dev/cephfs-reclaim.rst104
-rw-r--r--doc/dev/cephfs-snapshots.rst133
-rw-r--r--doc/dev/cephx.rst406
-rw-r--r--doc/dev/cephx_protocol.rst341
-rw-r--r--doc/dev/config-key.rst68
-rw-r--r--doc/dev/config.rst166
-rw-r--r--doc/dev/context.rst20
-rw-r--r--doc/dev/continuous-integration.rst285
-rw-r--r--doc/dev/corpus.rst100
-rw-r--r--doc/dev/cpu-profiler.rst54
-rw-r--r--doc/dev/crimson/crimson.rst284
-rw-r--r--doc/dev/crimson/error-handling.rst158
-rw-r--r--doc/dev/crimson/index.rst11
-rw-r--r--doc/dev/crimson/poseidonstore.rst586
-rw-r--r--doc/dev/cxx.rst27
-rw-r--r--doc/dev/dashboard/ui_goals.rst78
-rw-r--r--doc/dev/delayed-delete.rst13
-rw-r--r--doc/dev/dev_cluster_deployement.rst169
-rw-r--r--doc/dev/developer_guide/basic-workflow.rst515
-rw-r--r--doc/dev/developer_guide/dash-devel.rst2590
-rw-r--r--doc/dev/developer_guide/essentials.rst338
-rw-r--r--doc/dev/developer_guide/index.rst25
-rw-r--r--doc/dev/developer_guide/intro.rst25
-rw-r--r--doc/dev/developer_guide/issue-tracker.rst39
-rw-r--r--doc/dev/developer_guide/merging.rst138
-rw-r--r--doc/dev/developer_guide/running-tests-in-cloud.rst289
-rw-r--r--doc/dev/developer_guide/running-tests-locally.rst138
-rw-r--r--doc/dev/developer_guide/running-tests-using-teuth.rst183
-rw-r--r--doc/dev/developer_guide/tests-integration-tests.rst522
-rw-r--r--doc/dev/developer_guide/tests-unit-tests.rst177
-rw-r--r--doc/dev/development-workflow.rst248
-rw-r--r--doc/dev/documenting.rst144
-rw-r--r--doc/dev/encoding.rst95
-rw-r--r--doc/dev/erasure-coded-pool.rst135
-rw-r--r--doc/dev/file-striping.rst161
-rw-r--r--doc/dev/freebsd.rst53
-rw-r--r--doc/dev/generatedocs.rst83
-rw-r--r--doc/dev/iana.rst16
-rw-r--r--doc/dev/internals.rst52
-rw-r--r--doc/dev/kubernetes.rst228
-rw-r--r--doc/dev/libs.rst18
-rw-r--r--doc/dev/logging.rst106
-rw-r--r--doc/dev/logs.rst55
-rw-r--r--doc/dev/macos.rst50
-rw-r--r--doc/dev/mds_internals/data-structures.rst44
-rw-r--r--doc/dev/mds_internals/exports.rst76
-rw-r--r--doc/dev/mds_internals/index.rst10
-rw-r--r--doc/dev/messenger.rst33
-rw-r--r--doc/dev/mon-bootstrap.rst212
-rw-r--r--doc/dev/mon-elections.rst130
-rw-r--r--doc/dev/mon-on-disk-formats.rst91
-rw-r--r--doc/dev/mon-osdmap-prune.rst415
-rw-r--r--doc/dev/msgr2.rst840
-rw-r--r--doc/dev/network-encoding.rst214
-rw-r--r--doc/dev/network-protocol.rst197
-rw-r--r--doc/dev/object-store.rst67
-rw-r--r--doc/dev/osd-class-path.rst16
-rw-r--r--doc/dev/osd_internals/async_recovery.rst53
-rw-r--r--doc/dev/osd_internals/backfill_reservation.rst93
-rw-r--r--doc/dev/osd_internals/erasure_coding.rst87
-rw-r--r--doc/dev/osd_internals/erasure_coding/developer_notes.rst223
-rw-r--r--doc/dev/osd_internals/erasure_coding/ecbackend.rst207
-rw-r--r--doc/dev/osd_internals/erasure_coding/jerasure.rst33
-rw-r--r--doc/dev/osd_internals/erasure_coding/proposals.rst385
-rw-r--r--doc/dev/osd_internals/index.rst10
-rw-r--r--doc/dev/osd_internals/last_epoch_started.rst60
-rw-r--r--doc/dev/osd_internals/log_based_pg.rst208
-rw-r--r--doc/dev/osd_internals/manifest.rst623
-rw-r--r--doc/dev/osd_internals/map_message_handling.rst131
-rw-r--r--doc/dev/osd_internals/osd_overview.rst106
-rw-r--r--doc/dev/osd_internals/osd_throttles.rst93
-rw-r--r--doc/dev/osd_internals/osd_throttles.txt21
-rw-r--r--doc/dev/osd_internals/osdmap_versions.txt259
-rw-r--r--doc/dev/osd_internals/partial_object_recovery.rst148
-rw-r--r--doc/dev/osd_internals/pg.rst31
-rw-r--r--doc/dev/osd_internals/pg_removal.rst56
-rw-r--r--doc/dev/osd_internals/pgpool.rst22
-rw-r--r--doc/dev/osd_internals/recovery_reservation.rst83
-rw-r--r--doc/dev/osd_internals/refcount.rst45
-rw-r--r--doc/dev/osd_internals/scrub.rst41
-rw-r--r--doc/dev/osd_internals/snaps.rst128
-rw-r--r--doc/dev/osd_internals/stale_read.rst101
-rw-r--r--doc/dev/osd_internals/watch_notify.rst81
-rw-r--r--doc/dev/osd_internals/wbthrottle.rst28
-rw-r--r--doc/dev/peering.rst259
-rw-r--r--doc/dev/perf.rst55
-rw-r--r--doc/dev/perf_counters.rst198
-rw-r--r--doc/dev/perf_histograms.rst677
-rw-r--r--doc/dev/placement-group.rst210
-rw-r--r--doc/dev/quick_guide.rst140
-rw-r--r--doc/dev/rados-client-protocol.rst117
-rw-r--r--doc/dev/radosgw/admin/adminops_nonimplemented.rst495
-rw-r--r--doc/dev/radosgw/index.rst13
-rw-r--r--doc/dev/radosgw/s3_compliance.rst304
-rw-r--r--doc/dev/radosgw/usage.rst84
-rw-r--r--doc/dev/rbd-diff.rst146
-rw-r--r--doc/dev/rbd-export.rst104
-rw-r--r--doc/dev/rbd-layering.rst281
-rw-r--r--doc/dev/release-checklists.rst107
-rw-r--r--doc/dev/release-process.rst225
-rw-r--r--doc/dev/seastore.rst323
-rw-r--r--doc/dev/sepia.rst8
-rw-r--r--doc/dev/session_authentication.rst160
-rw-r--r--doc/dev/testing.rst40
-rw-r--r--doc/dev/versions.rst42
-rw-r--r--doc/dev/vstart-ganesha.rst45
-rw-r--r--doc/dev/wireshark.rst41
-rw-r--r--doc/dev/zoned-storage.rst134
125 files changed, 22704 insertions, 0 deletions
diff --git a/doc/dev/PlanningImplementation.txt b/doc/dev/PlanningImplementation.txt
new file mode 100644
index 000000000..871eb5f37
--- /dev/null
+++ b/doc/dev/PlanningImplementation.txt
@@ -0,0 +1,43 @@
+ <big>About this Document</big>
+This document contains planning and implementation procedures for Ceph. The audience for this document includes technical support personnel, installation engineers, system administrators, and quality assurance.
+<B>Prerequisites<b>
+Users of this document must be familiar with Linux command line options. They must also be familiar with the overall Ceph product.
+Before You Begin
+Before implementing a new Ceph System, first answer the questions in the Ceph Getting Started Guide to determine your configuration needs. Once you have determined your hardware and configuration needs, the following decisions must be made:
+ Determine what level of technical support you need. Pick from the Ceph Technical Support options in the next section.
+ Determine how much and what level of training your organization needs.
+Ceph Technical Support Options
+The Ceph Technical support model provides 4 tiers of technical support options:
+1st This option is for brand new customers that need installation, configuration, and setup on their production environment.
+2nd This level of support requires a trouble ticket to be generated on a case by case basis as customer difficulties arise. Customers can choose between two maintenance options; they can either purchase a yearly maintenance contract, or pay for each trouble resolution as it occurs.
+3rd This option comes with our bundled packages for customers who have also purchased our hosting plans. In this case, the customer is a service provider. The Help Desk can generally provide this level of incident resolution. (NEED MORE INFO)
+4th This level of support requires a Service Level Agreement (SLA) between the customer and Dreamhost. This level is used for handling the most difficult or advanced problems.
+Planning a Ceph Cluster Configuration
+The following section contains guidelines for planning the deployment for a Ceph cluster configuration. A Ceph cluster consists of the following core components:
+ Monitors These must be an odd number, such as one, three, or five. Three is the preferred configuration.
+ Object Storage Devices (OSD) used as storage nodes
+ Metadata Servers (MDS)
+For redundancy, you should employ several of these components.
+Monitors
+The monitors handle central cluster management, configuration, and state.
+Hardware Requirements:
+ A few gigs of local disk space
+ A fixed network address
+ Warning: Never configure 2 monitors per cluster. If you do, they will both have to be up all of the time, which will greatly degrade system performance.
+Object Storage Devices
+The OSDs store the actual data on the disks. A minimum of two is required.
+Hardware Requirements:
+ As many disks as possible for faster performance and scalability
+ An SSD or NVRAM for a journal, or a RAID controller with a battery-backed NVRAM.
+ Ample RAM for better file system caching
+ Fast network
+ Metadata Servers
+The metadata server daemon commands act as a distributed, coherent cache of file system metadata. They do not store data locally; all metadata is stored on disk via the storage nodes.
+Metadata servers can be added into the cluster on an as-needed basis. The load is automatically balanced. The max_mds parameter controls how many cmds instances are active. Any additional running instances are put in standby mode and can be activated if one of the active daemons becomes unresponsive.
+Hardware Requirements:
+ Large amount of RAM
+ Fast CPU
+ Fast (low latency) network
+ At least two servers for redundancy and load balancing
+TIPS: If you have just a few nodes, put cmon, cmds, and cosd on the same node. For moderate node configurations, put cmon and cmds together, and cosd on the disk nodes. For large node configurations, put cmon, cmds, and cosd each on their own dedicated machine.
+
diff --git a/doc/dev/blkin.rst b/doc/dev/blkin.rst
new file mode 100644
index 000000000..00a5748fa
--- /dev/null
+++ b/doc/dev/blkin.rst
@@ -0,0 +1,169 @@
+=========================
+ Tracing Ceph With Blkin
+=========================
+
+Ceph can use Blkin, a library created by Marios Kogias and others,
+which enables tracking a specific request from the time it enters
+the system at higher levels till it is finally served by RADOS.
+
+In general, Blkin implements the Dapper_ tracing semantics
+in order to show the causal relationships between the different
+processing phases that an IO request may trigger. The goal is an
+end-to-end visualisation of the request's route in the system,
+accompanied by information concerning latencies in each processing
+phase. Thanks to LTTng this can happen with a minimal overhead and
+in realtime. The LTTng traces can then be visualized with Twitter's
+Zipkin_.
+
+.. _Dapper: http://static.googleusercontent.com/media/research.google.com/el//pubs/archive/36356.pdf
+.. _Zipkin: https://zipkin.io/
+
+
+Installing Blkin
+================
+
+You can install Markos Kogias' upstream Blkin_ by hand.::
+
+ cd blkin/
+ make && make install
+
+or build distribution packages using DistroReadyBlkin_, which also comes with
+pkgconfig support. If you choose the latter, then you must generate the
+configure and make files first.::
+
+ cd blkin
+ autoreconf -i
+
+.. _Blkin: https://github.com/marioskogias/blkin
+.. _DistroReadyBlkin: https://github.com/agshew/blkin
+
+
+Configuring Ceph with Blkin
+===========================
+
+If you built and installed Blkin by hand, rather than building and
+installing packages, then set these variables before configuring
+Ceph.::
+
+ export BLKIN_CFLAGS=-Iblkin/
+ export BLKIN_LIBS=-lzipkin-cpp
+
+Blkin support in Ceph is disabled by default, so you may
+want to configure with something like::
+
+ ./do_cmake -DWITH_BLKIN=ON
+
+Config option for blkin must be set to true in ceph.conf to get
+traces from rbd through OSDC and OSD::
+
+ rbd_blkin_trace_all = true
+
+
+Testing Blkin
+=============
+
+It's easy to test Ceph's Blkin tracing. Let's assume you don't have
+Ceph already running, and you compiled Ceph with Blkin support but
+you didn't install it. Then launch Ceph with the ``vstart.sh`` script
+in Ceph's src directory so you can see the possible tracepoints.::
+
+ cd src
+ OSD=3 MON=3 RGW=1 ./vstart.sh -n
+ lttng list --userspace
+
+You'll see something like the following:::
+
+ UST events:
+ -------------
+ PID: 8987 - Name: ./ceph-osd
+ zipkin:timestamp (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ zipkin:keyval_integer (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ zipkin:keyval_string (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ lttng_ust_tracelog:TRACE_DEBUG (loglevel: TRACE_DEBUG (14)) (type: tracepoint)
+
+ PID: 8407 - Name: ./ceph-mon
+ zipkin:timestamp (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ zipkin:keyval_integer (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ zipkin:keyval_string (loglevel: TRACE_WARNING (4)) (type: tracepoint)
+ lttng_ust_tracelog:TRACE_DEBUG (loglevel: TRACE_DEBUG (14)) (type: tracepoint)
+
+ ...
+
+Next, stop Ceph so that the tracepoints can be enabled.::
+
+ ./stop.sh
+
+Start up an LTTng session and enable the tracepoints.::
+
+ lttng create blkin-test
+ lttng enable-event --userspace zipkin:timestamp
+ lttng enable-event --userspace zipkin:keyval_integer
+ lttng enable-event --userspace zipkin:keyval_string
+ lttng start
+
+Then start up Ceph again.::
+
+ OSD=3 MON=3 RGW=1 ./vstart.sh -n
+
+You may want to check that ceph is up.::
+
+ ./ceph status
+
+Now put something in using rados, check that it made it, get it back, and remove it.::
+
+ ./ceph osd pool create test-blkin
+ ./rados put test-object-1 ./vstart.sh --pool=test-blkin
+ ./rados -p test-blkin ls
+ ./ceph osd map test-blkin test-object-1
+ ./rados get test-object-1 ./vstart-copy.sh --pool=test-blkin
+ md5sum vstart*
+ ./rados rm test-object-1 --pool=test-blkin
+
+You could also use the example in ``examples/librados/`` or ``rados bench``.
+
+Then stop the LTTng session and see what was collected.::
+
+ lttng stop
+ lttng view
+
+You'll see something like:::
+
+ [15:33:08.884275486] (+0.000225472) ubuntu zipkin:timestamp: { cpu_id = 53 }, { trace_name = "op", service_name = "Objecter", port_no = 0, ip = "0.0.0.0", trace_id = 5485970765435202833, span_id = 5485970765435202833, parent_span_id = 0, event = "osd op reply" }
+ [15:33:08.884614135] (+0.000002839) ubuntu zipkin:keyval_integer: { cpu_id = 10 }, { trace_name = "", service_name = "Messenger", port_no = 6805, ip = "0.0.0.0", trace_id = 7381732770245808782, span_id = 7387710183742669839, parent_span_id = 1205040135881905799, key = "tid", val = 2 }
+ [15:33:08.884616431] (+0.000002296) ubuntu zipkin:keyval_string: { cpu_id = 10 }, { trace_name = "", service_name = "Messenger", port_no = 6805, ip = "0.0.0.0", trace_id = 7381732770245808782, span_id = 7387710183742669839, parent_span_id = 1205040135881905799, key = "entity type", val = "client" }
+
+
+Install Zipkin
+===============
+One of the points of using Blkin is so that you can look at the traces
+using Zipkin. Users should run Zipkin as a tracepoints collector and
+also a web service. The executable jar runs a collector on port 9410 and
+the web interface on port 9411
+
+Download Zipkin Package::
+
+ git clone https://github.com/openzipkin/zipkin && cd zipkin
+ wget -O zipkin.jar 'https://search.maven.org/remote_content?g=io.zipkin.java&a=zipkin-server&v=LATEST&c=exec'
+ java -jar zipkin.jar
+
+
+Show Ceph's Blkin Traces in Zipkin-web
+======================================
+Download babeltrace-zipkin project. This project takes the traces
+generated with blkin and sends them to a Zipkin collector using scribe::
+
+ git clone https://github.com/vears91/babeltrace-zipkin
+ cd babeltrace-zipkin
+
+Send lttng data to Zipkin::
+
+ python3 babeltrace_zipkin.py ${lttng-traces-dir}/${blkin-test}/ust/uid/0/64-bit/ -p ${zipkin-collector-port(9410 by default)} -s ${zipkin-collector-ip}
+
+Example::
+
+ python3 babeltrace_zipkin.py ~/lttng-traces-dir/blkin-test-20150225-160222/ust/uid/0/64-bit/ -p 9410 -s 127.0.0.1
+
+Check Ceph traces on webpage::
+
+ Browse http://${zipkin-collector-ip}:9411
+ Click "Find traces"
diff --git a/doc/dev/bluestore.rst b/doc/dev/bluestore.rst
new file mode 100644
index 000000000..91d71d037
--- /dev/null
+++ b/doc/dev/bluestore.rst
@@ -0,0 +1,85 @@
+===================
+BlueStore Internals
+===================
+
+
+Small write strategies
+----------------------
+
+* *U*: Uncompressed write of a complete, new blob.
+
+ - write to new blob
+ - kv commit
+
+* *P*: Uncompressed partial write to unused region of an existing
+ blob.
+
+ - write to unused chunk(s) of existing blob
+ - kv commit
+
+* *W*: WAL overwrite: commit intent to overwrite, then overwrite
+ async. Must be chunk_size = MAX(block_size, csum_block_size)
+ aligned.
+
+ - kv commit
+ - wal overwrite (chunk-aligned) of existing blob
+
+* *N*: Uncompressed partial write to a new blob. Initially sparsely
+ utilized. Future writes will either be *P* or *W*.
+
+ - write into a new (sparse) blob
+ - kv commit
+
+* *R+W*: Read partial chunk, then to WAL overwrite.
+
+ - read (out to chunk boundaries)
+ - kv commit
+ - wal overwrite (chunk-aligned) of existing blob
+
+* *C*: Compress data, write to new blob.
+
+ - compress and write to new blob
+ - kv commit
+
+Possible future modes
+---------------------
+
+* *F*: Fragment lextent space by writing small piece of data into a
+ piecemeal blob (that collects random, noncontiguous bits of data we
+ need to write).
+
+ - write to a piecemeal blob (min_alloc_size or larger, but we use just one block of it)
+ - kv commit
+
+* *X*: WAL read/modify/write on a single block (like legacy
+ bluestore). No checksum.
+
+ - kv commit
+ - wal read/modify/write
+
+Mapping
+-------
+
+This very roughly maps the type of write onto what we do when we
+encounter a given blob. In practice it's a bit more complicated since there
+might be several blobs to consider (e.g., we might be able to *W* into one or
+*P* into another), but it should communicate a rough idea of strategy.
+
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| | raw | raw (cached) | csum (4 KB) | csum (16 KB) | comp (128 KB) |
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 128+ KB (over)write | U | U | U | U | C |
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 64 KB (over)write | U | U | U | U | U or C |
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 4 KB overwrite | W | P | W | P | W | P | R+W | P | N (F?) |
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 100 byte overwrite | R+W | P | W | P | R+W | P | R+W | P | N (F?) |
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 100 byte append | R+W | P | W | P | R+W | P | R+W | P | N (F?) |
++--------------------------+--------+--------------+-------------+--------------+---------------+
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 4 KB clone overwrite | P | N | P | N | P | N | P | N | N (F?) |
++--------------------------+--------+--------------+-------------+--------------+---------------+
+| 100 byte clone overwrite | P | N | P | N | P | N | P | N | N (F?) |
++--------------------------+--------+--------------+-------------+--------------+---------------+
diff --git a/doc/dev/cache-pool.rst b/doc/dev/cache-pool.rst
new file mode 100644
index 000000000..7dc71c828
--- /dev/null
+++ b/doc/dev/cache-pool.rst
@@ -0,0 +1,200 @@
+Cache pool
+==========
+
+Purpose
+-------
+
+Use a pool of fast storage devices (probably SSDs) and use it as a
+cache for an existing slower and larger pool.
+
+Use a replicated pool as a front-end to service most I/O, and destage
+cold data to a separate erasure coded pool that does not currently (and
+cannot efficiently) handle the workload.
+
+We should be able to create and add a cache pool to an existing pool
+of data, and later remove it, without disrupting service or migrating
+data around.
+
+Use cases
+---------
+
+Read-write pool, writeback
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have an existing data pool and put a fast cache pool "in front" of
+it. Writes will go to the cache pool and immediately ack. We flush
+them back to the data pool based on the defined policy.
+
+Read-only pool, weak consistency
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have an existing data pool and add one or more read-only cache
+pools. We copy data to the cache pool(s) on read. Writes are
+forwarded to the original data pool. Stale data is expired from the
+cache pools based on the defined policy.
+
+This is likely only useful for specific applications with specific
+data access patterns. It may be a match for rgw, for example.
+
+
+Interface
+---------
+
+Set up a read/write cache pool foo-hot for pool foo::
+
+ ceph osd tier add foo foo-hot
+ ceph osd tier cache-mode foo-hot writeback
+
+Direct all traffic for foo to foo-hot::
+
+ ceph osd tier set-overlay foo foo-hot
+
+Set the target size and enable the tiering agent for foo-hot::
+
+ ceph osd pool set foo-hot hit_set_type bloom
+ ceph osd pool set foo-hot hit_set_count 1
+ ceph osd pool set foo-hot hit_set_period 3600 # 1 hour
+ ceph osd pool set foo-hot target_max_bytes 1000000000000 # 1 TB
+ ceph osd pool set foo-hot min_read_recency_for_promote 1
+ ceph osd pool set foo-hot min_write_recency_for_promote 1
+
+Drain the cache in preparation for turning it off::
+
+ ceph osd tier cache-mode foo-hot forward
+ rados -p foo-hot cache-flush-evict-all
+
+When cache pool is finally empty, disable it::
+
+ ceph osd tier remove-overlay foo
+ ceph osd tier remove foo foo-hot
+
+Read-only pools with lazy consistency::
+
+ ceph osd tier add foo foo-east
+ ceph osd tier cache-mode foo-east readonly
+ ceph osd tier add foo foo-west
+ ceph osd tier cache-mode foo-west readonly
+
+
+
+Tiering agent
+-------------
+
+The tiering policy is defined as properties on the cache pool itself.
+
+HitSet metadata
+~~~~~~~~~~~~~~~
+
+First, the agent requires HitSet information to be tracked on the
+cache pool in order to determine which objects in the pool are being
+accessed. This is enabled with::
+
+ ceph osd pool set foo-hot hit_set_type bloom
+ ceph osd pool set foo-hot hit_set_count 1
+ ceph osd pool set foo-hot hit_set_period 3600 # 1 hour
+
+The supported HitSet types include 'bloom' (a bloom filter, the
+default), 'explicit_hash', and 'explicit_object'. The latter two
+explicitly enumerate accessed objects and are less memory efficient.
+They are there primarily for debugging and to demonstrate pluggability
+for the infrastructure. For the bloom filter type, you can additionally
+define the false positive probability for the bloom filter (default is 0.05)::
+
+ ceph osd pool set foo-hot hit_set_fpp 0.15
+
+The hit_set_count and hit_set_period define how much time each HitSet
+should cover, and how many such HitSets to store. Binning accesses
+over time allows Ceph to independently determine whether an object was
+accessed at least once and whether it was accessed more than once over
+some time period ("age" vs "temperature").
+
+The ``min_read_recency_for_promote`` defines how many HitSets to check for the
+existence of an object when handling a read operation. The checking result is
+used to decide whether to promote the object asynchronously. Its value should be
+between 0 and ``hit_set_count``. If it's set to 0, the object is always promoted.
+If it's set to 1, the current HitSet is checked. And if this object is in the
+current HitSet, it's promoted. Otherwise not. For the other values, the exact
+number of archive HitSets are checked. The object is promoted if the object is
+found in any of the most recent ``min_read_recency_for_promote`` HitSets.
+
+A similar parameter can be set for the write operation, which is
+``min_write_recency_for_promote``. ::
+
+ ceph osd pool set {cachepool} min_read_recency_for_promote 1
+ ceph osd pool set {cachepool} min_write_recency_for_promote 1
+
+Note that the longer the ``hit_set_period`` and the higher the
+``min_read_recency_for_promote``/``min_write_recency_for_promote`` the more RAM
+will be consumed by the ceph-osd process. In particular, when the agent is active
+to flush or evict cache objects, all hit_set_count HitSets are loaded into RAM.
+
+Cache mode
+~~~~~~~~~~
+
+The most important policy is the cache mode:
+
+ ceph osd pool set foo-hot cache-mode writeback
+
+The supported modes are 'none', 'writeback', 'forward', and
+'readonly'. Most installations want 'writeback', which will write
+into the cache tier and only later flush updates back to the base
+tier. Similarly, any object that is read will be promoted into the
+cache tier.
+
+The 'forward' mode is intended for when the cache is being disabled
+and needs to be drained. No new objects will be promoted or written
+to the cache pool unless they are already present. A background
+operation can then do something like::
+
+ rados -p foo-hot cache-try-flush-evict-all
+ rados -p foo-hot cache-flush-evict-all
+
+to force all data to be flushed back to the base tier.
+
+The 'readonly' mode is intended for read-only workloads that do not
+require consistency to be enforced by the storage system. Writes will
+be forwarded to the base tier, but objects that are read will get
+promoted to the cache. No attempt is made by Ceph to ensure that the
+contents of the cache tier(s) are consistent in the presence of object
+updates.
+
+Cache sizing
+~~~~~~~~~~~~
+
+The agent performs two basic functions: flushing (writing 'dirty'
+cache objects back to the base tier) and evicting (removing cold and
+clean objects from the cache).
+
+The thresholds at which Ceph will flush or evict objects is specified
+relative to a 'target size' of the pool. For example::
+
+ ceph osd pool set foo-hot cache_target_dirty_ratio .4
+ ceph osd pool set foo-hot cache_target_dirty_high_ratio .6
+ ceph osd pool set foo-hot cache_target_full_ratio .8
+
+will begin flushing dirty objects when 40% of the pool is dirty and begin
+evicting clean objects when we reach 80% of the target size.
+
+The target size can be specified either in terms of objects or bytes::
+
+ ceph osd pool set foo-hot target_max_bytes 1000000000000 # 1 TB
+ ceph osd pool set foo-hot target_max_objects 1000000 # 1 million objects
+
+Note that if both limits are specified, Ceph will begin flushing or
+evicting when either threshold is triggered.
+
+Other tunables
+~~~~~~~~~~~~~~
+
+You can specify a minimum object age before a recently updated object is
+flushed to the base tier::
+
+ ceph osd pool set foo-hot cache_min_flush_age 600 # 10 minutes
+
+You can specify the minimum age of an object before it will be evicted from
+the cache tier::
+
+ ceph osd pool set foo-hot cache_min_evict_age 1800 # 30 minutes
+
+
+
diff --git a/doc/dev/ceph-volume/index.rst b/doc/dev/ceph-volume/index.rst
new file mode 100644
index 000000000..5feef8089
--- /dev/null
+++ b/doc/dev/ceph-volume/index.rst
@@ -0,0 +1,14 @@
+===================================
+ceph-volume developer documentation
+===================================
+
+.. rubric:: Contents
+
+.. toctree::
+ :maxdepth: 1
+
+
+ plugins
+ lvm
+ zfs
+ systemd
diff --git a/doc/dev/ceph-volume/lvm.rst b/doc/dev/ceph-volume/lvm.rst
new file mode 100644
index 000000000..f2df6d850
--- /dev/null
+++ b/doc/dev/ceph-volume/lvm.rst
@@ -0,0 +1,179 @@
+
+.. _ceph-volume-lvm-api:
+
+LVM
+===
+The backend of ``ceph-volume lvm`` is LVM, it relies heavily on the usage of
+tags, which is a way for LVM to allow extending its volume metadata. These
+values can later be queried against devices and it is how they get discovered
+later.
+
+.. warning:: These APIs are not meant to be public, but are documented so that
+ it is clear what the tool is doing behind the scenes. Do not alter
+ any of these values.
+
+
+.. _ceph-volume-lvm-tag-api:
+
+Tag API
+-------
+The process of identifying logical volumes as part of Ceph relies on applying
+tags on all volumes. It follows a naming convention for the namespace that
+looks like::
+
+ ceph.<tag name>=<tag value>
+
+All tags are prefixed by the ``ceph`` keyword to claim ownership of that
+namespace and make it easily identifiable. This is how the OSD ID would be used
+in the context of lvm tags::
+
+ ceph.osd_id=0
+
+
+.. _ceph-volume-lvm-tags:
+
+Metadata
+--------
+The following describes all the metadata from Ceph OSDs that is stored on an
+LVM volume:
+
+
+``type``
+--------
+Describes if the device is an OSD or Journal, with the ability to expand to
+other types when supported (for example a lockbox)
+
+Example::
+
+ ceph.type=osd
+
+
+``cluster_fsid``
+----------------
+Example::
+
+ ceph.cluster_fsid=7146B649-AE00-4157-9F5D-1DBFF1D52C26
+
+
+``data_device``
+---------------
+Example::
+
+ ceph.data_device=/dev/ceph/data-0
+
+
+``data_uuid``
+-------------
+Example::
+
+ ceph.data_uuid=B76418EB-0024-401C-8955-AE6919D45CC3
+
+
+``journal_device``
+------------------
+Example::
+
+ ceph.journal_device=/dev/ceph/journal-0
+
+
+``journal_uuid``
+----------------
+Example::
+
+ ceph.journal_uuid=2070E121-C544-4F40-9571-0B7F35C6CB2B
+
+
+``encrypted``
+-------------
+Example for enabled encryption with ``luks``::
+
+ ceph.encrypted=1
+
+When encryption is not supported or simply disabled::
+
+ ceph.encrypted=0
+
+
+``osd_fsid``
+------------
+Example::
+
+ ceph.osd_fsid=88ab9018-f84b-4d62-90b4-ce7c076728ff
+
+
+``osd_id``
+----------
+Example::
+
+ ceph.osd_id=1
+
+
+``block_device``
+----------------
+Just used on :term:`bluestore` backends. Captures the path to the logical
+volume path.
+
+Example::
+
+ ceph.block_device=/dev/mapper/vg-block-0
+
+
+``block_uuid``
+--------------
+Just used on :term:`bluestore` backends. Captures either the logical volume UUID or
+the partition UUID.
+
+Example::
+
+ ceph.block_uuid=E5F041BB-AAD4-48A8-B3BF-31F7AFD7D73E
+
+
+``db_device``
+-------------
+Just used on :term:`bluestore` backends. Captures the path to the logical
+volume path.
+
+Example::
+
+ ceph.db_device=/dev/mapper/vg-db-0
+
+
+``db_uuid``
+-----------
+Just used on :term:`bluestore` backends. Captures either the logical volume UUID or
+the partition UUID.
+
+Example::
+
+ ceph.db_uuid=F9D02CF1-31AB-4910-90A3-6A6302375525
+
+
+``wal_device``
+--------------
+Just used on :term:`bluestore` backends. Captures the path to the logical
+volume path.
+
+Example::
+
+ ceph.wal_device=/dev/mapper/vg-wal-0
+
+
+``wal_uuid``
+------------
+Just used on :term:`bluestore` backends. Captures either the logical volume UUID or
+the partition UUID.
+
+Example::
+
+ ceph.wal_uuid=A58D1C68-0D6E-4CB3-8E99-B261AD47CC39
+
+
+``vdo``
+-------
+A VDO-enabled device is detected when device is getting prepared, and then
+stored for later checks when activating. This affects mount options by
+appending the ``discard`` mount flag, regardless of mount flags being used.
+
+Example for an enabled VDO device::
+
+ ceph.vdo=1
diff --git a/doc/dev/ceph-volume/plugins.rst b/doc/dev/ceph-volume/plugins.rst
new file mode 100644
index 000000000..95bc761e2
--- /dev/null
+++ b/doc/dev/ceph-volume/plugins.rst
@@ -0,0 +1,65 @@
+.. _ceph-volume-plugins:
+
+Plugins
+=======
+``ceph-volume`` started initially to provide support for using ``lvm`` as
+the underlying system for an OSD. It is included as part of the tool but it is
+treated like a plugin.
+
+This modularity, allows for other device or device-like technologies to be able
+to consume and re-use the utilities and workflows provided.
+
+Adding Plugins
+--------------
+As a Python tool, plugins ``setuptools`` entry points. For a new plugin to be
+available, it should have an entry similar to this in its ``setup.py`` file:
+
+.. code-block:: python
+
+ setup(
+ ...
+ entry_points = dict(
+ ceph_volume_handlers = [
+ 'my_command = my_package.my_module:MyClass',
+ ],
+ ),
+
+The ``MyClass`` should be a class that accepts ``sys.argv`` as its argument,
+``ceph-volume`` will pass that in at instantiation and call them ``main``
+method.
+
+This is how a plugin for ``ZFS`` could look like for example:
+
+.. code-block:: python
+
+ class ZFS(object):
+
+ help_menu = 'Deploy OSDs with ZFS'
+ _help = """
+ Use ZFS as the underlying technology for OSDs
+
+ --verbose Increase the verbosity level
+ """
+
+ def __init__(self, argv):
+ self.argv = argv
+
+ def main(self):
+ parser = argparse.ArgumentParser()
+ args = parser.parse_args(self.argv)
+ ...
+
+And its entry point (via ``setuptools``) in ``setup.py`` would looke like:
+
+.. code-block:: python
+
+ entry_points = {
+ 'ceph_volume_handlers': [
+ 'zfs = ceph_volume_zfs.zfs:ZFS',
+ ],
+ },
+
+After installation, the ``zfs`` subcommand would be listed and could be used
+as::
+
+ ceph-volume zfs
diff --git a/doc/dev/ceph-volume/systemd.rst b/doc/dev/ceph-volume/systemd.rst
new file mode 100644
index 000000000..8553430ee
--- /dev/null
+++ b/doc/dev/ceph-volume/systemd.rst
@@ -0,0 +1,37 @@
+.. _ceph-volume-systemd-api:
+
+systemd
+=======
+The workflow to *"activate"* an OSD is by relying on systemd unit files and its
+ability to persist information as a suffix to the instance name.
+
+``ceph-volume`` exposes the following convention for unit files::
+
+ ceph-volume@<sub command>-<extra metadata>
+
+For example, this is how enabling an OSD could look like for the
+:ref:`ceph-volume-lvm` sub command::
+
+ systemctl enable ceph-volume@lvm-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
+
+
+These 3 pieces of persisted information are needed by the sub-command so that
+it understands what OSD it needs to activate.
+
+Since ``lvm`` is not the only subcommand that will be supported, this
+is how it will allow other device types to be defined.
+
+At some point for example, for plain disks, it could be::
+
+ systemctl enable ceph-volume@disk-0-8715BEB4-15C5-49DE-BA6F-401086EC7B41
+
+At startup, the systemd unit will execute a helper script that will parse the
+suffix and will end up calling ``ceph-volume`` back. Using the previous
+example for lvm, that call will look like::
+
+ ceph-volume lvm activate 0 8715BEB4-15C5-49DE-BA6F-401086EC7B41
+
+
+.. warning:: These workflows are not meant to be public, but are documented so that
+ it is clear what the tool is doing behind the scenes. Do not alter
+ any of these values.
diff --git a/doc/dev/ceph-volume/zfs.rst b/doc/dev/ceph-volume/zfs.rst
new file mode 100644
index 000000000..18de7652a
--- /dev/null
+++ b/doc/dev/ceph-volume/zfs.rst
@@ -0,0 +1,176 @@
+
+.. _ceph-volume-zfs-api:
+
+ZFS
+===
+The backend of ``ceph-volume zfs`` is ZFS, it relies heavily on the usage of
+tags, which is a way for ZFS to allow extending its volume metadata. These
+values can later be queried against devices and it is how they get discovered
+later.
+
+Currently this interface is only usable when running on FreeBSD.
+
+.. warning:: These APIs are not meant to be public, but are documented so that
+ it is clear what the tool is doing behind the scenes. Do not alter
+ any of these values.
+
+
+.. _ceph-volume-zfs-tag-api:
+
+Tag API
+-------
+The process of identifying filesystems, volumes and pools as part of Ceph relies
+on applying tags on all volumes. It follows a naming convention for the
+namespace that looks like::
+
+ ceph.<tag name>=<tag value>
+
+All tags are prefixed by the ``ceph`` keyword to claim ownership of that
+namespace and make it easily identifiable. This is how the OSD ID would be used
+in the context of zfs tags::
+
+ ceph.osd_id=0
+
+Tags on filesystems are stored as property.
+Tags on a zpool are stored in the comment property as a concatenated list
+separated by ``;``
+
+.. _ceph-volume-zfs-tags:
+
+Metadata
+--------
+The following describes all the metadata from Ceph OSDs that is stored on a
+ZFS filesystem, volume, pool:
+
+
+``type``
+--------
+Describes if the device is an OSD or Journal, with the ability to expand to
+other types when supported
+
+Example::
+
+ ceph.type=osd
+
+
+``cluster_fsid``
+----------------
+Example::
+
+ ceph.cluster_fsid=7146B649-AE00-4157-9F5D-1DBFF1D52C26
+
+
+``data_device``
+---------------
+Example::
+
+ ceph.data_device=/dev/ceph/data-0
+
+
+``data_uuid``
+-------------
+Example::
+
+ ceph.data_uuid=B76418EB-0024-401C-8955-AE6919D45CC3
+
+
+``journal_device``
+------------------
+Example::
+
+ ceph.journal_device=/dev/ceph/journal-0
+
+
+``journal_uuid``
+----------------
+Example::
+
+ ceph.journal_uuid=2070E121-C544-4F40-9571-0B7F35C6CB2B
+
+
+``osd_fsid``
+------------
+Example::
+
+ ceph.osd_fsid=88ab9018-f84b-4d62-90b4-ce7c076728ff
+
+
+``osd_id``
+----------
+Example::
+
+ ceph.osd_id=1
+
+
+``block_device``
+----------------
+Just used on :term:`bluestore` backends. Captures the path to the logical
+volume path.
+
+Example::
+
+ ceph.block_device=/dev/gpt/block-0
+
+
+``block_uuid``
+--------------
+Just used on :term:`bluestore` backends. Captures either the logical volume UUID or
+the partition UUID.
+
+Example::
+
+ ceph.block_uuid=E5F041BB-AAD4-48A8-B3BF-31F7AFD7D73E
+
+
+``db_device``
+-------------
+Just used on :term:`bluestore` backends. Captures the path to the logical
+volume path.
+
+Example::
+
+ ceph.db_device=/dev/gpt/db-0
+
+
+``db_uuid``
+-----------
+Just used on :term:`bluestore` backends. Captures either the logical volume UUID or
+the partition UUID.
+
+Example::
+
+ ceph.db_uuid=F9D02CF1-31AB-4910-90A3-6A6302375525
+
+
+``wal_device``
+--------------
+Just used on :term:`bluestore` backends. Captures the path to the logical
+volume path.
+
+Example::
+
+ ceph.wal_device=/dev/gpt/wal-0
+
+
+``wal_uuid``
+------------
+Just used on :term:`bluestore` backends. Captures either the logical volume UUID or
+the partition UUID.
+
+Example::
+
+ ceph.wal_uuid=A58D1C68-0D6E-4CB3-8E99-B261AD47CC39
+
+
+``compression``
+---------------
+A compression-enabled device can always be set using the native zfs settings on
+a volume or filesystem. This will/can be activated during creation of the volume
+of filesystem.
+When activated by ``ceph-volume zfs`` this tag will be created.
+Compression manually set AFTER ``ceph-volume`` will go unnoticed, unless this
+tag is also manually set.
+
+Example for an enabled compression device::
+
+ ceph.vdo=1
diff --git a/doc/dev/ceph_krb_auth.rst b/doc/dev/ceph_krb_auth.rst
new file mode 100644
index 000000000..627b9bd8e
--- /dev/null
+++ b/doc/dev/ceph_krb_auth.rst
@@ -0,0 +1,1094 @@
+===============================================================================
+A Detailed Documentation on How to Set up Ceph Kerberos Authentication
+===============================================================================
+
+This document provides details on the Kerberos authorization protocol. This is
+the 1st draft and we will try to keep it updated along with code changes that
+might take place.
+
+Several free implementations of this protocol are available (MIT, Heimdal,
+MS...), covering a wide range of operating systems. The Massachusetts
+Institute of Technology (MIT), where Kerberos was originally developed,
+continues to develop their Kerberos package and it is the implementation we
+chose to work with. `MIT Kerberos <http://web.mit.edu/Kerberos/>`_.
+
+Please, provide feedback to Daniel Oliveira (doliveira@suse.com)
+
+*Last update: Dec 3, 2018*
+
+|
+
+Background
+----------
+
+Before we get into *Kerberos details*, let us define a few terms so we can
+understand what to expect from it, *what it can and can't do*:
+
+Directory Services
+ A directory service is a customizable information store that functions as
+ a single point from which users can locate resources and services
+ distributed throughout the network. This customizable information store
+ also gives administrators a single point for managing its objects and their
+ attributes. Although this information store appears as a single point to
+ the users of the network, it is actually most often stored in a distributed
+ form. A directory service consists of at least one *Directory Server and a
+ Directory Client* and are implemented based on *X.500 standards*.
+
+ *OpenLDAP, 389 Directory Server, MS Active Directory, NetIQ eDirectory* are
+ some good examples.
+
+ A directory service is often characterized as a *write-once-read-many-times
+ service*, meaning the data that would normally be stored in an directory
+ service would not be expected to change on every access.
+
+ The database that forms a directory service *is not designed for
+ transactional data*.
+
+|
+
+LDAP (Lightweight Directory Access Protocol v3)
+ LDAP is a set of LDAP Protocol Exchanges *(not an implementation of a
+ server)* that defines the method by which data is accessed. LDAPv3 is a
+ standard defined by the IETF in RFC 2251 and describes how data is
+ represented in the Directory Service (the Data Model or DIT).
+
+ Finally, it defines how data is loaded into (imported) and saved from
+ (exported) a directory service (using LDIF). LDAP does not define how data
+ is stored or manipulated. Data Store is an 'automagic' process as far as
+ the standard is concerned and is generally handled by back-end modules.
+
+ No Directory Service implementation has all the features of LDAP v3
+ protocol implemented. All Directory Server implementations have their
+ different problems and/or anomalies, and features that may not return
+ results as another Directory Server implementation would.
+
+|
+
+Authentication
+ Authentication is about validating credentials (like User Name/ID and
+ password) to verify the identity. The system determines whether one is what
+ they say they are using their credentials.
+
+ Usually, authentication is done by a username and password, and sometimes
+ in conjunction with *(single, two, or multi) factors of authentication*,
+ which refers to the various ways to be authenticated.
+
+|
+
+Authorization
+ Authorization occurs after the identity is successfully authenticated by
+ the system, which ultimately gives one full permission to access the
+ resources such as information, files, databases, and so forth, almost
+ anything. It determines the ability to access the system and up to what
+ extent (what kind of permissions/rights are given and to where/what).
+
+|
+
+Auditing
+ Auditing takes the results from both *authentication and authorization* and
+ records them into an audit log. The audit log records records all actions
+ taking by/during the authentication and authorization for later review by
+ the administrators. While authentication and authorization are preventive
+ systems (in which unauthorized access is prevented), auditing is a reactive
+ system (in which it gives detailed log of how/when/where someone accessed
+ the environment).
+
+|
+
+Kerberos (KRB v5)
+ Kerberos is a network *authentication protocol*. It is designed to provide
+ strong authentication for client/server applications by using secret-key
+ cryptography (symmetric key). A free implementation of this protocol is
+ available from the MIT. However, Kerberos is available in many commercial
+ products as well.
+
+ It was designed to provide secure authentication to services over an
+ insecure network. Kerberos uses tickets to authenticate a user, or service
+ application and never transmits passwords over the network in the clear.
+ So both client and server can prove their identity without sending any
+ unencrypted secrets over the network.
+
+ Kerberos can be used for single sign-on (SSO). The idea behind SSO is
+ simple, we want to login just once and be able to use any service that we
+ are entitled to, without having to login on each of those services.
+
+|
+
+Simple Authentication and Security Layer (SASL)
+ SASL **(RFC 4422)** is a framework that helps developers to implement
+ different authentication mechanisms (implementing a series of challenges
+ and responses), allowing both clients and servers to negotiate a mutually
+ acceptable mechanism for each connection, instead of hard-coding them.
+
+ Examples of SASL mechanisms:
+
+ * ANONYMOUS **(RFC 4505)**
+
+ - For guest access, meaning *unauthenticated*
+
+ * CRAM-MD5 **(RFC 2195)**
+
+ - Simple challenge-response scheme based on *HMAC-MD5*.
+ It does not establish any security layer. *Less secure than
+ DIGEST-MD5 and GSSAPI.*
+
+ * DIGEST-MD5 **(RFC 2831)**
+
+ - HTTP Digest compatible *(partially)* challenge-response scheme
+ based upon MD5, offering a *data security layer*. It is preferred
+ over PLAIN text passwords, protecting against plain text attacks.
+ It is a mandatory authentication method for LDAPv3 servers.
+
+ * EXTERNAL **(RFCs 4422, 5246, 4301, 2119)**
+
+ - Where *authentication is implicit* in the context (i.e; for
+ protocols using IPsec or TLS [TLS/SSL to performing certificate-
+ based authentication] already). This method uses public keys for
+ strong authentication.
+
+ * GS2 **(RFC 5801)**
+
+ - Family of mechanisms supports arbitrary GSS-API mechanisms in
+ SASL
+
+ * NTLM (MS Proprietary)
+
+ - MS Windows NT LAN Manager authentication mechanism
+
+ * OAuth 1.0/2.0 **(RFCs 5849, 6749, 7628)**
+
+ - Authentication protocol for delegated resource access
+
+ * OTP **(RFC 2444)**
+
+ - One-time password mechanism *(obsoletes the SKEY mechanism)*
+
+ * PLAIN **(RFC 4616)**
+
+ - Simple Cleartext password mechanism **(RFC 4616)**. This is not a
+ preferred mechanism for most applications because of its relative
+ lack of strength.
+
+ * SCRAM **(RFCs 5802, 7677)**
+
+ - Modern challenge-response scheme based mechanism with channel
+ binding support
+
+|
+
+Generic Security Services Application Program Interface (GSSAPI)
+ GSSAPI **(RFCs 2078, 2743, 2744, 4121, 4752)** is widely used by protocol
+ implementers as a way to implement Kerberos v5 support in their
+ applications. It provides a generic interface and message format that can
+ encapsulate authentication exchanges from any authentication method that
+ has a GSSAPI-compliant library.
+
+ It does not define a protocol, authentication, or security mechanism
+ itself; it instead makes it easier for application programmers to support
+ multiple authentication mechanisms by providing a uniform, generic API for
+ security services. It is a set of functions that include both an API and a
+ methodology for approaching authentication, aiming to insulate application
+ protocols from the specifics of security protocols as much as possible.
+
+ *Microsoft Windows Kerberos* implementation does not include GSSAPI support
+ but instead includes a *Microsoft-specific API*, the *Security Support
+ Provider Interface (SSPI)*. In Windows, an SSPI client can communicate with
+ a *GSSAPI server*.
+
+ *Most applications that support GSSAPI also support Kerberos v5.*
+
+|
+
+Simple and Protected GSSAPI Negotiation Mechanism (SPNEGO)
+ As we can see, GSSAPI solves the problem of providing a single API to
+ different authentication mechanisms. However, it does not solve the problem
+ of negotiating which mechanism to use. In fact for GSSAPI to work, the two
+ applications communicating with each other must know in advance what
+ authentication mechanism they plan to use, which usually is not a problem
+ if only one mechanism is supported (meaning Kerberos v5).
+
+ However, if there are multiple mechanisms to choose from, a method is
+ needed to securely negotiate an authentication mechanism that is mutually
+ supported between both client and server; which is where
+ *SPNEGO (RFC 2478, 4178)* makes a difference.
+
+ *SPNEGO* provides a framework for two parties that are engaged in
+ authentication to select from a set of possible authentication mechanisms,
+ in a manner that preserves the opaque nature of the security protocols to
+ the application protocol that uses it.
+
+ It is a security protocol that uses a *GSSAPI authentication mechanism* and
+ negotiates among several available authentication mechanisms in an
+ implementation, selecting one for use to satisfy the authentication needs
+ of the application protocol.
+
+ It is a *meta protocol* that travels entirely in other application
+ protocols; it is never used directly without an application protocol.
+
+|
+
+*Why is this important and why do we care? Like, at all?*
+
+ Having this background information in mind, we can easily describe things
+ like:
+
+ 1. *Ceph Kerberos authentication* is based totally on MIT *Kerberos*
+ implementation using *GSSAPI*.
+
+ 2. At the moment we are still using *Kerberos default backend
+ database*, however we plan on adding LDAP as a backend which would
+ provide us with *authentication with GSSAPI (KRB5)* and *authorization
+ with LDAP (LDAPv3)*, via *SASL mechanism*.
+
+|
+
+Before We Start
+---------------
+
+We assume the environment already has some external services up and running
+properly:
+
+ * Kerberos needs to be properly configured, which also means (for both
+ every server and KDC):
+
+ - Time Synchronization (either using `NTP <http://www.ntp.org/>`_ or `chrony <https://chrony.tuxfamily.org/>`_).
+
+ + Not only Kerberos, but also Ceph depends and relies on time
+ synchronization.
+
+ - DNS resolution
+
+ + Both *(forward and reverse)* zones, with *fully qualified domain
+ name (fqdn)* ``(hostname + domain.name)``
+
+ + KDC discover can be set up to use DNS ``(srv resources)`` as
+ service location protocol *(RFCs 2052, 2782)*, as well as *host
+ or domain* to the *appropriate realm* ``(txt record)``.
+
+ + Even though these DNS entries/settings are not required to run a
+ ``Kerberos realm``, they certainly help to eliminate the need for
+ manual configuration on all clients.
+
+ + This is extremely important, once most of the Kerberos issues are
+ usually related to name resolution. Kerberos is very picky when
+ checking on systems names and host lookups.
+
+ * Whenever possible, in order to avoid a *single point of failure*, set up
+ a *backup, secondary, or slave*, for every piece/part in the
+ infrastructure ``(ntp, dns, and kdc servers)``.
+
+
+Also, the following *Kerberos terminology* is important:
+
+ * Ticket
+
+ - Tickets or Credentials, are a set of information that can be used to
+ verify the client's identity. Kerberos tickets may be stored in a
+ file, or they may exist only in memory.
+
+ - The first ticket obtained is a ticket-granting ticket (TGT), which
+ allows the clients to obtain additional tickets. These additional
+ tickets give the client permission for specific services. The
+ requesting and granting of these additional tickets happens
+ transparently.
+
+ + The TGT, which expires at a specified time, permits the client to
+ obtain additional tickets, which give permission for specific
+ services. The requesting and granting of these additional tickets
+ is user-transparent.
+
+ * Key Distribution Center (KDC).
+
+ - The KDC creates a ticket-granting ticket (TGT) for the client,
+ encrypts it using the client's password as the key, and sends the
+ encrypted TGT back to the client. The client then attempts to decrypt
+ the TGT, using its password. If the client successfully decrypts the
+ TGT (i.e., if the client gave the correct password), it keeps the
+ decrypted TGT, which indicates proof of the client's identity.
+
+ - The KDC is comprised of three components:
+
+ + Kerberos database, which stores all the information about the
+ principals and the realm they belong to, among other things.
+ + Authentication service (AS)
+ + Ticket-granting service (TGS)
+
+ * Client
+
+ - Either a *user, host or a service* who sends a request for a ticket.
+
+ * Principal
+
+ - It is a unique identity to which Kerberos can assign tickets.
+ Principals can have an arbitrary number of components. Each component
+ is separated by a component separator, generally ``/``. The last
+ component is the *realm*, separated from the rest of the principal by
+ the realm separator, generally ``@``.
+
+ - If there is no realm component in the principal, then it will be
+ assumed that the principal is in the default realm for the context in
+ which it is being used.
+
+ - Usually, a principal is divided into three parts:
+
+ + The ``primary``, the ``instance``, and the ``realm``
+
+ + The format of a typical Kerberos V5 principal is
+ ``primary/instance@REALM``.
+
+ + The ``primary`` is the first part of the principal. In the case
+ of a user, it's the same as the ``username``. For a host, the
+ primary is the word ``host``. For Ceph, will use ``ceph`` as a
+ primary name which makes it easier to organize and identify Ceph
+ related principals.
+
+ + The ``instance`` is an optional string that qualifies the
+ primary. The instance is separated from the primary by a slash
+ ``/``. In the case of a user, the instance is usually ``null``,
+ but a user might also have an additional principal, with an
+ instance called ``admin``, which one uses to administrate a
+ database.
+
+ The principal ``johndoe@MYDOMAIN.COM`` is completely separate
+ from the principal ``johndoe/admin@MYDOMAIN.COM``, with a
+ separate password, and separate permissions. In the case of a
+ host, the instance is the fully qualified hostname,
+ i.e., ``osd1.MYDOMAIN.COM``.
+
+ + The ``realm`` is the Kerberos realm. Usually, the Kerberos realm
+ is the domain name, in *upper-case letters*. For example, the
+ machine ``osd1.MYDOMAIN.COM`` would be in the realm
+ ``MYDOMAIN.COM``.
+
+ * Keytab
+
+ - A keytab file stores the actual encryption key that can be used in
+ lieu of a password challenge for a given principal. Creating keytab
+ files are useful for noninteractive principals, such as *Service
+ Principal Names*, which are often associated with long-running
+ processes like Ceph daemons. A keytab file does not have to be a
+ "1:1 mapping" to a single principal. Multiple different principal
+ keys can be stored in a single keytab file:
+
+ + The keytab file allows a user/service to authenticate without
+ knowledge of the password. Due to this, *keytabs should be
+ protected* with appropriate controls to prevent unauthorized
+ users from authenticating with it.
+
+ + The default client keytab file is ``/etc/krb5.keytab``
+
+|
+
+The 'Ceph side' of the things
+------------------------------
+
+In order to configure connections (from Ceph nodes) to the KDC:
+
+1. Login to the Kerberos client (Ceph server nodes) and confirm it is properly
+ configured, by checking and editing ``/etc/krb5.conf`` file properly: ::
+
+ /etc/krb5.conf
+ [libdefaults]
+ dns_canonicalize_hostname = false
+ rdns = false
+ forwardable = true
+ dns_lookup_realm = true
+ dns_lookup_kdc = true
+ allow_weak_crypto = false
+ default_realm = MYDOMAIN.COM
+ default_ccache_name = KEYRING:persistent:%{uid}
+ [realms]
+ MYDOMAIN.COM = {
+ kdc = kerberos.mydomain.com
+ admin_server = kerberos.mydomain.com
+ ...
+ }
+ ...
+
+
+2. Login to the *KDC Server* and confirm it is properly configured to
+ authenticate to the Kerberos realm in question:
+
+ a. Kerberos related DNS RRs: ::
+
+ /var/lib/named/master/mydomain.com
+ kerberos IN A 192.168.10.21
+ kerberos-slave IN A 192.168.10.22
+ _kerberos IN TXT "MYDOMAIN.COM"
+ _kerberos._udp IN SRV 1 0 88 kerberos
+ _kerberos._tcp IN SRV 1 0 88 kerberos
+ _kerberos._udp IN SRV 20 0 88 kerberos-slave
+ _kerberos-master._udp IN SRV 0 0 88 kerberos
+ _kerberos-adm._tcp IN SRV 0 0 749 kerberos
+ _kpasswd._udp IN SRV 0 0 464 kerberos
+ ...
+
+
+ b. KDC configuration file: ::
+
+ /var/lib/kerberos/krb5kdc/kdc.conf
+ [kdcdefaults]
+ kdc_ports = 750,88
+ [realms]
+ MYDOMAIN.COM = {
+ acl_file = /var/lib/kerberos/krb5kdc/kadm5.acl
+ admin_keytab = FILE:/var/lib/kerberos/krb5kdc/kadm5.keytab
+ default_principal_flags = +postdateable +forwardable +renewable +proxiable
+ +dup-skey -preauth -hwauth +service
+ +tgt-based +allow-tickets -pwchange
+ -pwservice
+ dict_file = /var/lib/kerberos/krb5kdc/kadm5.dict
+ key_stash_file = /var/lib/kerberos/krb5kdc/.k5.MYDOMAIN.COM
+ kdc_ports = 750,88
+ max_life = 0d 10h 0m 0s
+ max_renewable_life = 7d 0h 0m 0s
+ }
+ ...
+
+
+3. Still on the KDC Server, run the Kerberos administration utility;
+ ``kadmin.local`` so we can list all the principals already created. ::
+
+ kadmin.local: listprincs
+ K/M@MYDOMAIN.COM
+ krbtgt/MYDOMAIN.COM@MYDOMAIN.COM
+ kadmin/admin@MYDOMAIN.COM
+ kadmin/changepw@MYDOMAIN.COM
+ kadmin/history@MYDOMAIN.COM
+ kadmin/kerberos.mydomain.com@MYDOMAIN.COM
+ root/admin@MYDOMAIN.COM
+ ...
+
+
+4. Add a *principal for each Ceph cluster node* we want to be authenticated by
+ Kerberos:
+
+ a. Adding principals: ::
+
+ kadmin.local: addprinc -randkey ceph/ceph-mon1
+ Principal "ceph/ceph-mon1@MYDOMAIN.COM" created.
+ kadmin.local: addprinc -randkey ceph/ceph-osd1
+ Principal "ceph/ceph-osd1@MYDOMAIN.COM" created.
+ kadmin.local: addprinc -randkey ceph/ceph-osd2
+ Principal "ceph/ceph-osd2@MYDOMAIN.COM" created.
+ kadmin.local: addprinc -randkey ceph/ceph-osd3
+ Principal "ceph/ceph-osd3@MYDOMAIN.COM" created.
+ kadmin.local: addprinc -randkey ceph/ceph-osd4
+ Principal "ceph/ceph-osd4@MYDOMAIN.COM" created.
+ kadmin.local: listprincs
+ K/M@MYDOMAIN.COM
+ krbtgt/MYDOMAIN.COM@MYDOMAIN.COM
+ kadmin/admin@MYDOMAIN.COM
+ kadmin/changepw@MYDOMAIN.COM
+ kadmin/history@MYDOMAIN.COM
+ kadmin/kerberos.mydomain.com@MYDOMAIN.COM
+ root/admin@MYDOMAIN.COM
+ ceph/ceph-mon1@MYDOMAIN.COM
+ ceph/ceph-osd1@MYDOMAIN.COM
+ ceph/ceph-osd2@MYDOMAIN.COM
+ ceph/ceph-osd3@MYDOMAIN.COM
+ ceph/ceph-osd4@MYDOMAIN.COM
+ ...
+
+
+ b. This follows the same idea if we are creating a *user principal* ::
+
+ kadmin.local: addprinc johndoe
+ WARNING: no policy specified for johndoe@MYDOMAIN.COM; defaulting to no policy
+ Enter password for principal "johndoe@MYDOMAIN.COM":
+ Re-enter password for principal "johndoe@MYDOMAIN.COM":
+ Principal "johndoe@MYDOMAIN.COM" created.
+ ...
+
+
+5. Create a *keytab file* for each Ceph cluster node:
+
+ As the default client keytab file is ``/etc/krb5.keytab``, we will want to
+ use a different file name, so we especify which *keytab file to create* and
+ which *principal to export keys* from: ::
+
+ kadmin.local: ktadd -k /etc/gss_client_mon1.ktab ceph/ceph-mon1
+ Entry for principal ceph/ceph-mon1 with kvno 2, encryption type aes256-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_mon1.ktab.
+ Entry for principal ceph/ceph-mon1 with kvno 2, encryption type aes128-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_mon1.ktab.
+ Entry for principal ceph/ceph-mon1 with kvno 2, encryption type des3-cbc-sha1 added to keytab WRFILE:/etc/gss_client_mon1.ktab.
+ Entry for principal ceph/ceph-mon1 with kvno 2, encryption type arcfour-hmac added to keytab WRFILE:/etc/gss_client_mon1.ktab.
+ kadmin.local: ktadd -k /etc/gss_client_osd1.ktab ceph/ceph-osd1
+ Entry for principal ceph/ceph-osd1 with kvno 2, encryption type aes256-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd1.ktab.
+ Entry for principal ceph/ceph-osd1 with kvno 2, encryption type aes128-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd1.ktab.
+ Entry for principal ceph/ceph-osd1 with kvno 2, encryption type des3-cbc-sha1 added to keytab WRFILE:/etc/gss_client_osd1.ktab.
+ Entry for principal ceph/ceph-osd1 with kvno 2, encryption type arcfour-hmac added to keytab WRFILE:/etc/gss_client_osd1.ktab.
+ kadmin.local: ktadd -k /etc/gss_client_osd2.ktab ceph/ceph-osd2
+ Entry for principal ceph/ceph-osd2 with kvno 2, encryption type aes256-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd2.ktab.
+ Entry for principal ceph/ceph-osd2 with kvno 2, encryption type aes128-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd2.ktab.
+ Entry for principal ceph/ceph-osd2 with kvno 2, encryption type des3-cbc-sha1 added to keytab WRFILE:/etc/gss_client_osd2.ktab.
+ Entry for principal ceph/ceph-osd2 with kvno 2, encryption type arcfour-hmac added to keytab WRFILE:/etc/gss_client_osd2.ktab.
+ kadmin.local: ktadd -k /etc/gss_client_osd3.ktab ceph/ceph-osd3
+ Entry for principal ceph/ceph-osd3 with kvno 3, encryption type aes256-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd3.ktab.
+ Entry for principal ceph/ceph-osd3 with kvno 3, encryption type aes128-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd3.ktab.
+ Entry for principal ceph/ceph-osd3 with kvno 3, encryption type des3-cbc-sha1 added to keytab WRFILE:/etc/gss_client_osd3.ktab.
+ Entry for principal ceph/ceph-osd3 with kvno 3, encryption type arcfour-hmac added to keytab WRFILE:/etc/gss_client_osd3.ktab.
+ kadmin.local: ktadd -k /etc/gss_client_osd4.ktab ceph/ceph-osd4
+ Entry for principal ceph/ceph-osd4 with kvno 4, encryption type aes256-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd4.ktab.
+ Entry for principal ceph/ceph-osd4 with kvno 4, encryption type aes128-cts-hmac-sha1-96 added to keytab WRFILE:/etc/gss_client_osd4.ktab.
+ Entry for principal ceph/ceph-osd4 with kvno 4, encryption type des3-cbc-sha1 added to keytab WRFILE:/etc/gss_client_osd4.ktab.
+ Entry for principal ceph/ceph-osd4 with kvno 4, encryption type arcfour-hmac added to keytab WRFILE:/etc/gss_client_osd4.ktab.
+
+ # ls -1 /etc/gss_client_*
+ /etc/gss_client_mon1.ktab
+ /etc/gss_client_osd1.ktab
+ /etc/gss_client_osd2.ktab
+ /etc/gss_client_osd3.ktab
+ /etc/gss_client_osd4.ktab
+
+
+ We can also check these newly created keytab client files by: ::
+
+ # klist -kte /etc/gss_client_mon1.ktab
+ Keytab name: FILE:/etc/gss_client_mon1.ktab
+ KVNO Timestamp Principal
+ ---- ------------------- ------------------------------------------------------
+ 2 10/8/2018 14:35:30 ceph/ceph-mon1@MYDOMAIN.COM (aes256-cts-hmac-sha1-96)
+ 2 10/8/2018 14:35:31 ceph/ceph-mon1@MYDOMAIN.COM (aes128-cts-hmac-sha1-96)
+ 2 10/8/2018 14:35:31 ceph/ceph-mon1@MYDOMAIN.COM (des3-cbc-sha1)
+ 2 10/8/2018 14:35:31 ceph/ceph-mon1@MYDOMAIN.COM (arcfour-hmac)
+ ...
+
+
+6. A new *set parameter* was added in Ceph, ``gss ktab client file`` which
+ points to the keytab file related to the Ceph node *(or principal)* in
+ question.
+
+ By default it points to ``/var/lib/ceph/$name/gss_client_$name.ktab``. So,
+ in the case of a Ceph server ``osd1.mydomain.com``, the location and name
+ of the keytab file should be: ``/var/lib/ceph/osd1/gss_client_osd1.ktab``
+
+ Therefore, we need to ``scp`` each of these newly created keytab files from
+ the KDC to their respective Ceph cluster nodes (i.e):
+ ``# for node in mon1 osd1 osd2 osd3 osd4; do scp /etc/gss_client_$node*.ktab root@ceph-$node:/var/lib/ceph/$node/; done``
+
+ Or whatever other way one feels comfortable with, as long as each keytab
+ client file gets copied over to the proper location.
+
+ At this point, even *without using any keytab client file* we should be
+ already able to authenticate a *user principal*: ::
+
+ # kdestroy -A && kinit -f johndoe && klist -f
+ Password for johndoe@MYDOMAIN.COM:
+ Ticket cache: KEYRING:persistent:0:0
+ Default principal: johndoe@MYDOMAIN.COM
+
+ Valid starting Expires Service principal
+ 10/10/2018 15:32:01 10/11/2018 07:32:01 krbtgt/MYDOMAIN.COM@MYDOMAIN.COM
+ renew until 10/11/2018 15:32:01, Flags: FRI
+ ...
+
+
+ Given that the *keytab client file* is/should already be copied and available at the
+ Kerberos client (Ceph cluster node), we should be able to athenticate using it before
+ going forward: ::
+
+ # kdestroy -A && kinit -k -t /etc/gss_client_mon1.ktab -f 'ceph/ceph-mon1@MYDOMAIN.COM' && klist -f
+ Ticket cache: KEYRING:persistent:0:0
+ Default principal: ceph/ceph-mon1@MYDOMAIN.COM
+
+ Valid starting Expires Service principal
+ 10/10/2018 15:54:25 10/11/2018 07:54:25 krbtgt/MYDOMAIN.COM@MYDOMAIN.COM
+ renew until 10/11/2018 15:54:25, Flags: FRI
+ ...
+
+
+7. The default client keytab is used, if it is present and readable, to
+ automatically obtain initial credentials for GSSAPI client applications. The
+ principal name of the first entry in the client keytab is used by default
+ when obtaining initial credentials:
+
+ a. The ``KRB5_CLIENT_KTNAME environment`` variable.
+ b. The ``default_client_keytab_name`` profile variable in ``[libdefaults]``.
+ c. The hardcoded default, ``DEFCKTNAME``.
+
+ So, what we do is to internally, set the environment variable
+ ``KRB5_CLIENT_KTNAME`` to the same location as ``gss_ktab_client_file``,
+ so ``/var/lib/ceph/osd1/gss_client_osd1.ktab``, and change the ``ceph.conf``
+ file to add the new authentication method. ::
+
+ /etc/ceph/ceph.conf
+ [global]
+ ...
+ auth cluster required = gss
+ auth service required = gss
+ auth client required = gss
+ gss ktab client file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
+ ...
+
+
+8. With that the GSSAPIs will then be able to read the keytab file and using
+ the process of name and service resolution *(provided by the DNS)*, able to
+ request a *TGT* as follows:
+
+ a. User/Client sends principal identity and credentials to the KDC Server
+ (TGT request).
+ b. KDC checks its internal database for the principal in question.
+ c. a TGT is created and wrapped by the KDC, using the principal's key
+ (TGT + Key).
+ d. The newly created TGT, is decrypted and stored in the credentials
+ cache.
+ e. At this point, Kerberos/GSSAPI aware applications (and/or services) are
+ able to check the list of active TGT in the keytab file.
+
+|
+|
+
+** *For Ceph Developers Only* **
+=================================
+
+We certainly could have used straight native ``KRB5 APIs`` (instead of
+``GSSAPIs``), but we wanted a more portable option as regards network security,
+which is the hallmark of the ``GSS`` *(Generic Security Standard)* ``-API``.
+It does not actually provide security services itself.
+
+Rather, it is a framework that provides security services to callers in a
+generic way. ::
+
+ +---------------------------------+
+ | Application |
+ +---------------------------------+
+ | Protocol (RPC, Etc. [Optional]) |
+ +---------------------------------+
+ | GSS-API |
+ +---------------------------------+
+ | Security Mechs (Krb v5, Etc) |
+ +---------------------------------+
+
+
+The GSS-API does two main things:
+
+ 1. It creates a security context in which data can be passed between
+ applications. A context can be thought of as a sort of *"state of trust"*
+ between two applications.
+
+ Applications that share a context know who each other are and thus can
+ permit data transfers between them as long as the context lasts.
+
+ 2. It applies one or more types of protection, known as *"security services"*,
+ to the data to be transmitted.
+
+
+GSS-API provides several types of portability for applications:
+
+ a. **Mechanism independence.** GSS-API provides a generic interface to the
+ mechanisms for which it has been implemented. By specifying a default
+ security mechanism, an application does not need to know which mechanism
+ it is using (for example, Kerberos v5), or even what type of mechanism
+ it uses. As an example, when an application forwards a user's credential
+ to a server, it does not need to know if that credential has a Kerberos
+ format or the format used by some other mechanism, nor how the
+ credentials are stored by the mechanism and accessed by the application.
+ (If necessary, an application can specify a particular mechanism to use)
+
+ b. **Protocol independence.** The GSS-API is independent of any
+ communications protocol or protocol suite. It can be used with
+ applications that use, for example, sockets, RCP, or TCP/IP.
+ RPCSEC_GSS "RPCSEC_GSS Layer" is an additional layer that smoothly
+ integrates GSS-API with RPC.
+
+ c. **Platform independence.** The GSS-API is completely oblivious to the
+ type of operating system on which an application is running.
+
+ d. **Quality of Protection independence.** Quality of Protection (QOP) is
+ the name given to the type of algorithm used in encrypting data or
+ generating cryptographic tags; the GSS-API allows a programmer to ignore
+ QOP, using a default provided by the GSS-API.
+ (On the other hand, an application can specify the QOP if necessary.)
+
+ The basic security offered by the GSS-API is authentication. Authentication
+ is the verification of an identity: *if you are authenticated, it means
+ that you are recognized to be who you say you are.*
+
+ The GSS-API provides for two additional security services, if supported by the
+ underlying mechanisms:
+
+ 1. **Integrity:** It's not always sufficient to know that an application
+ sending you data is who it claims to be. The data itself could have
+ become corrupted or compromised.
+
+ The GSS-API provides for data to be accompanied by a cryptographic tag,
+ known as an ``Message Integrity Code (MIC)``, to prove that the data
+ that arrives at your doorstep is the same as the data that the sender
+ transmitted. This verification of the data's validity is known as
+ *"integrity"*.
+
+ 2. **Confidentiality:** Both authentication and integrity, however, leave
+ the data itself alone, so if it's somehow intercepted, others can read
+ it.
+
+ The GSS-API therefore allows data to be encrypted, if underlying
+ mechanisms support it. This encryption of data is known as *"confidentiality"*.
+
+|
+
+Mechanisms Available With GSS-API:
+
+ The current implementation of the GSS-API works only with the Kerberos v5 security
+ mechanism. ::
+
+ Mechanism Name Object Identifier Shared Library Kernel Module
+ ---------------------- ---------------------- -------------- --------------
+ diffie_hellman_640_0 1.3.6.4.1.42.2.26.2.4 dh640-0.so.1
+ diffie_hellman_1024_0 1.3.6.4.1.42.2.26.2.5 dh1024-0.so.1
+ SPNEGO 1.3.6.1.5.5.2
+ iakerb 1.3.6.1.5.2.5
+ SCRAM-SHA-1 1.3.6.1.5.5.14
+ SCRAM-SHA-256 1.3.6.1.5.5.18
+ GSS-EAP (arc) 1.3.6.1.5.5.15.1.1.*
+ kerberos_v5 1.2.840.113554.1.2.2 gl/mech_krb5.so gl_kmech_krb5
+
+ Therefore:
+ Kerberos Version 5 GSS-API Mechanism
+ OID {1.2.840.113554.1.2.2}
+
+ Kerberos Version 5 GSS-API Mechanism
+ Simple and Protected GSS-API Negotiation Mechanism
+ OID {1.3.6.1.5.5.2}
+
+
+ There are two different formats:
+
+ 1. The first, ``{ 1 2 3 4 }``, is officially mandated by the GSS-API
+ specs. ``gss_str_to_oid()`` expects this first format.
+
+ 2. The second, ``1.2.3.4``, is more widely used but is not an official
+ standard format.
+
+ Although the GSS-API makes protecting data simple, it does not do certain
+ things, in order to maximize its generic nature. These include:
+
+ a. Provide security credentials for a user or application. These must
+ be provided by the underlying security mechanism(s). The GSS-API
+ does allow applications to acquire credentials, either automatically
+ or explicitly.
+
+ b. Transfer data between applications. It is the application's
+ responsibility to handle the transfer of all data between peers,
+ whether it is security-related or "plain" data.
+
+ c. Distinguish between different types of transmitted data (for
+ example, to know or determine that a data packet is plain data and
+ not GSS-API related).
+
+ d. Indicate status due to remote (asynchronous) errors.
+
+ e. Automatically protect information sent between processes of a
+ multiprocess program.
+
+ f. Allocate string buffers ("Strings and Similar Data") to be passed to
+ GSS-API functions.
+
+ g. Deallocate GSS-API data spaces. These must be explicitly deallocated
+ with functions such as ``gss_release_buffer()`` and
+ ``gss_delete_name()``.
+
+|
+
+These are the basic steps in using the GSS-API:
+
+ 1. Each application, sender and recipient, acquires credentials explicitly,
+ if credentials have not been acquired automatically.
+
+ 2. The sender initiates a security context and the recipient accepts it.
+
+ 3. The sender applies security protection to the message (data) it wants to
+ transmit. This means that it either encrypts the message or stamps it
+ with an identification tag. The sender transmits the protected message.
+ (The sender can choose not to apply either security protection, in which
+ case the message has only the default GSS-API security service
+ associated with it. That is authentication, in which the recipient knows
+ that the sender is who it claims to be.)
+
+ 4. The recipient decrypts the message (if needed) and verifies it
+ (if appropriate).
+
+ 5. (Optional) The recipient returns an identification tag to the sender for
+ confirmation.
+
+ 6. Both applications destroy the shared security context. If necessary,
+ they can also deallocate any *"leftover"* GSS-API data.
+
+ Applications that use the GSS-API should include the file ``gssapi.h``.
+
+ Good References:
+ - `rfc1964 <https://tools.ietf.org/html/rfc1964>`_.
+ - `rfc2743 <https://tools.ietf.org/html/rfc2743>`_.
+ - `rfc2744 <https://tools.ietf.org/html/rfc2744>`_.
+ - `rfc4178 <https://tools.ietf.org/html/rfc4178>`_.
+ - `rfc6649 <https://tools.ietf.org/html/rfc6649>`_.
+ - `MIT Kerberos Documentation <https://web.mit.edu/kerberos/krb5-latest/doc/appdev/gssapi.html>`_.
+
+|
+
+** *Kerberos Server Setup* **
+------------------------------
+
+First and foremost, ``this is not a recommendation for a production
+environment``. We are not covering ``Master/Slave replication cluster`` or
+anything production environment related (*ntp/chrony, dns, pam/nss, sssd, etc*).
+
+Also, on the server side there might be different dependencies and/or
+configuration steps needed, depending on which backend database will be used.
+``LDAP as a backend database`` is a good example of that.
+
+On the client side there are different steps depending on which client backend
+configuration will be used. For example ``PAM/NSS`` or ``SSSD`` (along with
+LDAP for identity service, [and Kerberos for authentication service]) which is
+the best suited option for joining ``MS Active Directory domains``, and doing
+``User Logon Management``.
+
+By no means we intend to cover every possible scenario/combination here. These
+steps are for a simple *get a (MIT) Kerberos Server up and running*.
+
+Please, note that *rpm packages might have slightly different names*, as well
+as the locations for the binaries and/or configuration files, depending on
+which Linux distro we are referring to.
+
+Finally, keep in mind that some Linux distros will have their own ``wizards``,
+which can perform the basic needed configuration: ::
+
+ SUSE:
+ Kerberos server:
+ yast2 auth-server
+
+ Kerberos client:
+ pam/nss: yast2 ldapkrb
+ sssd: yast2 auth-client
+
+
+However, we are going through the ``manual configuration``.
+
+
+In order to get a new MIT KDC Server running:
+
+1. Install the KDC server by:
+
+ a. Install the needed packages: ::
+
+ SUSE: zypper install krb5 krb5-server krb5-client
+ Additionally:
+ for development: krb5-devel
+ if using 'sssd': sssd-krb5 sssd-krb5-common
+
+ REDHAT: yum install krb5-server krb5-libs krb5-workstation
+ Additionally: 'Needs to be checked'
+
+
+ b. Edit the KDC Server configuration file: ::
+
+ /var/lib/kerberos/krb5kdc/kdc.conf
+ [kdcdefaults]
+ kdc_ports = 750,88
+ [realms]
+ MYDOMAIN.COM = {
+ acl_file = /var/lib/kerberos/krb5kdc/kadm5.acl
+ admin_keytab = FILE:/var/lib/kerberos/krb5kdc/kadm5.keytab
+ default_principal_flags = +postdateable +forwardable +renewable +proxiable
+ +dup-skey -preauth -hwauth +service
+ +tgt-based +allow-tickets -pwchange
+ -pwservice
+ dict_file = /var/lib/kerberos/krb5kdc/kadm5.dict
+ key_stash_file = /var/lib/kerberos/krb5kdc/.k5.MYDOMAIN.COM
+ kdc_ports = 750,88
+ max_life = 0d 10h 0m 0s
+ max_renewable_life = 7d 0h 0m 0s
+ }
+ ...
+
+
+ c. Edit the Kerberos Client configuration file: ::
+
+ /etc/krb5.conf
+ [libdefaults]
+ dns_canonicalize_hostname = false
+ rdns = false
+ forwardable = true
+ dns_lookup_realm = true //--> if using DNS/DNSMasq
+ dns_lookup_kdc = true //--> if using DNS/DNSMasq
+ allow_weak_crypto = false
+ default_realm = MYDOMAIN.COM
+ default_ccache_name = KEYRING:persistent:%{uid}
+
+ [realms]
+ MYDOMAIN.COM = {
+ kdc = kerberos.mydomain.com
+ admin_server = kerberos.mydomain.com
+ ...
+ }
+ ...
+
+
+2. Create the Kerberos database: ::
+
+ SUSE: kdb5_util create -s
+
+ REDHAT: kdb5_util create -s
+
+
+3. Enable and Start both 'KDC and KDC admin' servers: ::
+
+ SUSE: systemctl enable/start krb5kdc
+ systemctl enable/start kadmind
+
+ REDHAT: systemctl enable/start krb5kdc
+ systemctl enable/start kadmin
+
+
+4. Create a Kerberos Administrator
+ Kerberos principals can be created either locally on the KDC server itself
+ or through the network, using an 'admin principal'. On the KDC server,
+ using ``kadmin.local``:
+
+ a. List the existing principals: ::
+
+ kadmin.local: listprincs
+ K/M@MYDOMAIN.COM
+ krbtgt/MYDOMAIN.COM@MYDOMAIN.COM
+ kadmin/admin@MYDOMAIN.COM
+ kadmin/changepw@MYDOMAIN.COM
+ kadmin/history@MYDOMAIN.COM
+ kadmin/kerberos.mydomain.com@MYDOMAIN.COM
+ root/admin@MYDOMAIN.COM
+ ...
+
+
+ b. In case we don't have a built-in 'admin principal', we then create one
+ (whatever ``principal name``, we are using ``root``, once by default
+ ``kinit`` tries to authenticate using the same system login user name,
+ unless a ``principal`` is passed as an argument ``kinit principal``): ::
+
+ # kadmin.local -q "addprinc root/admin"
+ Authenticating as principal root/admin@MYDOMAIN.COM with password.
+ WARNING: no policy specified for root/admin@MYDOMAIN.COM; defaulting to no policy
+ Enter password for principal "root/admin@MYDOMAIN.COM":
+
+
+ c. Confirm the newly created 'admin principal' has the needed permissions
+ in the KDC ACL (if ACLs are changed, ``kadmind`` needs to be restarted): ::
+
+ SUSE: /var/lib/kerberos/krb5kdc/kadm5.acl
+ REDHAT: /var/kerberos/krb5kdc/kadm5.acl
+
+ ###############################################################################
+ #Kerberos_principal permissions [target_principal] [restrictions]
+ ###############################################################################
+ #
+ */admin@MYDOMAIN.COM *
+
+
+ d. Create a simple 'user principal' (same steps as by *The 'Ceph side' of
+ the things*; 4a): ::
+
+ kadmin.local: addprinc johndoe
+ WARNING: no policy specified for johndoe@MYDOMAIN.COM; defaulting to no policy
+ Enter password for principal "johndoe@MYDOMAIN.COM":
+ Re-enter password for principal "johndoe@MYDOMAIN.COM":
+ Principal "johndoe@MYDOMAIN.COM" created.
+
+
+ e. Confirm the newly created 'user principal' is able to authenticate (same
+ steps as by *The 'Ceph side' of the things*; 6): ::
+
+ # kdestroy -A && kinit -f johndoe && klist -f
+ Password for johndoe@MYDOMAIN.COM:
+ Ticket cache: KEYRING:persistent:0:0
+ Default principal: johndoe@MYDOMAIN.COM
+
+ Valid starting Expires Service principal
+ 11/16/2018 13:11:16 11/16/2018 23:11:16 krbtgt/MYDOMAIN.COM@MYDOMAIN.COM
+ renew until 11/17/2018 13:11:16, Flags: FRI
+ ...
+
+
+5. At this point, we should have a *simple (MIT) Kerberos Server up and running*:
+
+ a. Considering we will want to work with keytab files, for both 'user and
+ service' principals, refer to The *'Ceph side' of the things* starting
+ at step 4.
+
+ b. Make sure you are comfortable with following and their ``manpages``: ::
+
+ krb5.conf -> Krb client config file
+ kdc.conf -> KDC server config file
+
+ krb5kdc -> KDC server daemon
+ kadmind -> KDC administration daemon
+
+ kadmin -> Krb administration tool
+ kdb5_util -> Krb low-level database administration tool
+
+ kinit -> Obtain and cache Kerberos ticket-granting ticket tool
+ klist -> List cached Kerberos tickets tool
+ kdestroy -> Destroy Kerberos tickets tool
+
+
+6. Name Resolution
+ As mentioned earlier, Kerberos *relies heavly on name resolution*. Most of
+ the Kerberos issues are usually related to name resolution, since Kerberos
+ is *very picky* on both *systems names* and *host lookups*.
+
+ a. As described in *The 'Ceph side' of the things*; step 2a, DNS RRs
+ greatly improves service location and host/domain resolution, by using
+ ``(srv resources)`` and ``(txt record)`` respectively (as per
+ *Before We Start*; *DNS resolution*). ::
+
+ /var/lib/named/master/mydomain.com
+ kerberos IN A 192.168.10.21
+ kerberos-slave IN A 192.168.10.22
+ _kerberos IN TXT "MYDOMAIN.COM"
+ _kerberos._udp IN SRV 1 0 88 kerberos
+ _kerberos._tcp IN SRV 1 0 88 kerberos
+ _kerberos._udp IN SRV 20 0 88 kerberos-slave
+ _kerberos-master._udp IN SRV 0 0 88 kerberos
+ _kerberos-adm._tcp IN SRV 0 0 749 kerberos
+ _kpasswd._udp IN SRV 0 0 464 kerberos
+ ...
+
+
+ b. For a small network or development environment, where a *DNS server is
+ not available*, we have the option to use ``DNSMasq``, an
+ ease-to-configure lightweight DNS server (along with some other
+ capabilities).
+
+ These records can be added to ``/etc/dnsmasq.conf`` (in addition to the
+ needed 'host records'): ::
+
+ /etc/dnsmasq.conf
+ ...
+ txt-record=_kerberos.mydomain.com,"MYDOMAIN.COM"
+ srv-host=_kerberos._udp.mydomain.com,kerberos.mydomain.com,88,1
+ srv-host=_kerberos._udp.mydomain.com,kerberos-2.mydomain.com,88,20
+ srv-host=_kerberos-master._udp.mydomain.com,kerberos.mydomain.com,88,0
+ srv-host=_kerberos-adm._tcp.mydomain.com,kerberos.mydomain.com,749,0
+ srv-host=_kpasswd._udp.mydomain.com,kerberos.mydomain.com,464,0
+ srv-host=_kerberos._tcp.mydomain.com,kerberos.mydomain.com,88,1
+ ...
+
+
+ c. After 'b)' is all set, and ``dnsmasq`` service up and running, we can
+ test it using: ::
+
+ # nslookup kerberos
+ Server: 192.168.10.1
+ Address: 192.168.10.1#53
+
+ Name: kerberos.mydomain.com
+ Address: 192.168.10.21
+
+ # host -t SRV _kerberos._tcp.mydomain.com
+ _kerberos._tcp.mydomain.com has SRV record 1 0 88 kerberos.mydomain.com.
+
+ # host -t SRV {each srv-host record}
+ # host -t TXT _kerberos.mydomain.com
+ _kerberos.mydomain.com descriptive text "MYDOMAIN.COM"
+ ...
+
+
+ f. As long as ``name resolution`` is working properly, either ``dnsmasq``
+ or ``named``, Kerberos should be able to find the needed service
+ records.
diff --git a/doc/dev/cephadm/cephadm-exporter.rst b/doc/dev/cephadm/cephadm-exporter.rst
new file mode 100644
index 000000000..bc41fcaeb
--- /dev/null
+++ b/doc/dev/cephadm/cephadm-exporter.rst
@@ -0,0 +1,306 @@
+================
+cephadm Exporter
+================
+
+There are a number of long running tasks that the cephadm 'binary' runs which can take several seconds
+to run. This latency represents a scalability challenge to the Ceph orchestrator management plane.
+
+To address this, cephadm needs to be able to run some of these longer running tasks asynchronously - this
+frees up processing on the mgr by offloading tasks to each host, reduces latency and improves scalability.
+
+This document describes the implementation requirements and design for an 'exporter' feature
+
+
+Requirements
+============
+The exporter should address these functional and non-functional requirements;
+
+* run as a normal systemd unit
+* utilise the same filesystem schema as other services deployed with cephadm
+* require only python3 standard library modules (no external dependencies)
+* use encryption to protect the data flowing from a host to Ceph mgr
+* execute data gathering tasks as background threads
+* be easily extended to include more data gathering tasks
+* monitor itself for the health of the data gathering threads
+* cache metadata to respond to queries quickly
+* respond to a metadata query in <30ms to support large Ceph clusters (000's nodes)
+* provide CLI interaction to enable the exporter to be deployed either at bootstrap time, or once the
+ cluster has been deployed.
+* be deployed as a normal orchestrator service (similar to the node-exporter)
+
+High Level Design
+=================
+
+This section will focus on the exporter logic **only**.
+
+.. code::
+
+ Establish a metadata cache object (tasks will be represented by separate attributes)
+ Create a thread for each data gathering task; host, ceph-volume and list_daemons
+ each thread updates it's own attribute within the cache object
+ Start a server instance passing requests to a specific request handler
+ the request handler only interacts with the cache object
+ the request handler passes metadata back to the caller
+ Main Loop
+ Leave the loop if a 'stop' request is received
+ check thread health
+ if a thread that was active, is now inactive
+ update the cache marking the task as inactive
+ update the cache with an error message for that task
+ wait for n secs
+
+
+In the initial exporter implementation, the exporter has been implemented as a RESTful API.
+
+
+Security
+========
+
+The cephadm 'binary' only supports standard python3 features, which has meant the RESTful API has been
+developed using the http module, which itself is not intended for production use. However, the implementation
+is not complex (based only on HTTPServer and BaseHHTPRequestHandler) and only supports the GET method - so the
+security risk is perceived as low.
+
+Current mgr to host interactions occurs within an ssh connection, so the goal of the exporter is to adopt a similar
+security model.
+
+The initial REST API is implemented with the following features;
+
+* generic self-signed, or user provided SSL crt/key to encrypt traffic between the mgr and the host
+* 'token' based authentication of the request
+
+All exporter instances will use the **same** crt/key to secure the link from the mgr to the host(s), in the same way
+that the ssh access uses the same public key and port for each host connection.
+
+.. note:: Since the same SSL configuration is used on every exporter, when you supply your own settings you must
+ ensure that the CN or SAN components of the distinguished name are either **not** used or created using wildcard naming.
+
+The crt, key and token files are all defined with restrictive permissions (600), to help mitigate against the risk of exposure
+to any other user on the Ceph cluster node(s).
+
+Administrator Interaction
+=========================
+Several new commands are required to configure the exporter, and additional parameters should be added to the bootstrap
+process to allow the exporter to be deployed automatically for new clusters.
+
+
+Enhancements to the 'bootstrap' process
+---------------------------------------
+bootstrap should support additional parameters to automatically configure exporter daemons across hosts
+
+``--with-exporter``
+
+By using this flag, you're telling the bootstrap process to include the cephadm-exporter service within the
+cluster. If you do not provide a specific configuration (SSL, token, port) to use, defaults would be applied.
+
+``--exporter-config``
+
+With the --exporter-config option, you may pass your own SSL, token and port information. The file must be in
+JSON format and contain the following fields; crt, key, token and port. The JSON content should be validated, and any
+errors detected passed back to the user during the argument parsing phase (before any changes are done).
+
+
+Additional ceph commands
+------------------------
+::
+
+# ceph cephadm generate-exporter-config
+
+This command will create generate a default configuration consisting of; a self signed certificate, a randomly generated
+32 character token and the default port of 9443 for the REST API.
+::
+
+# ceph cephadm set-exporter-config -i <config.json>
+
+Use a JSON file to define the crt, key, token and port for the REST API. The crt, key and token are validated by
+the mgr/cephadm module prior storing the values in the KV store. Invalid or missing entries should be reported to the
+user.
+::
+
+# ceph cephadm clear-exporter-config
+
+Clear the current configuration (removes the associated keys from the KV store)
+::
+
+# ceph cephadm get-exporter-config
+
+Show the current exporter configuration, in JSON format
+
+
+.. note:: If the service is already deployed any attempt to change or clear the configuration will
+ be denied. In order to change settings you must remove the service, apply the required configuration
+ and re-apply (``ceph orch apply cephadm-exporter``)
+
+
+
+New Ceph Configuration Keys
+===========================
+The exporter configuration is persisted to the monitor's KV store, with the following keys:
+
+| mgr/cephadm/exporter_config
+| mgr/cephadm/exporter_enabled
+
+
+
+RESTful API
+===========
+The primary goal of the exporter is the provision of metadata from the host to the mgr. This interaction takes
+place over a simple GET interface. Although only the GET method is supported, the API provides multiple URLs to
+provide different views on the metadata that has been gathered.
+
+.. csv-table:: Supported URL endpoints
+ :header: "URL", "Purpose"
+
+ "/v1/metadata", "show all metadata including health of all threads"
+ "/v1/metadata/health", "only report on the health of the data gathering threads"
+ "/v1/metadata/disks", "show the disk output (ceph-volume inventory data)"
+ "/v1/metadata/host", "show host related metadata from the gather-facts command"
+ "/v1/metatdata/daemons", "show the status of all ceph cluster related daemons on the host"
+
+Return Codes
+------------
+The following HTTP return codes are generated by the API
+
+.. csv-table:: Supported HTTP Responses
+ :header: "Status Code", "Meaning"
+
+ "200", "OK"
+ "204", "the thread associated with this request is no longer active, no data is returned"
+ "206", "some threads have stopped, so some content is missing"
+ "401", "request is not authorised - check your token is correct"
+ "404", "URL is malformed, not found"
+ "500", "all threads have stopped - unable to provide any metadata for the host"
+
+
+Deployment
+==========
+During the initial phases of the exporter implementation, deployment is regarded as optional but is available
+to new clusters and existing clusters that have the feature (Pacific and above).
+
+* new clusters : use the ``--with-exporter`` option
+* existing clusters : you'll need to set the configuration and deploy the service manually
+
+.. code::
+
+ # ceph cephadm generate-exporter-config
+ # ceph orch apply cephadm-exporter
+
+If you choose to remove the cephadm-exporter service, you may simply
+
+.. code::
+
+ # ceph orch rm cephadm-exporter
+
+This will remove the daemons, and the exporter releated settings stored in the KV store.
+
+
+Management
+==========
+Once the exporter is deployed, you can use the following snippet to extract the host's metadata.
+
+.. code-block:: python
+
+ import ssl
+ import json
+ import sys
+ import tempfile
+ import time
+ from urllib.request import Request, urlopen
+
+ # CHANGE THIS V
+ hostname = "rh8-1.storage.lab"
+
+ print("Reading config.json")
+ try:
+ with open('./config.json', 'r') as f:
+ raw=f.read()
+ except FileNotFoundError as e:
+ print("You must first create a config.json file using the cephadm get-exporter-config command")
+ sys.exit(1)
+
+ cfg = json.loads(raw)
+ with tempfile.NamedTemporaryFile(buffering=0) as t:
+ print("creating a temporary local crt file from the json")
+ t.write(cfg['crt'].encode('utf-8'))
+
+ ctx = ssl.create_default_context()
+ ctx.check_hostname = False
+ ctx.load_verify_locations(t.name)
+ hdrs={"Authorization":f"Bearer {cfg['token']}"}
+ print("Issuing call to gather metadata")
+ req=Request(f"https://{hostname}:9443/v1/metadata",headers=hdrs)
+ s_time = time.time()
+ r = urlopen(req,context=ctx)
+ print(r.status)
+ print("call complete")
+ # assert r.status == 200
+ if r.status in [200, 206]:
+
+ raw=r.read() # bytes string
+ js=json.loads(raw.decode())
+ print(json.dumps(js, indent=2))
+ elapsed = time.time() - s_time
+ print(f"Elapsed secs : {elapsed}")
+
+
+.. note:: the above example uses python3, and assumes that you've extracted the config using the ``get-exporter-config`` command.
+
+
+Implementation Specific Details
+===============================
+
+In the same way as a typical container based deployment, the exporter is deployed to a directory under ``/var/lib/ceph/<fsid>``. The
+cephadm binary is stored in this cluster folder, and the daemon's configuration and systemd settings are stored
+under ``/var/lib/ceph/<fsid>/cephadm-exporter.<id>/``.
+
+.. code::
+
+ [root@rh8-1 cephadm-exporter.rh8-1]# pwd
+ /var/lib/ceph/cb576f70-2f72-11eb-b141-525400da3eb7/cephadm-exporter.rh8-1
+ [root@rh8-1 cephadm-exporter.rh8-1]# ls -al
+ total 24
+ drwx------. 2 root root 100 Nov 25 18:10 .
+ drwx------. 8 root root 160 Nov 25 23:19 ..
+ -rw-------. 1 root root 1046 Nov 25 18:10 crt
+ -rw-------. 1 root root 1704 Nov 25 18:10 key
+ -rw-------. 1 root root 64 Nov 25 18:10 token
+ -rw-------. 1 root root 38 Nov 25 18:10 unit.configured
+ -rw-------. 1 root root 48 Nov 25 18:10 unit.created
+ -rw-r--r--. 1 root root 157 Nov 25 18:10 unit.run
+
+
+In order to respond to requests quickly, the CephadmDaemon uses a cache object (CephadmCache) to hold the results
+of the cephadm commands.
+
+The exporter doesn't introduce any new data gathering capability - instead it merely calls the existing cephadm commands.
+
+The CephadmDaemon class creates a local HTTP server(uses ThreadingMixIn), secured with TLS and uses the CephadmDaemonHandler
+to handle the requests. The request handler inspects the request header and looks for a valid Bearer token - if this is invalid
+or missing the caller receives a 401 Unauthorized error.
+
+The 'run' method of the CephadmDaemon class, places the scrape_* methods into different threads with each thread supporting
+a different refresh interval. Each thread then periodically issues it's cephadm command, and places the output
+in the cache object.
+
+In addition to the command output, each thread also maintains it's own timestamp record in the cache so the caller can
+very easily determine the age of the data it's received.
+
+If the underlying cephadm command execution hits an exception, the thread passes control to a _handle_thread_exception method.
+Here the exception is logged to the daemon's log file and the exception details are added to the cache, providing visibility
+of the problem to the caller.
+
+Although each thread is effectively given it's own URL endpoint (host, disks, daemons), the recommended way to gather data from
+the host is to simply use the ``/v1/metadata`` endpoint. This will provide all of the data, and indicate whether any of the
+threads have failed.
+
+The run method uses "signal" to establish a reload hook, but in the initial implementation this doesn't take any action and simply
+logs that a reload was received.
+
+
+Future Work
+===========
+
+#. Consider the potential of adding a restart policy for threads
+#. Once the exporter is fully integrated into mgr/cephadm, the goal would be to make the exporter the
+ default means of data gathering. However, until then the exporter will remain as an opt-in 'feature
+ preview'.
diff --git a/doc/dev/cephadm/compliance-check.rst b/doc/dev/cephadm/compliance-check.rst
new file mode 100644
index 000000000..eea462445
--- /dev/null
+++ b/doc/dev/cephadm/compliance-check.rst
@@ -0,0 +1,121 @@
+================
+Compliance Check
+================
+
+The stability and reliability of a Ceph cluster is dependent not just upon the Ceph daemons, but
+also the OS and hardware that Ceph is installed on. This document is intended to promote a design
+discussion for providing a "compliance" feature within mgr/cephadm, which would be responsible for
+identifying common platform-related issues that could impact Ceph stability and operation.
+
+The ultimate goal of these checks is to identify issues early and raise a healthcheck WARN
+event, to alert the Administrator to the issue.
+
+Prerequisites
+=============
+In order to effectively analyse the hosts that Ceph is deployed to, this feature requires a cache
+of host-related metadata. The metadata is already available from cephadm's HostFacts class and the
+``gather-facts`` cephadm command. For the purposes of this document, we will assume that this
+data is available within the mgr/cephadm "cache" structure.
+
+Some checks will require that the host status is also populated e.g. ONLINE, OFFLINE, MAINTENANCE
+
+Administrator Interaction
+=========================
+Not all users will require this feature, and must be able to 'opt out'. For this reason,
+mgr/cephadm must provide controls, such as the following;
+
+.. code-block::
+
+ ceph cephadm compliance enable | disable | status [--format json]
+ ceph cephadm compliance ls [--format json]
+ ceph cephadm compliance enable-check <name>
+ ceph cephadm compliance disable-check <name>
+ ceph cephadm compliance set-check-interval <int>
+ ceph cephadm compliance get-check-interval
+
+The status option would show the enabled/disabled state of the feature, along with the
+check-interval.
+
+The ``ls`` subcommand would show all checks in the following format;
+
+``check-name status description``
+
+Proposed Integration
+====================
+The compliance checks are not required to run all the time, but instead should run at discrete
+intervals. The interval would be configurable under via the :code:`set-check-interval`
+subcommand (default would be every 12 hours)
+
+
+mgr/cephadm currently executes an event driven (time based) serve loop to act on deploy/remove and
+reconcile activity. In order to execute the compliance checks, the compliance check code would be
+called from this main serve loop - when the :code:`set-check-interval` is met.
+
+
+Proposed Checks
+===============
+All checks would push any errors to a list, so multiple issues can be escalated to the Admin at
+the same time. The list below provides a description of each check, with the text following the
+name indicating a shortname version *(the shortname is the reference for command Interaction
+when enabling or disabling a check)*
+
+
+OS Consistency (OS)
+___________________
+* all hosts must use same vendor
+* all hosts must be on the same major release (this check would only be applicable to distributions that
+ offer a long-term-support strategy (RHEL, CentOS, SLES, Ubuntu etc)
+
+
+*src: gather-facts output*
+
+Linux Kernel Security Mode (LSM)
+________________________________
+* All hosts should have a consistent SELINUX/AppArmor configuration
+
+*src: gather-facts output*
+
+Services Check (SERVICES)
+_________________________
+Hosts that are in an ONLINE state should adhere to the following;
+
+* all daemons (systemd units) should be enabled
+* all daemons should be running (not dead)
+
+*src: list_daemons output*
+
+Support Status (SUPPORT)
+________________________
+If support status has been detected, it should be consistent across all hosts. At this point
+support status is available only for Red Hat machines.
+
+*src: gather-facts output*
+
+Network : MTU (MTU)
+________________________________
+All network interfaces on the same Ceph network (public/cluster) should have the same MTU
+
+*src: gather-facts output*
+
+Network : LinkSpeed (LINKSPEED)
+____________________________________________
+All network interfaces on the same Ceph network (public/cluster) should have the same Linkspeed
+
+*src: gather-facts output*
+
+Network : Consistency (INTERFACE)
+______________________________________________
+All hosts with OSDs should have consistent network configuration - eg. if some hosts do
+not separate cluster/public traffic but others do, that is an anomaly that would generate a
+compliance check warning.
+
+*src: gather-facts output*
+
+Notification Strategy
+=====================
+If any of the checks fail, mgr/cephadm would raise a WARN level alert
+
+Futures
+=======
+The checks highlighted here serve only as a starting point, and we should expect to expand
+on the checks over time.
diff --git a/doc/dev/cephadm/developing-cephadm.rst b/doc/dev/cephadm/developing-cephadm.rst
new file mode 100644
index 000000000..d9f81c2c0
--- /dev/null
+++ b/doc/dev/cephadm/developing-cephadm.rst
@@ -0,0 +1,263 @@
+=======================
+Developing with cephadm
+=======================
+
+There are several ways to develop with cephadm. Which you use depends
+on what you're trying to accomplish.
+
+vstart --cephadm
+================
+
+- Start a cluster with vstart, with cephadm configured
+- Manage any additional daemons with cephadm
+- Requires compiled ceph binaries
+
+In this case, the mon and manager at a minimum are running in the usual
+vstart way, not managed by cephadm. But cephadm is enabled and the local
+host is added, so you can deploy additional daemons or add additional hosts.
+
+This works well for developing cephadm itself, because any mgr/cephadm
+or cephadm/cephadm code changes can be applied by kicking ceph-mgr
+with ``ceph mgr fail x``. (When the mgr (re)starts, it loads the
+cephadm/cephadm script into memory.)
+
+::
+
+ MON=1 MGR=1 OSD=0 MDS=0 ../src/vstart.sh -d -n -x --cephadm
+
+- ``~/.ssh/id_dsa[.pub]`` is used as the cluster key. It is assumed that
+ this key is authorized to ssh with no passphrase to root@`hostname`.
+- cephadm does not try to manage any daemons started by vstart.sh (any
+ nonzero number in the environment variables). No service spec is defined
+ for mon or mgr.
+- You'll see health warnings from cephadm about stray daemons--that's because
+ the vstart-launched daemons aren't controlled by cephadm.
+- The default image is ``quay.io/ceph-ci/ceph:master``, but you can change
+ this by passing ``-o container_image=...`` or ``ceph config set global container_image ...``.
+
+
+cstart and cpatch
+=================
+
+The ``cstart.sh`` script will launch a cluster using cephadm and put the
+conf and keyring in your build dir, so that the ``bin/ceph ...`` CLI works
+(just like with vstart). The ``ckill.sh`` script will tear it down.
+
+- A unique but stable fsid is stored in ``fsid`` (in the build dir).
+- The mon port is random, just like with vstart.
+- The container image is ``quay.io/ceph-ci/ceph:$tag`` where $tag is
+ the first 8 chars of the fsid.
+- If the container image doesn't exist yet when you run cstart for the
+ first time, it is built with cpatch.
+
+There are a few advantages here:
+
+- The cluster is a "normal" cephadm cluster that looks and behaves
+ just like a user's cluster would. In contrast, vstart and teuthology
+ clusters tend to be special in subtle (and not-so-subtle) ways (e.g.
+ having the ``lockdep`` turned on).
+
+To start a test cluster::
+
+ sudo ../src/cstart.sh
+
+The last line of the output will be a line you can cut+paste to update
+the container image. For instance::
+
+ sudo ../src/script/cpatch -t quay.io/ceph-ci/ceph:8f509f4e
+
+By default, cpatch will patch everything it can think of from the local
+build dir into the container image. If you are working on a specific
+part of the system, though, can you get away with smaller changes so that
+cpatch runs faster. For instance::
+
+ sudo ../src/script/cpatch -t quay.io/ceph-ci/ceph:8f509f4e --py
+
+will update the mgr modules (minus the dashboard). Or::
+
+ sudo ../src/script/cpatch -t quay.io/ceph-ci/ceph:8f509f4e --core
+
+will do most binaries and libraries. Pass ``-h`` to cpatch for all options.
+
+Once the container is updated, you can refresh/restart daemons by bouncing
+them with::
+
+ sudo systemctl restart ceph-`cat fsid`.target
+
+When you're done, you can tear down the cluster with::
+
+ sudo ../src/ckill.sh # or,
+ sudo ../src/cephadm/cephadm rm-cluster --force --fsid `cat fsid`
+
+cephadm bootstrap --shared_ceph_folder
+======================================
+
+Cephadm can also be used directly without compiled ceph binaries.
+
+Run cephadm like so::
+
+ sudo ./cephadm bootstrap --mon-ip 127.0.0.1 \
+ --ssh-private-key /home/<user>/.ssh/id_rsa \
+ --skip-mon-network \
+ --skip-monitoring-stack --single-host-defaults \
+ --skip-dashboard \
+ --shared_ceph_folder /home/<user>/path/to/ceph/
+
+- ``~/.ssh/id_rsa`` is used as the cluster key. It is assumed that
+ this key is authorized to ssh with no passphrase to root@`hostname`.
+
+Source code changes made in the ``pybind/mgr/`` directory then
+require a daemon restart to take effect.
+
+Note regarding network calls from CLI handlers
+==============================================
+
+Executing any cephadm CLI commands like ``ceph orch ls`` will block the
+mon command handler thread within the MGR, thus preventing any concurrent
+CLI calls. Note that pressing ``^C`` will not resolve this situation,
+as *only* the client will be aborted, but not execution of the command
+within the orchestrator manager module itself. This means, cephadm will
+be completely unresponsive until the execution of the CLI handler is
+fully completed. Note that even ``ceph orch ps`` will not respond while
+another handler is executing.
+
+This means we should do very few synchronous calls to remote hosts.
+As a guideline, cephadm should do at most ``O(1)`` network calls in CLI handlers.
+Everything else should be done asynchronously in other threads, like ``serve()``.
+
+Note regarding different variables used in the code
+===================================================
+
+* a ``service_type`` is something like mon, mgr, alertmanager etc defined
+ in ``ServiceSpec``
+* a ``service_id`` is the name of the service. Some services don't have
+ names.
+* a ``service_name`` is ``<service_type>.<service_id>``
+* a ``daemon_type`` is the same as the service_type, except for ingress,
+ which has the haproxy and keepalived daemon types.
+* a ``daemon_id`` is typically ``<service_id>.<hostname>.<random-string>``.
+ (Not the case for e.g. OSDs. OSDs are always called OSD.N)
+* a ``daemon_name`` is ``<daemon_type>.<daemon_id>``
+
+Kcli: a virtualization management tool to make easy orchestrators development
+=============================================================================
+`Kcli <https://github.com/karmab/kcli>`_ is meant to interact with existing
+virtualization providers (libvirt, KubeVirt, oVirt, OpenStack, VMware vSphere,
+GCP and AWS) and to easily deploy and customize VMs from cloud images.
+
+It allows you to setup an environment with several vms with your preferred
+configuration( memory, cpus, disks) and OS flavor.
+
+main advantages:
+----------------
+ - Is fast. Typically you can have a completely new Ceph cluster ready to debug
+ and develop orchestrator features in less than 5 minutes.
+ - Is a "near production" lab. The lab created with kcli is very near of "real"
+ clusters in QE labs or even in production. So easy to test "real things" in
+ almost "real environment"
+ - Is safe and isolated. Do not depend of the things you have installed in your
+ machine. And the vms are isolated from your environment.
+ - Easy to work "dev" environment. For "not compilated" software pieces,
+ for example any mgr module. It is an environment that allow you to test your
+ changes interactively.
+
+Installation:
+-------------
+Complete documentation in `kcli installation <https://kcli.readthedocs.io/en/latest/#installation>`_
+but we strongly suggest to use the container image approach.
+
+So things to do:
+ - 1. Review `requeriments <https://kcli.readthedocs.io/en/latest/#libvirt-hypervisor-requisites>`_
+ and install/configure whatever you need to meet them.
+ - 2. get the kcli image and create one alias for executing the kcli command
+ ::
+
+ # podman pull quay.io/karmab/kcli
+ # alias kcli='podman run --net host -it --rm --security-opt label=disable -v $HOME/.ssh:/root/.ssh -v $HOME/.kcli:/root/.kcli -v /var/lib/libvirt/images:/var/lib/libvirt/images -v /var/run/libvirt:/var/run/libvirt -v $PWD:/workdir -v /var/tmp:/ignitiondir quay.io/karmab/kcli'
+
+.. note:: /var/lib/libvirt/images can be customized.... be sure that you are
+ using this folder for your OS images
+
+.. note:: Once you have used your kcli tool to create and use different labs, we
+ suggest you to "save" and use your own kcli image.
+ Why?: kcli is alive and it changes (and for the moment only exists one tag ...
+ latest). Because we have more than enough with the current functionality, and
+ what we want is overall stability,
+ we suggest to store the kcli image you are using in a safe place and update
+ your kcli alias to use your own image.
+
+Test your kcli installation:
+----------------------------
+See the kcli `basic usage workflow <https://kcli.readthedocs.io/en/latest/#basic-workflow>`_
+
+Create a Ceph lab cluster
+-------------------------
+In order to make easy this task we are going to use a kcli plan.
+
+A kcli plan is a file where you can define the different settings you want to
+have in a set of vms.
+You can define hardware parameters (cpu, memory, disks ..), operating system and
+it also allows you to automate the installation and configuration of any
+software you want to have.
+
+There is a `repository <https://github.com/karmab/kcli-plans>`_ with a collection of
+plans that can be used for different purposes. And we have predefined plans to
+install Ceph clusters using Ceph ansible or cephadm, lets create our first Ceph
+cluster using cephadm::
+
+# kcli2 create plan -u https://github.com/karmab/kcli-plans/blob/master/ceph/ceph_cluster.yml
+
+This will create a set of three vms using the plan file pointed by the url.
+After a few minutes (depend of your laptop power), lets examine the cluster:
+
+* Take a look to the vms created::
+
+ # kcli list vms
+
+* Enter in the bootstrap node::
+
+ # kcli ssh ceph-node-00
+
+* Take a look to the ceph cluster installed::
+
+ [centos@ceph-node-00 ~]$ sudo -i
+ [root@ceph-node-00 ~]# cephadm version
+ [root@ceph-node-00 ~]# cephadm shell
+ [ceph: root@ceph-node-00 /]# ceph orch host ls
+
+Create a Ceph cluster to make easy developing in mgr modules (Orchestrators and Dashboard)
+------------------------------------------------------------------------------------------
+The cephadm kcli plan (and cephadm) are prepared to do that.
+
+The idea behind this method is to replace several python mgr folders in each of
+the ceph daemons with the source code folders in your host machine.
+This "trick" will allow you to make changes in any orchestrator or dashboard
+module and test them intermediately. (only needed to disable/enable the mgr module)
+
+So in order to create a ceph cluster for development purposes you must use the
+same cephadm plan but with a new parameter pointing your Ceph source code folder::
+
+ # kcli create plan -u https://github.com/karmab/kcli-plans/blob/master/ceph/ceph_cluster.yml -P ceph_dev_folder=/home/mycodefolder/ceph
+
+Ceph Dashboard development
+--------------------------
+Ceph dashboard module is not going to be loaded if previously you have not
+generated the frontend bundle.
+
+For now, in order load properly the Ceph Dashboardmodule and to apply frontend
+changes you have to run "ng build" on your laptop::
+
+ # Start local frontend build with watcher (in background):
+ sudo dnf install -y nodejs
+ cd <path-to-your-ceph-repo>
+ cd src/pybind/mgr/dashboard/frontend
+ sudo chown -R <your-user>:root dist node_modules
+ NG_CLI_ANALYTICS=false npm ci
+ npm run build -- --deleteOutputPath=false --watch &
+
+After saving your changes, the frontend bundle will be built again.
+When completed, you'll see::
+
+ "Localized bundle generation complete."
+
+Then you can reload your Dashboard browser tab.
diff --git a/doc/dev/cephadm/host-maintenance.rst b/doc/dev/cephadm/host-maintenance.rst
new file mode 100644
index 000000000..2b84ec7bd
--- /dev/null
+++ b/doc/dev/cephadm/host-maintenance.rst
@@ -0,0 +1,104 @@
+================
+Host Maintenance
+================
+
+All hosts that support Ceph daemons need to support maintenance activity, whether the host
+is physical or virtual. This means that management workflows should provide
+a simple and consistent way to support this operational requirement. This document defines
+the maintenance strategy that could be implemented in cephadm and mgr/cephadm.
+
+
+High Level Design
+=================
+Placing a host into maintenance, adopts the following workflow;
+
+#. confirm that the removal of the host does not impact data availability (the following
+ steps will assume it is safe to proceed)
+
+ * orch host ok-to-stop <host> would be used here
+
+#. if the host has osd daemons, apply noout to the host subtree to prevent data migration
+ from triggering during the planned maintenance slot.
+#. Stop the ceph target (all daemons stop)
+#. Disable the ceph target on that host, to prevent a reboot from automatically starting
+ ceph services again)
+
+
+Exiting Maintenance, is basically the reverse of the above sequence
+
+Admin Interaction
+=================
+The ceph orch command will be extended to support maintenance.
+
+.. code-block::
+
+ ceph orch host maintenance enter <host> [ --force ]
+ ceph orch host maintenance exit <host>
+
+.. note:: In addition, the host's status should be updated to reflect whether it
+ is in maintenance or not.
+
+The 'check' Option
+__________________
+The orch host ok-to-stop command focuses on ceph daemons (mon, osd, mds), which
+provides the first check. However, a ceph cluster also uses other types of daemons
+for monitoring, management and non-native protocol support which means the
+logic will need to consider service impact too. The 'check' option provides
+this additional layer to alert the user of service impact to *secondary*
+daemons.
+
+The list below shows some of these additional daemons.
+
+* mgr (not included in ok-to-stop checks)
+* prometheus, grafana, alertmanager
+* rgw
+* haproxy
+* iscsi gateways
+* ganesha gateways
+
+By using the --check option first, the Admin can choose whether to proceed. This
+workflow is obviously optional for the CLI user, but could be integrated into the
+UI workflow to help less experienced Administators manage the cluster.
+
+By adopting this two-phase approach, a UI based workflow would look something
+like this.
+
+#. User selects a host to place into maintenance
+
+ * orchestrator checks for data **and** service impact
+#. If potential impact is shown, the next steps depend on the impact type
+
+ * **data availability** : maintenance is denied, informing the user of the issue
+ * **service availability** : user is provided a list of affected services and
+ asked to confirm
+
+
+Components Impacted
+===================
+Implementing this capability will require changes to the following;
+
+* cephadm
+
+ * Add maintenance subcommand with the following 'verbs'; enter, exit, check
+
+* mgr/cephadm
+
+ * add methods to CephadmOrchestrator for enter/exit and check
+ * data gathering would be skipped for hosts in a maintenance state
+
+* mgr/orchestrator
+
+ * add CLI commands to OrchestratorCli which expose the enter/exit and check interaction
+
+
+Ideas for Future Work
+=====================
+#. When a host is placed into maintenance, the time of the event could be persisted. This
+ would allow the orchestrator layer to establish a maintenance window for the task and
+ alert if the maintenance window has been exceeded.
+#. The maintenance process could support plugins to allow other integration tasks to be
+ initiated as part of the transition to and from maintenance. This plugin capability could
+ support actions like;
+
+ * alert suppression to 3rd party monitoring framework(s)
+ * service level reporting, to record outage windows
diff --git a/doc/dev/cephadm/index.rst b/doc/dev/cephadm/index.rst
new file mode 100644
index 000000000..a09baffdb
--- /dev/null
+++ b/doc/dev/cephadm/index.rst
@@ -0,0 +1,15 @@
+===================================
+CEPHADM Developer Documentation
+===================================
+
+.. rubric:: Contents
+
+.. toctree::
+ :maxdepth: 1
+
+
+ developing-cephadm
+ host-maintenance
+ compliance-check
+ cephadm-exporter
+ scalability-notes
diff --git a/doc/dev/cephadm/scalability-notes.rst b/doc/dev/cephadm/scalability-notes.rst
new file mode 100644
index 000000000..157153cb3
--- /dev/null
+++ b/doc/dev/cephadm/scalability-notes.rst
@@ -0,0 +1,95 @@
+#############################################
+ Notes and Thoughts on Cephadm's scalability
+#############################################
+
+*********************
+ About this document
+*********************
+
+This document does NOT define a specific proposal or some future work.
+Instead it merely lists a few thoughts that MIGHT be relevant for future
+cephadm enhacements.
+
+*******
+ Intro
+*******
+
+Current situation:
+
+Cephadm manages all registered hosts. This means that it periodically
+scrapes data from each host to identify changes on the host like:
+
+- disk added/removed
+- daemon added/removed
+- host network/firewall etc has changed
+
+Currently, cephadm scrapes each host (up to 10 in parallel) every 6
+minutes, unless a refresh is forced manually.
+
+Refreshes for disks (ceph-volume), daemons (podman/docker), etc, happen
+in sequence.
+
+With the cephadm exporter, we have now reduced the time to scan hosts
+considerably, but the question remains:
+
+Is the cephadm-exporter sufficient to solve all future scalability
+issues?
+
+***********************************************
+ Considerations of cephadm-exporter's REST API
+***********************************************
+
+The cephadm-exporter uses HTTP to serve an endpoint to the hosts
+metadata. We MIGHT encounter some issues with this approach, which need
+to be mitigated at some point.
+
+- With the cephadm-exporter we use SSH and HTTP to connect to each
+ host. Having two distinct transport layers feels odd, and we might
+ want to consider reducing it to only a single protocol.
+
+- The current approach of delivering ``bin/cephadm`` to the host doesn't
+ allow the use of external dependencies. This means that we're stuck
+ with the built-in HTTP server lib, which isn't great for providing a
+ good developer experience. ``bin/cephadm`` needs to be packaged and
+ distributed (one way or the other) for us to make use of a better
+ http server library.
+
+************************
+ MON's config-key store
+************************
+
+After the ``mgr/cephadm`` queried metadata from each host, cephadm stores
+the data within the mon's k-v store.
+
+If each host would be allowed to write their own metadata to the store,
+``mgr/cephadm`` would no longer be required to gather the data.
+
+Some questions arise:
+
+- ``mgr/cephadm`` now needs to query data from the config-key store,
+ instead of relying on cached data.
+
+- cephadm knows three different types of data: (1) Data that is
+ critical and needs to be stored in the config-key store. (2) Data
+ that can be kept in memory only. (3) Data that can be stored in
+ RADOS pool. How can we apply this idea to those different types of
+ data.
+
+*******************************
+ Increase the worker pool size
+*******************************
+
+``mgr/cephadm`` is currently able to scrape 10 nodes at the same time.
+
+The scrape of a individual host takes the same amount of time persists.
+We'd just reduce the overall execution time.
+
+At best we can reach O(hosts) + O(daemons).
+
+*************************
+ Backwards compatibility
+*************************
+
+Any changes need to be backwards compatible or completely isolated from
+any existing functionality. There are running cephadm clusters out there
+that require an upgrade path.
diff --git a/doc/dev/cephfs-mirroring.rst b/doc/dev/cephfs-mirroring.rst
new file mode 100644
index 000000000..3ca487c03
--- /dev/null
+++ b/doc/dev/cephfs-mirroring.rst
@@ -0,0 +1,403 @@
+================
+CephFS Mirroring
+================
+
+CephFS supports asynchronous replication of snapshots to a remote CephFS file system via
+`cephfs-mirror` tool. Snapshots are synchronized by mirroring snapshot data followed by
+creating a snapshot with the same name (for a given directory on the remote file system) as
+the snapshot being synchronized.
+
+Requirements
+------------
+
+The primary (local) and secondary (remote) Ceph clusters version should be Pacific or later.
+
+Key Idea
+--------
+
+For a given snapshot pair in a directory, `cephfs-mirror` daemon will rely on readdir diff
+to identify changes in a directory tree. The diffs are applied to directory in the remote
+file system thereby only synchronizing files that have changed between two snapshots.
+
+This feature is tracked here: https://tracker.ceph.com/issues/47034.
+
+Currently, snapshot data is synchronized by bulk copying to the remote filesystem.
+
+.. note:: Synchronizing hardlinks is not supported -- hardlinked files get synchronized
+ as separate files.
+
+Creating Users
+--------------
+
+Start by creating a user (on the primary/local cluster) for the mirror daemon. This user
+requires write capability on the metadata pool to create RADOS objects (index objects)
+for watch/notify operation and read capability on the data pool(s).
+
+ $ ceph auth get-or-create client.mirror mon 'profile cephfs-mirror' mds 'allow r' osd 'allow rw tag cephfs metadata=*, allow r tag cephfs data=*' mgr 'allow r'
+
+Create a user for each file system peer (on the secondary/remote cluster). This user needs
+to have full capabilities on the MDS (to take snapshots) and the OSDs::
+
+ $ ceph fs authorize <fs_name> client.mirror_remote / rwps
+
+This user should be used (as part of peer specification) when adding a peer.
+
+Starting Mirror Daemon
+----------------------
+
+Mirror daemon should be spawned using `systemctl(1)` unit files::
+
+ $ systemctl enable cephfs-mirror@mirror
+ $ systemctl start cephfs-mirror@mirror
+
+`cephfs-mirror` daemon can be run in foreground using::
+
+ $ cephfs-mirror --id mirror --cluster site-a -f
+
+.. note:: User used here is `mirror` as created in the `Creating Users` section.
+
+Mirroring Design
+----------------
+
+CephFS supports asynchronous replication of snapshots to a remote CephFS file system
+via `cephfs-mirror` tool. For a given directory, snapshots are synchronized by transferring
+snapshot data to the remote file system and creating a snapshot with the same name as the
+snapshot being synchronized.
+
+Snapshot Synchronization Order
+------------------------------
+
+Although the order in which snapshots get chosen for synchronization does not matter,
+snapshots are picked based on creation order (using snap-id).
+
+Snapshot Incarnation
+--------------------
+
+A snapshot may be deleted and recreated (with the same name) with different contents.
+An "old" snapshot could have been synchronized (earlier) and the recreation of the
+snapshot could have been done when mirroring was disabled. Using snapshot names to
+infer the point-of-continuation would result in the "new" snapshot (incarnation)
+never getting picked up for synchronization.
+
+Snapshots on the secondary file system stores the snap-id of the snapshot it was
+synchronized from. This metadata is stored in `SnapInfo` structure on the MDS.
+
+Interfaces
+----------
+
+`Mirroring` module (manager plugin) provides interfaces for managing directory snapshot
+mirroring. Manager interfaces are (mostly) wrappers around monitor commands for managing
+file system mirroring and is the recommended control interface.
+
+Mirroring Module and Interface
+------------------------------
+
+Mirroring module provides interface for managing directory snapshot mirroring. The module
+is implemented as a Ceph Manager plugin. Mirroring module does not manage spawning (and
+terminating) the mirror daemons. Right now the preferred way would be to start/stop
+mirror daemons via `systemctl(1)`. Going forward, deploying mirror daemons would be
+managed by `cephadm` (Tracker: http://tracker.ceph.com/issues/47261).
+
+The manager module is responsible for assigning directories to mirror daemons for
+synchronization. Multiple mirror daemons can be spawned to achieve concurrency in
+directory snapshot synchronization. When mirror daemons are spawned (or terminated)
+, the mirroring module discovers the modified set of mirror daemons and rebalances
+the directory assignment amongst the new set thus providing high-availability.
+
+.. note:: Multiple mirror daemons is currently untested. Only a single mirror daemon
+ is recommended.
+
+Mirroring module is disabled by default. To enable mirroring use::
+
+ $ ceph mgr module enable mirroring
+
+Mirroring module provides a family of commands to control mirroring of directory
+snapshots. To add or remove directories, mirroring needs to be enabled for a given
+file system. To enable mirroring use::
+
+ $ ceph fs snapshot mirror enable <fs_name>
+
+.. note:: Mirroring module commands use `fs snapshot mirror` prefix as compared to
+ the monitor commands which `fs mirror` prefix. Make sure to use module
+ commands.
+
+To disable mirroring, use::
+
+ $ ceph fs snapshot mirror disable <fs_name>
+
+Once mirroring is enabled, add a peer to which directory snapshots are to be mirrored.
+Peers follow `<client>@<cluster>` specification and get assigned a unique-id (UUID)
+when added. See `Creating Users` section on how to create Ceph users for mirroring.
+
+To add a peer use::
+
+ $ ceph fs snapshot mirror peer_add <fs_name> <remote_cluster_spec> [<remote_fs_name>] [<remote_mon_host>] [<cephx_key>]
+
+`<remote_fs_name>` is optional, and default to `<fs_name>` (on the remote cluster).
+
+This requires the remote cluster ceph configuration and user keyring to be available in
+the primary cluster. See `Bootstrap Peers` section to avoid this. `peer_add` additionally
+supports passing the remote cluster monitor address and the user key. However, bootstrapping
+a peer is the recommended way to add a peer.
+
+.. note:: Only a single peer is supported right now.
+
+To remove a peer use::
+
+ $ ceph fs snapshot mirror peer_remove <fs_name> <peer_uuid>
+
+.. note:: See `Mirror Daemon Status` section on how to figure out Peer UUID.
+
+To list file system mirror peers use::
+
+ $ ceph fs snapshot mirror peer_list <fs_name>
+
+To configure a directory for mirroring, use::
+
+ $ ceph fs snapshot mirror add <fs_name> <path>
+
+To stop a mirroring directory snapshots use::
+
+ $ ceph fs snapshot mirror remove <fs_name> <path>
+
+Only absolute directory paths are allowed. Also, paths are normalized by the mirroring
+module, therfore, `/a/b/../b` is equivalent to `/a/b`.
+
+ $ mkdir -p /d0/d1/d2
+ $ ceph fs snapshot mirror add cephfs /d0/d1/d2
+ {}
+ $ ceph fs snapshot mirror add cephfs /d0/d1/../d1/d2
+ Error EEXIST: directory /d0/d1/d2 is already tracked
+
+Once a directory is added for mirroring, its subdirectory or ancestor directories are
+disallowed to be added for mirorring::
+
+ $ ceph fs snapshot mirror add cephfs /d0/d1
+ Error EINVAL: /d0/d1 is a ancestor of tracked path /d0/d1/d2
+ $ ceph fs snapshot mirror add cephfs /d0/d1/d2/d3
+ Error EINVAL: /d0/d1/d2/d3 is a subtree of tracked path /d0/d1/d2
+
+Commands to check directory mapping (to mirror daemons) and directory distribution are
+detailed in `Mirror Daemon Status` section.
+
+Bootstrap Peers
+---------------
+
+Adding a peer (via `peer_add`) requires the peer cluster configuration and user keyring
+to be available in the primary cluster (manager host and hosts running the mirror daemon).
+This can be avoided by bootstrapping and importing a peer token. Peer bootstrap involves
+creating a bootstrap token on the peer cluster via::
+
+ $ ceph fs snapshot mirror peer_bootstrap create <fs_name> <client_entity> <site-name>
+
+e.g.::
+
+ $ ceph fs snapshot mirror peer_bootstrap create backup_fs client.mirror_remote site-remote
+ {"token": "eyJmc2lkIjogIjBkZjE3MjE3LWRmY2QtNDAzMC05MDc5LTM2Nzk4NTVkNDJlZiIsICJmaWxlc3lzdGVtIjogImJhY2t1cF9mcyIsICJ1c2VyIjogImNsaWVudC5taXJyb3JfcGVlcl9ib290c3RyYXAiLCAic2l0ZV9uYW1lIjogInNpdGUtcmVtb3RlIiwgImtleSI6ICJBUUFhcDBCZ0xtRmpOeEFBVnNyZXozai9YYUV0T2UrbUJEZlJDZz09IiwgIm1vbl9ob3N0IjogIlt2MjoxOTIuMTY4LjAuNTo0MDkxOCx2MToxOTIuMTY4LjAuNTo0MDkxOV0ifQ=="}
+
+`site-name` refers to a user-defined string to identify the remote filesystem. In context
+of `peer_add` interface, `site-name` is the passed in `cluster` name from `remote_cluster_spec`.
+
+Import the bootstrap token in the primary cluster via::
+
+ $ ceph fs snapshot mirror peer_bootstrap import <fs_name> <token>
+
+e.g.::
+
+ $ ceph fs snapshot mirror peer_bootstrap import cephfs eyJmc2lkIjogIjBkZjE3MjE3LWRmY2QtNDAzMC05MDc5LTM2Nzk4NTVkNDJlZiIsICJmaWxlc3lzdGVtIjogImJhY2t1cF9mcyIsICJ1c2VyIjogImNsaWVudC5taXJyb3JfcGVlcl9ib290c3RyYXAiLCAic2l0ZV9uYW1lIjogInNpdGUtcmVtb3RlIiwgImtleSI6ICJBUUFhcDBCZ0xtRmpOeEFBVnNyZXozai9YYUV0T2UrbUJEZlJDZz09IiwgIm1vbl9ob3N0IjogIlt2MjoxOTIuMTY4LjAuNTo0MDkxOCx2MToxOTIuMTY4LjAuNTo0MDkxOV0ifQ==
+
+Mirror Daemon Status
+--------------------
+
+Mirror daemons get asynchronously notified about changes in file system mirroring status
+and/or peer updates.
+
+CephFS mirroring module provides `mirror daemon status` interface to check mirror daemon
+status::
+
+ $ ceph fs snapshot mirror daemon status
+
+E.g::
+
+ $ ceph fs snapshot mirror daemon status | jq
+ [
+ {
+ "daemon_id": 284167,
+ "filesystems": [
+ {
+ "filesystem_id": 1,
+ "name": "a",
+ "directory_count": 1,
+ "peers": [
+ {
+ "uuid": "02117353-8cd1-44db-976b-eb20609aa160",
+ "remote": {
+ "client_name": "client.mirror_remote",
+ "cluster_name": "ceph",
+ "fs_name": "backup_fs"
+ },
+ "stats": {
+ "failure_count": 1,
+ "recovery_count": 0
+ }
+ }
+ ]
+ }
+ ]
+ }
+ ]
+
+An entry per mirror daemon instance is displayed along with information such as configured
+peers and basic stats. For more detailed stats, use the admin socket interface as detailed
+below.
+
+CephFS mirror daemons provide admin socket commands for querying mirror status. To check
+available commands for mirror status use::
+
+ $ ceph --admin-daemon /path/to/mirror/daemon/admin/socket help
+ {
+ ....
+ ....
+ "fs mirror status cephfs@360": "get filesystem mirror status",
+ ....
+ ....
+ }
+
+Commands with `fs mirror status` prefix provide mirror status for mirror enabled
+file systems. Note that `cephfs@360` is of format `filesystem-name@filesystem-id`.
+This format is required since mirror daemons get asynchronously notified regarding
+file system mirror status (A file system can be deleted and recreated with the same
+name).
+
+Right now, the command provides minimal information regarding mirror status::
+
+ $ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror status cephfs@360
+ {
+ "rados_inst": "192.168.0.5:0/1476644347",
+ "peers": {
+ "a2dc7784-e7a1-4723-b103-03ee8d8768f8": {
+ "remote": {
+ "client_name": "client.mirror_remote",
+ "cluster_name": "site-a",
+ "fs_name": "backup_fs"
+ }
+ }
+ },
+ "snap_dirs": {
+ "dir_count": 1
+ }
+ }
+
+`Peers` section in the command output above shows the peer information such as unique
+peer-id (UUID) and specification. The peer-id is required to remove an existing peer
+as mentioned in the `Mirror Module and Interface` section.
+
+Command with `fs mirror peer status` prefix provide peer synchronization status. This
+command is of format `filesystem-name@filesystem-id peer-uuid`::
+
+ $ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
+ {
+ "/d0": {
+ "state": "idle",
+ "last_synced_snap": {
+ "id": 120,
+ "name": "snap1",
+ "sync_duration": 0.079997898999999997,
+ "sync_time_stamp": "274900.558797s"
+ },
+ "snaps_synced": 2,
+ "snaps_deleted": 0,
+ "snaps_renamed": 0
+ }
+ }
+
+Synchronization stats such as `snaps_synced`, `snaps_deleted` and `snaps_renamed` are reset
+on daemon restart and/or when a directory is reassigned to another mirror daemon (when
+multiple mirror daemons are deployed).
+
+A directory can be in one of the following states::
+
+ - `idle`: The directory is currently not being synchronized
+ - `syncing`: The directory is currently being synchronized
+ - `failed`: The directory has hit upper limit of consecutive failures
+
+When a directory hits a configured number of consecutive synchronization failures, the
+mirror daemon marks it as `failed`. Synchronization for these directories are retried.
+By default, the number of consecutive failures before a directory is marked as failed
+is controlled by `cephfs_mirror_max_consecutive_failures_per_directory` configuration
+option (default: 10) and the retry interval for failed directories is controlled via
+`cephfs_mirror_retry_failed_directories_interval` configuration option (default: 60s).
+
+E.g., adding a regular file for synchronization would result in failed status::
+
+ $ ceph fs snapshot mirror add cephfs /f0
+ $ ceph --admin-daemon /var/run/ceph/cephfs-mirror.asok fs mirror peer status cephfs@360 a2dc7784-e7a1-4723-b103-03ee8d8768f8
+ {
+ "/d0": {
+ "state": "idle",
+ "last_synced_snap": {
+ "id": 120,
+ "name": "snap1",
+ "sync_duration": 0.079997898999999997,
+ "sync_time_stamp": "274900.558797s"
+ },
+ "snaps_synced": 2,
+ "snaps_deleted": 0,
+ "snaps_renamed": 0
+ },
+ "/f0": {
+ "state": "failed",
+ "snaps_synced": 0,
+ "snaps_deleted": 0,
+ "snaps_renamed": 0
+ }
+ }
+
+This allows a user to add a non-existent directory for synchronization. The mirror daemon
+would mark the directory as failed and retry (less frequently). When the directory comes
+to existence, the mirror daemons would unmark the failed state upon successfull snapshot
+synchronization.
+
+When mirroring is disabled, the respective `fs mirror status` command for the file system
+will not show up in command help.
+
+Mirroring module provides a couple of commands to display directory mapping and distribution
+information. To check which mirror daemon a directory has been mapped to use::
+
+ $ ceph fs snapshot mirror dirmap cephfs /d0/d1/d2
+ {
+ "instance_id": "404148",
+ "last_shuffled": 1601284516.10986,
+ "state": "mapped"
+ }
+
+.. note:: `instance_id` is the RAODS instance-id associated with a mirror daemon.
+
+Other information such as `state` and `last_shuffled` are interesting when running
+multiple mirror daemons.
+
+When no mirror daemons are running the above command shows::
+
+ $ ceph fs snapshot mirror dirmap cephfs /d0/d1/d2
+ {
+ "reason": "no mirror daemons running",
+ "state": "stalled"
+ }
+
+Signifying that no mirror daemons are running and mirroring is stalled.
+
+Re-adding Peers
+---------------
+
+When re-adding (reassigning) a peer to a file system in another cluster, ensure that
+all mirror daemons have stopped synchronization to the peer. This can be checked
+via `fs mirror status` admin socket command (the `Peer UUID` should not show up
+in the command output). Also, it is recommended to purge synchronized directories
+from the peer before re-adding it to another file system (especially those directories
+which might exist in the new primary file system). This is not required if re-adding
+a peer to the same primary file system it was earlier synchronized from.
+
+Feature Status
+--------------
+
+`cephfs-mirror` daemon is built by default (follows `WITH_CEPHFS` CMake rule).
diff --git a/doc/dev/cephfs-reclaim.rst b/doc/dev/cephfs-reclaim.rst
new file mode 100644
index 000000000..ce56f5eaa
--- /dev/null
+++ b/doc/dev/cephfs-reclaim.rst
@@ -0,0 +1,104 @@
+CephFS Reclaim Interface
+========================
+
+Introduction
+------------
+NFS servers typically do not track ephemeral state on stable storage. If
+the NFS server is restarted, then it will be resurrected with no
+ephemeral state, and the NFS clients are expected to send requests to
+reclaim what state they held during a grace period.
+
+In order to support this use-case, libcephfs has grown several functions
+that allow a client that has been stopped and restarted to destroy or
+reclaim state held by a previous incarnation of itself. This allows the
+client to reacquire state held by its previous incarnation, and to avoid
+the long wait for the old session to time out before releasing the state
+previously held.
+
+As soon as an NFS server running over cephfs goes down, it's racing
+against its MDS session timeout. If the Ceph session times out before
+the NFS grace period is started, then conflicting state could be
+acquired by another client. This mechanism also allows us to increase
+the timeout for these clients, to ensure that the server has a long
+window of time to be restarted.
+
+Setting the UUID
+----------------
+In order to properly reset or reclaim against the old session, we need a
+way to identify the old session. This done by setting a unique opaque
+value on the session using **ceph_set_uuid()**. The uuid value can be
+any string and is treated as opaque by the client.
+
+Setting the uuid directly can only be done on a new session, prior to
+mounting. When reclaim is performed the current session will inherit the
+old session's uuid.
+
+Starting Reclaim
+----------------
+After calling ceph_create and ceph_init on the resulting struct
+ceph_mount_info, the client should then issue ceph_start_reclaim,
+passing in the uuid of the previous incarnation of the client with any
+flags.
+
+CEPH_RECLAIM_RESET
+ This flag indicates that we do not intend to do any sort of reclaim
+ against the old session indicated by the given uuid, and that it
+ should just be discarded. Any state held by the previous client
+ should be released immediately.
+
+Finishing Reclaim
+-----------------
+After the Ceph client has completed all of its reclaim operations, the
+client should issue ceph_finish_reclaim to indicate that the reclaim is
+now complete.
+
+Setting Session Timeout (Optional)
+----------------------------------
+When a client dies and is restarted, and we need to preserve its state,
+we are effectively racing against the session expiration clock. In this
+situation we generally want a longer timeout since we expect to
+eventually kill off the old session manually.
+
+Example 1: Reset Old Session
+----------------------------
+This example just kills off the MDS session held by a previous instance
+of itself. An NFS server can start a grace period and then ask the MDS
+to tear down the old session. This allows clients to start reclaim
+immediately.
+
+(Note: error handling omitted for clarity)
+
+.. code-block:: c
+
+ struct ceph_mount_info *cmount;
+ const char *uuid = "foobarbaz";
+
+ /* Set up a new cephfs session, but don't mount it yet. */
+ rc = ceph_create(&cmount);
+ rc = ceph_init(&cmount);
+
+ /*
+ * Set the timeout to 5 minutes to lengthen the window of time for
+ * the server to restart, should it crash.
+ */
+ ceph_set_session_timeout(cmount, 300);
+
+ /*
+ * Start reclaim vs. session with old uuid. Before calling this,
+ * all NFS servers that could acquire conflicting state _must_ be
+ * enforcing their grace period locally.
+ */
+ rc = ceph_start_reclaim(cmount, uuid, CEPH_RECLAIM_RESET);
+
+ /* Declare reclaim complete */
+ rc = ceph_finish_reclaim(cmount);
+
+ /* Set uuid held by new session */
+ ceph_set_uuid(cmount, nodeid);
+
+ /*
+ * Now mount up the file system and do normal open/lock operations to
+ * satisfy reclaim requests.
+ */
+ ceph_mount(cmount, rootpath);
+ ...
diff --git a/doc/dev/cephfs-snapshots.rst b/doc/dev/cephfs-snapshots.rst
new file mode 100644
index 000000000..423240b97
--- /dev/null
+++ b/doc/dev/cephfs-snapshots.rst
@@ -0,0 +1,133 @@
+CephFS Snapshots
+================
+
+CephFS supports snapshots, generally created by invoking mkdir within the
+``.snap`` directory. Note this is a hidden, special directory, not visible
+during a directory listing.
+
+Overview
+-----------
+
+Generally, snapshots do what they sound like: they create an immutable view
+of the file system at the point in time they're taken. There are some headline
+features that make CephFS snapshots different from what you might expect:
+
+* Arbitrary subtrees. Snapshots are created within any directory you choose,
+ and cover all data in the file system under that directory.
+* Asynchronous. If you create a snapshot, buffered data is flushed out lazily,
+ including from other clients. As a result, "creating" the snapshot is
+ very fast.
+
+Important Data Structures
+-------------------------
+* SnapRealm: A `SnapRealm` is created whenever you create a snapshot at a new
+ point in the hierarchy (or, when a snapshotted inode is move outside of its
+ parent snapshot). SnapRealms contain an `sr_t srnode`, and `inodes_with_caps`
+ that are part of the snapshot. Clients also have a SnapRealm concept that
+ maintains less data but is used to associate a `SnapContext` with each open
+ file for writing.
+* sr_t: An `sr_t` is the on-disk snapshot metadata. It is part of the containing
+ directory and contains sequence counters, timestamps, the list of associated
+ snapshot IDs, and `past_parent_snaps`.
+* SnapServer: SnapServer manages snapshot ID allocation, snapshot deletion and
+ tracks list of effective snapshots in the file system. A file system only has
+ one instance of snapserver.
+* SnapClient: SnapClient is used to communicate with snapserver, each MDS rank
+ has its own snapclient instance. SnapClient also caches effective snapshots
+ locally.
+
+Creating a snapshot
+-------------------
+CephFS snapshot feature is enabled by default on new file system. To enable it
+on existing file systems, use command below.
+
+.. code::
+
+ $ ceph fs set <fs_name> allow_new_snaps true
+
+When snapshots are enabled, all directories in CephFS will have a special
+``.snap`` directory. (You may configure a different name with the ``client
+snapdir`` setting if you wish.)
+
+To create a CephFS snapshot, create a subdirectory under
+``.snap`` with a name of your choice. For example, to create a snapshot on
+directory "/1/2/3/", invoke ``mkdir /1/2/3/.snap/my-snapshot-name`` .
+
+This is transmitted to the MDS Server as a
+CEPH_MDS_OP_MKSNAP-tagged `MClientRequest`, and initially handled in
+Server::handle_client_mksnap(). It allocates a `snapid` from the `SnapServer`,
+projects a new inode with the new SnapRealm, and commits it to the MDLog as
+usual. When committed, it invokes
+`MDCache::do_realm_invalidate_and_update_notify()`, which notifies all clients
+with caps on files under "/1/2/3/", about the new SnapRealm. When clients get
+the notifications, they update client-side SnapRealm hierarchy, link files
+under "/1/2/3/" to the new SnapRealm and generate a `SnapContext` for the
+new SnapRealm.
+
+Note that this *is not* a synchronous part of the snapshot creation!
+
+Updating a snapshot
+-------------------
+If you delete a snapshot, a similar process is followed. If you remove an inode
+out of its parent SnapRealm, the rename code creates a new SnapRealm for the
+renamed inode (if SnapRealm does not already exist), saves IDs of snapshots that
+are effective on the original parent SnapRealm into `past_parent_snaps` of the
+new SnapRealm, then follows a process similar to creating snapshot.
+
+Generating a SnapContext
+------------------------
+A RADOS `SnapContext` consists of a snapshot sequence ID (`snapid`) and all
+the snapshot IDs that an object is already part of. To generate that list, we
+combine `snapids` associated with the SnapRealm and all valid `snapids` in
+`past_parent_snaps`. Stale `snapids` are filtered out by SnapClient's cached
+effective snapshots.
+
+Storing snapshot data
+---------------------
+File data is stored in RADOS "self-managed" snapshots. Clients are careful to
+use the correct `SnapContext` when writing file data to the OSDs.
+
+Storing snapshot metadata
+-------------------------
+Snapshotted dentries (and their inodes) are stored in-line as part of the
+directory they were in at the time of the snapshot. *All dentries* include a
+`first` and `last` snapid for which they are valid. (Non-snapshotted dentries
+will have their `last` set to CEPH_NOSNAP).
+
+Snapshot writeback
+------------------
+There is a great deal of code to handle writeback efficiently. When a Client
+receives an `MClientSnap` message, it updates the local `SnapRealm`
+representation and its links to specific `Inodes`, and generates a `CapSnap`
+for the `Inode`. The `CapSnap` is flushed out as part of capability writeback,
+and if there is dirty data the `CapSnap` is used to block fresh data writes
+until the snapshot is completely flushed to the OSDs.
+
+In the MDS, we generate snapshot-representing dentries as part of the regular
+process for flushing them. Dentries with outstanding `CapSnap` data is kept
+pinned and in the journal.
+
+Deleting snapshots
+------------------
+Snapshots are deleted by invoking "rmdir" on the ".snap" directory they are
+rooted in. (Attempts to delete a directory which roots snapshots *will fail*;
+you must delete the snapshots first.) Once deleted, they are entered into the
+`OSDMap` list of deleted snapshots and the file data is removed by the OSDs.
+Metadata is cleaned up as the directory objects are read in and written back
+out again.
+
+Hard links
+----------
+Inode with multiple hard links is moved to a dummy global SnapRealm. The
+dummy SnapRealm covers all snapshots in the file system. The inode's data
+will be preserved for any new snapshot. These preserved data will cover
+snapshots on any linkage of the inode.
+
+Multi-FS
+---------
+Snapshots and multiple file systems don't interact well. Specifically, each
+MDS cluster allocates `snapids` independently; if you have multiple file systems
+sharing a single pool (via namespaces), their snapshots *will* collide and
+deleting one will result in missing file data for others. (This may even be
+invisible, not throwing errors to the user.) If each FS gets its own
+pool things probably work, but this isn't tested and may not be true.
diff --git a/doc/dev/cephx.rst b/doc/dev/cephx.rst
new file mode 100644
index 000000000..27e501f99
--- /dev/null
+++ b/doc/dev/cephx.rst
@@ -0,0 +1,406 @@
+=====
+Cephx
+=====
+
+.. _cephx:
+
+Intro
+-----
+
+The protocol design looks a lot like kerberos. The authorizer "KDC"
+role is served by the monitor, who has a database of shared secrets
+for each entity. Clients and non-monitor daemons all start by
+authenticating with the monitor to obtain tickets, mostly referreed to
+in the code as authorizers. These tickets provide both
+*authentication* and *authorization* in that they include a
+description of the *capabilities* for the entity, a concise structured
+description of what actions are allowed, that can be interpreted and
+enforced by the service daemons.
+
+Other references
+----------------
+
+- A write-up from 2012 on cephx as it existed at that time by Peter
+ Reiher: :ref:`cephx_2012_peter`
+
+Terms
+-----
+
+- *monitor(s)*: central authorization authority
+- *service*: the set of all daemons of a particular type (e.g., all
+ OSDs, all MDSs)
+- *client*: an entity or principal that is accessing the service
+- *entity name*: the string identifier for a principal
+ (e.g. client.admin, osd.123)
+- *ticket*: a bit of data that cryptographically asserts identify and
+ authorization
+
+- *principal*: a client or daemon, identified by a unique entity_name,
+ that shares a secret with the monitor.
+- *principal_secret*: principal secret, a shared secret (16 bytes)
+ known by the principal and the monitor
+- *mon_secret*: monitor secret, a shared secret known by all monitors
+- *service_secret*: a rotating secret known by all members of a
+ service class (e.g., all OSDs)
+
+- *auth ticket*: a ticket proving identity to the monitors
+- *service ticket*: a ticket proving identify and authorization to a
+ service
+
+
+Terminology
+-----------
+
+``{foo, bar}^secret`` denotes encryption by secret.
+
+
+Context
+-------
+
+The authentication messages described here are specific to the cephx
+auth implementation. The messages are transferred by the Messenger
+protocol or by MAuth messages, depending on the version of the
+messenger protocol. See also :ref:`msgr2-protocol`.
+
+An initial (messenger) handshake negotiates an authentication method
+to be used (cephx vs none or krb or whatever) and an assertion of what
+entity the client or daemon is attempting to authenticate as.
+
+Phase I: obtaining auth ticket
+------------------------------
+
+The cephx exchange begins with the monitor knowing who the client
+claims to be, and an initial cephx message from the monitor to the
+client/principal.::
+
+ a->p :
+ CephxServerChallenge {
+ u64 server_challenge # random (by server)
+ }
+
+The client responds by adding its own challenge, and calculating a
+value derived from both challenges and its shared key
+principal_secret.::
+
+ p->a :
+ CephxRequestHeader {
+ u16 CEPHX_GET_AUTH_SESSION_KEY
+ }
+ CephXAuthenticate {
+ u8 2 # 2 means nautilus+
+ u64 client_challenge # random (by client)
+ u64 key = {client_challenge ^ server_challenge}^principal_secret # (roughly)
+ blob old_ticket # old ticket, if we are reconnecting or renewing
+ u32 other_keys # bit mask of service keys we want
+ }
+
+Prior to nautilus,::
+
+ CephXAuthenticate {
+ u8 1 # 2 means nautilus+
+ u64 client_challenge # random (by client)
+ u64 key = {client_challenge + server_challenge}^principal_secret # (roughly)
+ blob old_ticket # old ticket, if we are reconnecting or renewing
+ }
+
+The monitor looks up principal_secret in database, and verifies the
+key is correct. If old_ticket is present, verify it is valid, and we
+can reuse the same global_id. (Otherwise, a new global_id is assigned
+by the monitor.)::
+
+ a->p :
+ CephxReplyHeader {
+ u16 CEPHX_GET_AUTH_SESSION_KEY
+ s32 result (0)
+ }
+ u8 encoding_version = 1
+ u32 num_tickets ( = 1)
+ ticket_info # (N = 1)
+
+plus (for Nautilus and later)::
+
+ u32 connection_secret_len # in bytes
+ connection_secret^session_key
+ u32 other_keys_len # bytes of other keys (encoded)
+ other_keys {
+ u8 encoding_version = 1
+ u32 num_tickets
+ service_ticket_info * N # for each service ticket
+ }
+
+where::
+
+ ticket_info {
+ u32 service_id # CEPH_ENTITY_TYPE_AUTH
+ u8 msg_version (1)
+ {CephXServiceTicket service_ticket}^principal_secret
+ {CephxTicketBlob ticket_blob}^existing session_key # if we are renewing a ticket,
+ CephxTicketBlob ticket_blob # otherwise
+ }
+
+ service_ticket_info {
+ u32 service_id # CEPH_ENTITY_TYPE_{OSD,MDS,MGR}
+ u8 msg_version (1)
+ {CephXServiceTicket service_ticket}^principal_secret
+ CephxTicketBlob ticket_blob
+ }
+
+ CephxServiceTicket {
+ CryptoKey session_key # freshly generated (even if old_ticket is present)
+ utime_t expiration # now + auth_mon_ticket_ttl
+ }
+
+ CephxTicketBlob {
+ u64 secret_id # which service ticket encrypted this; -1 == monsecret, otherwise service's rotating key id
+ {CephXServiceTicketInfo ticket}^mon_secret
+ }
+
+ CephxServiceTicketInfo {
+ CryptoKey session_key # same session_key as above
+ AuthTicket ticket
+ }
+
+ AuthTicket {
+ EntityName name # client's identity, as proven by its possession of principal_secret
+ u64 global_id # newly assigned, or from old_ticket
+ utime_t created, renew_after, expires
+ AuthCapsInfo # what client is allowed to do
+ u32 flags = 0 # unused
+ }
+
+So: for each ticket, principal gets a part that it decrypts with its
+secret to get the session_key (CephxServiceTicket). And the
+CephxTicketBlob is opaque (secured by the mon secret) but can be used
+later to prove who we are and what we can do (see CephxAuthorizer
+below).
+
+For Nautilus+, we also include the service tickets.
+
+The client can infer that the monitor is authentic because it can
+decrypt the service_ticket with its secret (i.e., the server has its
+secret key).
+
+
+Phase II: Obtaining service tickets (pre-nautilus)
+--------------------------------------------------
+
+Now the client needs the keys used to talk to non-monitors (osd, mds,
+mgr).::
+
+ p->a :
+ CephxRequestHeader {
+ u16 CEPHX_GET_PRINCIPAL_SESSION_KEY
+ }
+ CephxAuthorizer authorizer
+ CephxServiceTicketRequest {
+ u32 keys # bitmask of CEPH_ENTITY_TYPE_NAME (MGR, OSD, MDS, etc)
+ }
+
+where::
+
+ CephxAuthorizer {
+ u8 AUTH_MODE_AUTHORIZER (1)
+ u64 global_id
+ u32 service_id # CEPH_ENTITY_TYPE_*
+ CephxTicketBlob auth_ticket
+ {CephxAuthorize msg}^session_key
+ }
+
+ CephxAuthorize msg {
+ u8 2
+ u64 nonce # random from client
+ bool have_challenge = false # not used here
+ u64 server_challenge_plus_one = 0 # not used here
+ }
+
+The monitor validates the authorizer by decrypting the auth_ticket
+with ``mon_secret`` and confirming that it says this principal is who
+they say they are in the CephxAuthorizer fields. Note that the nonce
+random bytes aren't used here (the field exists for Phase III below).
+
+Assuming all is well, the authorizer can generate service tickets
+based on the CEPH_ENTITY_TYPE_* bits in the ``keys`` bitmask.
+
+The response looks like::
+
+ CephxResponseHeader {
+ u16 CEPHX_GET_PRINCIPAL_SESSION_KEY
+ s32 result (= 0)
+ }
+ u8 encoding_version = 1
+ u32 num_tickets
+ ticket_info * N
+
+Where, as above,::
+
+ ticket_info {
+ u32 service_id # CEPH_ENTITY_TYPE_{OSD,MGR,MDS}
+ u8 msg_version (1)
+ {CephXServiceTicket service_ticket}^principal_secret
+ CephxTicketBlob ticket_blob
+ }
+
+ CephxServiceTicket {
+ CryptoKey session_key
+ utime_t expiration
+ }
+
+ CephxTicketBlob {
+ u64 secret_id # which version of the (rotating) service ticket encrypted this
+ {CephXServiceTicketInfo ticket}^rotating_service_secret
+ }
+
+ CephxServiceTicketInfo {
+ CryptoKey session_key
+ AuthTicket ticket
+ }
+
+ AuthTicket {
+ EntityName name
+ u64 global_id
+ utime_t created, renew_after, expires
+ AuthCapsInfo # what you are allowed to do
+ u32 flags = 0 # unused
+ }
+
+This concludes the authentication exchange with the monitor. The
+client or daemon now has tickets to talk to the mon and all other
+daemons of interest.
+
+
+Phase III: Opening a connection to a service
+--------------------------------------------
+
+When a connection is opened, an "authorizer" payload is sent::
+
+ p->s :
+ CephxAuthorizer {
+ u8 AUTH_MODE_AUTHORIZER (1)
+ u64 global_id
+ u32 service_id # CEPH_ENTITY_TYPE_*
+ CephxTicketBlob auth_ticket
+ {CephxAuthorize msg}^session_key
+ }
+
+ CephxAuthorize msg {
+ u8 2
+ u64 nonce # random from client
+ bool have_challenge = false
+ u64 server_challenge_plus_one = 0
+ }
+
+Note that prior to the Luminous v12.2.6 or Mimic v13.2.2 releases, the
+CephxAuthorize msg did not contain a challenge, and consisted only
+of::
+
+ CephxAuthorize msg {
+ u8 1
+ u64 nonce # random from client
+ }
+
+The server will inspect the auth_ticket CephxTicketBlob (by decrypting
+it with its current rotating service key). If it is a pre-v12.2.6 or
+pre-v13.2.2 client, the server immediately replies with::
+
+ s->p :
+ {CephxAuthorizeReply reply}^session_key
+
+where::
+
+ CephxAuthorizeReply {
+ u64 nonce_plus_one
+ }
+
+Otherwise, the server will respond with a challenge (to prevent replay
+attacks)::
+
+ s->p :
+ {CephxAuthorizeChallenge challenge}^session_key
+
+where::
+
+ CephxAuthorizeChallenge {
+ u64 server_challenge # random from server
+ }
+
+The client decrypts and updates its CephxAuthorize msg accordingly,
+resending most of the same information as before::
+
+ p->s :
+ CephxAuthorizer {
+ u8 AUTH_MODE_AUTHORIZER (1)
+ u64 global_id
+ u32 service_id # CEPH_ENTITY_TYPE_*
+ CephxTicketBlob auth_ticket
+ {CephxAuthorize msg}^session_key
+ }
+
+where::
+
+ CephxAuthorize msg {
+ u8 2
+ u64 nonce # (new) random from client
+ bool have_challenge = true
+ u64 server_challenge_plus_one # server_challenge + 1
+ }
+
+The server validates the ticket as before, and then also verifies the
+msg nonce has it's challenge + 1, confirming this is a live
+authentication attempt (not a replay).
+
+Finally, the server responds with a reply that proves its authenticity
+to the client. It also includes some entropy to use for encryption of
+the session, if it is needed for the mode.::
+
+ s->p :
+ {CephxAuthorizeReply reply}^session_key
+
+where::
+
+ CephxAuthorizeReply {
+ u64 nonce_plus_one
+ u32 connection_secret_length
+ connection secret
+ }
+
+Prior to nautilus, there is no connection secret::
+
+ CephxAuthorizeReply {
+ u64 nonce_plus_one
+ }
+
+The client decrypts and confirms that the server incremented nonce
+properly and that this is thus a live authentication request and not a
+replay.
+
+
+Rotating service secrets
+------------------------
+
+Daemons make use of a rotating secret for their tickets instead of a
+fixed secret in order to limit the severity of a compromised daemon.
+If a daemon's secret key is compromised by an attacker, that daemon
+and its key can be removed from the monitor's database, but the
+attacker may also have obtained a copy of the service secret shared by
+all daemons. To mitigate this, service keys rotate periodically so
+that after a period of time (auth_service_ticket_ttl) the key the
+attacker obtained will no longer be valid.::
+
+ p->a :
+ CephxRequestHeader {
+ u16 CEPHX_GET_ROTATING_KEY
+ }
+
+ a->p :
+ CephxReplyHeader {
+ u16 CEPHX_GET_ROTATING_KEY
+ s32 result = 0
+ }
+ {CryptoKey service_key}^principal_secret
+
+That is, the new rotating key is simply protected by the daemon's
+rotating secret.
+
+Note that, as an implementation detail, the services keep the current
+key and the prior key on hand so that the can continue to validate
+requests while the key is being rotated.
diff --git a/doc/dev/cephx_protocol.rst b/doc/dev/cephx_protocol.rst
new file mode 100644
index 000000000..7b8c17876
--- /dev/null
+++ b/doc/dev/cephx_protocol.rst
@@ -0,0 +1,341 @@
+.. _cephx_2012_peter:
+
+============================================================
+A Detailed Description of the Cephx Authentication Protocol
+============================================================
+
+Peter Reiher
+7/13/12
+
+This document provides deeper detail on the Cephx authorization protocol whose high level flow
+is described in the memo by Yehuda (12/19/09). Because this memo discusses details of
+routines called and variables used, it represents a snapshot. The code might be changed
+subsequent to the creation of this document, and the document is not likely to be updated in
+lockstep. With luck, code comments will indicate major changes in the way the protocol is
+implemented.
+
+Introduction
+-------------
+
+The basic idea of the protocol is based on Kerberos. A client wishes to obtain something from
+a server. The server will only offer the requested service to authorized clients. Rather
+than requiring each server to deal with authentication and authorization issues, the system
+uses an authorization server. Thus, the client must first communicate with the authorization
+server to authenticate itself and to obtain credentials that will grant it access to the
+service it wants.
+
+Authorization is not the same as authentication. Authentication provides evidence that some
+party is who it claims to be. Authorization provides evidence that a particular party is
+allowed to do something. Generally, secure authorization implies secure authentication
+(since without authentication, you may authorize something for an imposter), but the reverse
+is not necessarily true. One can authenticate without authorizing. The purpose
+of this protocol is to authorize.
+
+The basic approach is to use symmetric cryptography throughout. Each client C has its own
+secret key, known only to itself and the authorization server A. Each server S has its own
+secret key, known only to itself and the authorization server A. Authorization information
+will be passed in tickets, encrypted with the secret key of the entity that offers the service.
+There will be a ticket that A gives to C, which permits C to ask A for other tickets. This
+ticket will be encrypted with A's key, since A is the one who needs to check it. There will
+later be tickets that A issues that allow C to communicate with S to ask for service. These
+tickets will be encrypted with S's key, since S needs to check them. Since we wish to provide
+security of the communications, as well, session keys are set up along with the tickets.
+Currently, those session keys are only used for authentication purposes during this protocol
+and the handshake between the client C and the server S, when the client provides its service
+ticket. They could be used for authentication or secrecy throughout, with some changes to
+the system.
+
+Several parties need to prove something to each other if this protocol is to achieve its
+desired security effects.
+
+1. The client C must prove to the authenticator A that it really is C. Since everything
+is being done via messages, the client must also prove that the message proving authenticity
+is fresh, and is not being replayed by an attacker.
+
+2. The authenticator A must prove to client C that it really is the authenticator. Again,
+proof that replay is not occurring is also required.
+
+3. A and C must securely share a session key to be used for distribution of later
+authorization material between them. Again, no replay is allowable, and the key must be
+known only to A and C.
+
+4. A must receive evidence from C that allows A to look up C's authorized operations with
+server S.
+
+5. C must receive a ticket from A that will prove to S that C can perform its authorized
+operations. This ticket must be usable only by C.
+
+6. C must receive from A a session key to protect the communications between C and S. The
+session key must be fresh and not the result of a replay.
+
+Getting Started With Authorization
+-----------------------------------
+
+When the client first needs to get service, it contacts the monitor. At the moment, it has
+no tickets. Therefore, it uses the "unknown" protocol to talk to the monitor. This protocol
+is specified as ``CEPH_AUTH_UNKNOWN``. The monitor also takes on the authentication server
+role, A. The remainder of the communications will use the cephx protocol (most of whose code
+will be found in files in ``auth/cephx``). This protocol is responsible for creating and
+communicating the tickets spoken of above.
+
+Currently, this document does not follow the pre-cephx protocol flow. It starts up at the
+point where the client has contacted the server and is ready to start the cephx protocol itself.
+
+Once we are in the cephx protocol, we can get the tickets. First, C needs a ticket that
+allows secure communications with A. This ticket can then be used to obtain other tickets.
+This is phase I of the protocol, and consists of a send from C to A and a response from A to C.
+Then, C needs a ticket to allow it to talk to S to get services. This is phase II of the
+protocol, and consists of a send from C to A and a response from A to C.
+
+Phase I:
+--------
+
+The client is set up to know that it needs certain things, using a variable called ``need``,
+which is part of the ``AuthClientHandler`` class, which the ``CephxClientHandler`` inherits
+from. At this point, one thing that's encoded in the ``need`` variable is
+``CEPH_ENTITY_TYPE_AUTH``, indicating that we need to start the authentication protocol
+from scratch. Since we're always talking to the same authorization server, if we've gone
+through this step of the protocol before (and the resulting ticket/session hasn't timed out),
+we can skip this step and just ask for client tickets. But it must be done initially, and
+we'll assume that we are in that state.
+
+The message C sends to A in phase I is build in ``CephxClientHandler::build_request()`` (in
+``auth/cephx/CephxClientHandler.cc``). This routine is used for more than one purpose.
+In this case, we first call ``validate_tickets()`` (from routine
+``CephXTicektManager::validate_tickets()`` which lives in ``auth/cephx/CephxProtocol.h``).
+This code runs through the list of possible tickets to determine what we need, setting values
+in the ``need`` flag as necessary. Then we call ``ticket.get_handler()``. This routine
+(in ``CephxProtocol.h``) finds a ticket of the specified type (a ticket to perform
+authorization) in the ticket map, creates a ticket handler object for it, and puts the
+handler into the right place in the map. Then we hit specialized code to deal with individual
+cases. The case here is when we still need to authenticate to A (the
+``if (need & CEPH_ENTITY_TYPE_AUTH)`` branch).
+
+We now create a message of type ``CEPHX_GET_AUTH_SESSION_KEY``. We need to authenticate
+this message with C's secret key, so we fetch that from the local key repository. We create
+a random challenge, whose purpose is to prevent replays. We encrypt that challenge using
+``cephx_calc_client_server_challenge()``. We already
+have a server challenge (a similar set of random bytes, but created by the server and sent to
+the client) from our pre-cephx stage. We take both challenges and our secret key and
+produce a combined encrypted challenge value, which goes into ``req.key``.
+
+If we have an old ticket, we store it in ``req.old_ticket``. We're about to get a new one.
+
+The entire ``req`` structure, including the old ticket and the cryptographic hash of the two
+challenges, gets put into the message. Then we return from this function, and the
+message is sent.
+
+We now switch over to the authenticator side, A. The server receives the message that was
+sent, of type ``CEPH_GET_AUTH_SESSION_KEY``. The message gets handled in ``prep_auth()``,
+in ``mon/AuthMonitor.cc``, which calls ``handle_request()`` is ``CephxServiceHandler.cc`` to
+do most of the work. This routine, also, handles multiple cases.
+
+The control flow is determined by the ``request_type`` in the ``cephx_header`` associated
+with the message. Our case here is ``CEPH_GET_AUTH_SESSION_KEY``. We need the
+secret key A shares with C, so we call ``get_secret()`` from out local key repository to get
+it. (It's called a ``key_server`` in the code, but it's not really a separate machine or
+processing entity. It's more like the place where locally used keys are kept.) We should
+have set up a server challenge already with this client, so we make sure
+we really do have one. (This variable is specific to a ``CephxServiceHandler``, so there
+is a different one for each such structure we create, presumably one per client A is
+dealing with.) If there is no challenge, we'll need to start over, since we need to
+check the client's crypto hash, which depends on a server challenge, in part.
+
+We now call the same routine the client used to calculate the hash, based on the same values:
+the client challenge (which is in the incoming message), the server challenge (which we saved),
+and the client's key (which we just obtained). We check to see if the client sent the same
+thing we expected. If so, we know we're talking to the right client. We know the session is
+fresh, because it used the challenge we sent it to calculate its crypto hash. So we can
+give it an authentication ticket.
+
+We fetch C's ``eauth`` structure. This contains an ID, a key, and a set of caps (capabilities).
+
+The client sent us its old ticket in the message, if it had one. If
+so, we set a flag, ``should_enc_ticket``, to true and set the global
+ID to the global ID in that old ticket. If the attempt to decode its
+old ticket fails (most probably because it didn't have one),
+``should_enc_ticket`` remains false. Now we set up the new ticket,
+filling in timestamps, the name of C, and the global ID provided in the
+method call (unless there was an old ticket). We need a new session
+key to help the client communicate securely with us, not using its
+permanent key. We set the service ID to ``CEPH_ENTITY_TYPE_AUTH``,
+which will tell the client C what to do with the message we send it.
+We build a cephx response header and call
+``cephx_build_service_ticket_reply()``.
+
+``cephx_build_service_ticket_reply()`` is in ``auth/cephx/CephxProtocol.cc``. This
+routine will build up the response message. Much of it copies data from its parameters to
+a message structure. Part of that information (the session key and the validity period)
+gets encrypted with C's permanent key. If the ``should_encrypt_ticket`` flag is set,
+encrypt it using the old ticket's key. Otherwise, there was no old ticket key, so the
+new ticket is not encrypted. (It is, of course, already encrypted with A's permanent key.)
+Presumably the point of this second encryption is to expose less material encrypted with
+permanent keys.
+
+Then we call the key server's ``get_service_caps()`` routine on the entity name, with a
+flag ``CEPH_ENTITY_TYPE_MON``, and capabilities, which will be filled in by this routine.
+The use of that constant flag means we're going to get the client's caps for A, not for some
+other data server. The ticket here is to access the authorizer A, not the service S. The
+result of this call is that the caps variable (a parameter to the routine we're in) is
+filled in with the monitor capabilities that will allow C to access A's authorization services.
+
+``handle_request()`` itself does not send the response message. It builds up the
+``result_bl``, which basically holds that message's contents, and the capabilities structure,
+but it doesn't send the message. We go back to ``prep_auth()``, in ``mon/AuthMonitor.cc``,
+for that. This routine does some fiddling around with the caps structure that just got
+filled in. There's a global ID that comes up as a result of this fiddling that is put into
+the reply message. The reply message is built here (mostly from the ``response_bl`` buffer)
+and sent off.
+
+This completes Phase I of the protocol. At this point, C has authenticated itself to A, and A has generated a new session key and ticket allowing C to obtain server tickets from A.
+
+Phase II
+--------
+
+This phase starts when C receives the message from A containing a new ticket and session key.
+The goal of this phase is to provide C with a session key and ticket allowing it to
+communicate with S.
+
+The message A sent to C is dispatched to ``build_request()`` in ``CephxClientHandler.cc``,
+the same routine that was used early in Phase I to build the first message in the protocol.
+This time, when ``validate_tickets()`` is called, the ``need`` variable will not contain
+``CEPH_ENTITY_TYPE_AUTH``, so a different branch through the bulk of the routine will be
+used. This is the branch indicated by ``if (need)``. We have a ticket for the authorizer,
+but we still need service tickets.
+
+We must send another message to A to obtain the tickets (and session key) for the server
+S. We set the ``request_type`` of the message to ``CEPHX_GET_PRINCIPAL_SESSION_KEY`` and
+call ``ticket_handler.build_authorizer()`` to obtain an authorizer. This routine is in
+``CephxProtocol.cc``. We set the key for this authorizer to be the session key we just got
+from A,and create a new nonce. We put the global ID, the service ID, and the ticket into a
+message buffer that is part of the authorizer. Then we create a new ``CephXAuthorize``
+structure. The nonce we just created goes there. We encrypt this ``CephXAuthorize``
+structure with the current session key and stuff it into the authorizer's buffer. We
+return the authorizer.
+
+Back in ``build_request()``, we take the part of the authorizer that was just built (its
+buffer, not the session key or anything else) and shove it into the buffer we're creating
+for the message that will go to A. Then we delete the authorizer. We put the requirements
+for what we want in ``req.keys``, and we put ``req`` into the buffer. Then we return, and
+the message gets sent.
+
+The authorizer A receives this message which is of type ``CEPHX_GET_PRINCIPAL_SESSION_KEY``.
+The message gets handled in ``prep_auth()``, in ``mon/AuthMonitor.cc``, which again calls
+``handle_request()`` in ``CephxServiceHandler.cc`` to do most of the work.
+
+In this case, ``handle_request()`` will take the ``CEPHX_GET_PRINCIPAL_SESSION_KEY`` case.
+It will call ``cephx_verify_authorizer()`` in ``CephxProtocol.cc``. Here, we will grab
+a bunch of data out of the input buffer, including the global and service IDs and the ticket
+for A. The ticket contains a ``secret_id``, indicating which key is being used for it.
+If the secret ID pulled out of the ticket was -1, the ticket does not specify which secret
+key A should use. In this case, A should use the key for the specific entity that C wants
+to contact, rather than a rotating key shared by all server entities of the same type.
+To get that key, A must consult the key repository to find the right key. Otherwise,
+there's already a structure obtained from the key repository to hold the necessary secret.
+Server secrets rotate on a time expiration basis (key rotation is not covered in this
+document), so run through that structure to find its current secret. Either way, A now
+knows the secret key used to create this ticket. Now decrypt the encrypted part of the
+ticket, using this key. It should be a ticket for A.
+
+The ticket also contains a session key that C should have used to encrypt other parts of
+this message. Use that session key to decrypt the rest of the message.
+
+Create a ``CephXAuthorizeReply`` to hold our reply. Extract the nonce (which was in the stuff
+we just decrypted), add 1 to it, and put the result in the reply. Encrypt the reply and
+put it in the buffer provided in the call to ``cephx_verify_authorizer()`` and return
+to ``handle_request()``. This will be used to prove to C that A (rather than an attacker)
+created this response.
+
+Having verified that the message is valid and from C, now we need to build it a ticket for S.
+We need to know what S it wants to communicate with and what services it wants. Pull the
+ticket request that describes those things out of its message. Now run through the ticket
+request to see what it wanted. (He could potentially be asking for multiple different
+services in the same request, but we will assume it's just one, for this discussion.) Once we
+know which service ID it's after, call ``build_session_auth_info()``.
+
+``build_session_auth_info()`` is in ``CephxKeyServer.cc``. It checks to see if the
+secret for the ``service_ID`` of S is available and puts it into the subfield of one of
+the parameters, and calls the similarly named ``_build_session_auth_info()``, located in
+the same file. This routine loads up the new ``auth_info`` structure with the
+ID of S, a ticket, and some timestamps for that ticket. It generates a new session key
+and puts it in the structure. It then calls ``get_caps()`` to fill in the
+``info.ticket`` caps field. ``get_caps()`` is also in ``CephxKeyServer.cc``. It fills the
+``caps_info`` structure it is provided with caps for S allowed to C.
+
+Once ``build_session_auth_info()`` returns, A has a list of the capabilities allowed to
+C for S. We put a validity period based on the current TTL for this context into the info
+structure, and put it into the ``info_vec`` structure we are preparing in response to the
+message.
+
+Now call ``build_cephx_response_header()``, also in ``CephxServiceHandler.cc``. Fill in
+the ``request_type``, which is ``CEPHX_GET_PRINCIPAL_SESSION_KEY``, a status of 0,
+and the result buffer.
+
+Now call ``cephx_build_service_ticket_reply()``, which is in ``CephxProtocol.cc``. The
+same routine was used towards the end of A's handling of its response in phase I. Here,
+the session key (now a session key to talk to S, not A) and the validity period for that
+key will be encrypted with the existing session key shared between C and A.
+The ``should_encrypt_ticket`` parameter is false here, and no key is provided for that
+encryption. The ticket in question, destined for S once C sends it there, is already
+encrypted with S's secret. So, essentially, this routine will put ID information,
+the encrypted session key, and the ticket allowing C to talk to S into the buffer to
+be sent to C.
+
+After this routine returns, we exit from ``handle_request()``, going back to ``prep_auth()``
+and ultimately to the underlying message send code.
+
+The client receives this message. The nonce is checked as the message passes through
+``Pipe::connect()``, which is in ``msg/SimpleMessager.cc``. In a lengthy ``while(1)`` loop in
+the middle of this routine, it gets an authorizer. If the get was successful, eventually
+it will call ``verify_reply()``, which checks the nonce. ``connect()`` never explicitly
+checks to see if it got an authorizer, which would suggest that failure to provide an
+authorizer would allow an attacker to skip checking of the nonce. However, in many places,
+if there is no authorizer, important connection fields will get set to zero, which will
+ultimately cause the connection to fail to provide data. It would be worth testing, but
+it looks like failure to provide an authorizer, which contains the nonce, would not be helpful
+to an attacker.
+
+The message eventually makes its way through to ``handle_response()``, in
+``CephxClientHandler.cc``. In this routine, we call ``get_handler()`` to get a ticket
+handler to hold the ticket we have just received. This routine is embedded in the definition
+for a ``CephXTicketManager`` structure. It takes a type (``CEPH_ENTITY_TYPE_AUTH``, in
+this case) and looks through the ``tickets_map`` to find that type. There should be one, and
+it should have the session key of the session between C and A in its entry. This key will
+be used to decrypt the information provided by A, particularly the new session key allowing
+C to talk to S.
+
+We then call ``verify_service_ticket_reply()``, in ``CephxProtocol.cc``. This routine
+needs to determine if the ticket is OK and also obtain the session key associated with this
+ticket. It decrypts the encrypted portion of the message buffer, using the session key
+shared with A. This ticket was not encrypted (well, not twice - tickets are always encrypted,
+but sometimes double encrypted, which this one isn't). So it can be stored in a service
+ticket buffer directly. We now grab the ticket out of that buffer.
+
+The stuff we decrypted with the session key shared between C and A included the new session
+key. That's our current session key for this ticket, so set it. Check validity and
+set the expiration times. Now return true, if we got this far.
+
+Back in ``handle_response()``, we now call ``validate_tickets()`` to adjust what we think
+we need, since we now have a ticket we didn't have before. If we've taken care of
+everything we need, we'll return 0.
+
+This ends phase II of the protocol. We have now successfully set up a ticket and session key
+for client C to talk to server S. S will know that C is who it claims to be, since A will
+verify it. C will know it is S it's talking to, again because A verified it. The only
+copies of the session key for C and S to communicate were sent encrypted under the permanent
+keys of C and S, respectively, so no other party (excepting A, who is trusted by all) knows
+that session key. The ticket will securely indicate to S what C is allowed to do, attested
+to by A. The nonces passed back and forth between A and C ensure that they have not been
+subject to a replay attack. C has not yet actually talked to S, but it is ready to.
+
+Much of the security here falls apart if one of the permanent keys is compromised. Compromise
+of C's key means that the attacker can pose as C and obtain all of C's privileges, and can
+eavesdrop on C's legitimate conversations. He can also pretend to be A, but only in
+conversations with C. Since it does not (by hypothesis) have keys for any services, he
+cannot generate any new tickets for services, though it can replay old tickets and session
+keys until S's permanent key is changed or the old tickets time out.
+
+Compromise of S's key means that the attacker can pose as S to anyone, and can eavesdrop on
+any user's conversation with S. Unless some client's key is also compromised, the attacker
+cannot generate new fake client tickets for S, since doing so requires it to authenticate
+himself as A, using the client key it doesn't know.
diff --git a/doc/dev/config-key.rst b/doc/dev/config-key.rst
new file mode 100644
index 000000000..d7b79db2f
--- /dev/null
+++ b/doc/dev/config-key.rst
@@ -0,0 +1,68 @@
+===================
+ config-key layout
+===================
+
+*config-key* is a general-purpose key/value storage service offered by
+the mons. Generally speaking, you can put whatever you want there.
+Current in-tree users should be captured here with their key layout
+schema.
+
+OSD dm-crypt keys
+=================
+
+Key::
+
+ dm-crypt/osd/$OSD_UUID/luks = <json string>
+
+The JSON payload has the form::
+
+ { "dm-crypt": <secret> }
+
+where the secret is a base64 encoded LUKS key.
+
+Created by the 'osd new' command (see OSDMonitor.cc).
+
+Consumed by ceph-volume, and similar tools. Normally access to the
+dm-crypt/osd/$OSD_UUID prefix is allowed by a client.osd-lockbox.$OSD_UUID
+cephx key, such that only the appropriate host can retrieve the LUKS key (which
+in turn decrypts the actual raw key, also stored on the device itself).
+
+
+ceph-mgr modules
+================
+
+The convention for keys is::
+
+ mgr/$MODULE/$option = $value
+
+or::
+
+ mgr/$MODULE/$MGRID/$option = $value
+
+For example,::
+
+ mgr/dashboard/server_port = 80
+ mgr/dashboard/foo/server_addr = 1.2.3.4
+ mgr/dashboard/bar/server_addr = 1.2.3.5
+
+
+Configuration
+=============
+
+Configuration options for clients and daemons are also stored in config-key.
+
+Keys take the form::
+
+ config/$option = $value
+ config/$type/$option = $value
+ config/$type.$id/$option = $value
+ config/$type.$id/$mask[/$mask2...]/$option = $value
+
+Where
+
+* `type` is a daemon type (`osd`, `mon`, `mds`, `mgr`, `client`)
+* `id` is a daemon id (e.g., `0`, `foo`), such that `$type.$id` is something like `osd.123` or `mds.foo`)
+* `mask` restricts who the option applies to, and can take two forms:
+
+ #. `$crush_type:$crush_value`. For example, `rack:foorack`
+ #. `class:$classname`, in reference to CRUSH device classes (e.g., `ssd`)
diff --git a/doc/dev/config.rst b/doc/dev/config.rst
new file mode 100644
index 000000000..5b620b2fe
--- /dev/null
+++ b/doc/dev/config.rst
@@ -0,0 +1,166 @@
+=================================
+ Configuration Management System
+=================================
+
+The configuration management system exists to provide every daemon with the
+proper configuration information. The configuration can be viewed as a set of
+key-value pairs.
+
+How can the configuration be set? Well, there are several sources:
+ - the ceph configuration file, usually named ceph.conf
+ - command line arguments::
+ --debug-ms=1
+ --debug-pg=10
+ etc.
+ - arguments injected at runtime using "injectargs" or "config set"
+
+
+The Configuration File
+======================
+
+Most configuration settings originate in the Ceph configuration file.
+
+How do we find the configuration file? Well, in order, we check:
+ - the default locations
+ - the environment variable CEPH_CONF
+ - the command line argument -c
+
+Each stanza of the configuration file describes the key-value pairs that will be in
+effect for a particular subset of the daemons. The "global" stanza applies to
+everything. The "mon", "osd", and "mds" stanzas specify settings to take effect
+for all monitors, all OSDs, and all mds servers, respectively. A stanza of the
+form mon.$name, osd.$name, or mds.$name gives settings for the monitor, OSD, or
+MDS of that name, respectively. Configuration values that appear later in the
+file win over earlier ones.
+
+A sample configuration file can be found in src/sample.ceph.conf.
+
+
+Metavariables
+=============
+
+The configuration system allows any configuration value to be
+substituted into another value using the ``$varname`` syntax, similar
+to how bash shell expansion works.
+
+A few additional special metavariables are also defined:
+ - $host: expands to the current hostname
+ - $type: expands to one of "mds", "osd", "mon", or "client"
+ - $id: expands to the daemon identifier. For ``osd.0``, this would be ``0``; for ``mds.a``, it would be ``a``; for ``client.admin``, it would be ``admin``.
+ - $num: same as $id
+ - $name: expands to $type.$id
+
+
+Reading configuration values
+====================================================
+
+There are two ways for Ceph code to get configuration values. One way is to
+read it directly from a variable named "g_conf," or equivalently,
+"g_ceph_ctx->_conf." The other is to register an observer that will be called
+every time the relevant configuration values changes. This observer will be
+called soon after the initial configuration is read, and every time after that
+when one of the relevant values changes. Each observer tracks a set of keys
+and is invoked only when one of the relevant keys changes.
+
+The interface to implement is found in common/config_obs.h.
+
+The observer method should be preferred in new code because
+ - It is more flexible, allowing the code to do whatever reinitialization needs
+ to be done to implement the new configuration value.
+ - It is the only way to create a std::string configuration variable that can
+ be changed by injectargs.
+ - Even for int-valued configuration options, changing the values in one thread
+ while another thread is reading them can lead to subtle and
+ impossible-to-diagnose bugs.
+
+For these reasons, reading directly from g_conf should be considered deprecated
+and not done in new code. Do not ever alter g_conf.
+
+Changing configuration values
+====================================================
+
+Configuration values can be changed by calling ``g_conf()->set_val``. After changing
+the configuration, you should call ``g_conf()->apply_changes`` to re-run all the
+affected configuration observers. For convenience, you can call
+``g_conf()->set_val_or_die`` to make a configuration change which you think should
+never fail.
+
+Injectargs, parse_argv, and parse_env are three other functions which modify
+the configuration. Just like with set_val, you should call apply_changes after
+calling these functions to make sure your changes get applied.
+
+
+Defining config options
+=======================
+
+New-style config options are defined in common/options.cc. All new config
+options should go here (and not into legacy_config_opts.h).
+
+Levels
+------
+
+The Option constructor takes a "level" value:
+
+* *LEVEL_BASIC* is for basic config options that a normal operator is likely to adjust.
+* *LEVEL_ADVANCED* is for options that an operator *can* adjust, but should not touch unless they understand what they are doing. Adjusting advanced options poorly can lead to problems (performance or even data loss) if done incorrectly.
+* *LEVEL_DEV* is for options in place for use by developers only, either for testing purposes, or to describe constants that no user should adjust but we prefer not to compile into the code.
+
+Description and long description
+--------------------------------
+
+Short description of the option. Sentence fragment. e.g.::
+
+ .set_description("Default checksum algorithm to use")
+
+The long description is complete sentences, perhaps even multiple
+paragraphs, and may include other detailed information or notes.::
+
+ .set_long_description("crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming.")
+
+Default values
+--------------
+
+There is a default value for every config option. In some cases, there may
+also be a *daemon default* that only applies to code that declares itself
+as a daemon (in this case, the regular default only applies to non-daemons).
+
+Safety
+------
+
+If an option can be safely changed at runtime::
+
+ .set_safe()
+
+Service
+-------
+
+Service is a component name, like "common", "osd", "rgw", "mds", etc. It may
+be a list of components, like::
+
+ .add_service("mon mds osd mgr")
+
+For example, the rocksdb options affect both the osd and mon.
+
+Tags
+----
+
+Tags identify options across services that relate in some way. Example include;
+
+ - network -- options affecting network configuration
+ - mkfs -- options that only matter at mkfs time
+
+Enums
+-----
+
+For options with a defined set of allowed values::
+
+ .set_enum_allowed({"none", "crc32c", "crc32c_16", "crc32c_8", "xxhash32", "xxhash64"})
+
+Flags
+-----
+
+* **RUNTIME**: the value can be updated at runtime
+* **NO_MON_UPDATE**: Daemons/clients do not pull this value from the monitor config database. We disallow setting this option via 'ceph config set ...'. This option should be configured via ceph.conf or via the command line.
+* **STARTUP**: option takes effect only during daemon startup
+* **CLUSTER_CREATE**: option only affects cluster creation
+* **CREATE**: option only affects daemon creation
diff --git a/doc/dev/context.rst b/doc/dev/context.rst
new file mode 100644
index 000000000..1a2b2cbfb
--- /dev/null
+++ b/doc/dev/context.rst
@@ -0,0 +1,20 @@
+=============
+ CephContext
+=============
+
+A CephContext represents a single view of the Ceph cluster. It comes complete
+with a configuration, a set of performance counters (PerfCounters), and a
+heartbeat map. You can find more information about CephContext in
+src/common/ceph_context.h.
+
+Generally, you will have only one CephContext in your application, called
+g_ceph_context. However, in library code, it is possible that the library user
+will initialize multiple CephContexts. For example, this would happen if he
+called rados_create more than once.
+
+A ceph context is required to issue log messages. Why is this? Well, without
+the CephContext, we would not know which log messages were disabled and which
+were enabled. The dout() macro implicitly references g_ceph_context, so it
+can't be used in library code. It is fine to use dout and derr in daemons, but
+in library code, you must use ldout and lderr, and pass in your own CephContext
+object. The compiler will enforce this restriction.
diff --git a/doc/dev/continuous-integration.rst b/doc/dev/continuous-integration.rst
new file mode 100644
index 000000000..5c2f15823
--- /dev/null
+++ b/doc/dev/continuous-integration.rst
@@ -0,0 +1,285 @@
+Continuous Integration Architecture
+===================================
+
+In Ceph, we rely on multiple CI pipelines in our development. Most of these pipelines
+are centered around Jenkins. And their configurations are generated using `Jenkins Job Builder`_.
+
+.. _Jenkins Job Builder: https://docs.openstack.org/infra/jenkins-job-builder/
+
+Let's take the ``make check`` performed by Jenkins as an example.
+
+ceph-pull-requests
+------------------
+
+``ceph-pull-requests`` is a jenkins job which gets triggered by a GitHub pull
+request or a trigger phrase like::
+
+ jenkins test make check
+
+There are multiple parties involved in this jenkins job:
+
+.. graphviz::
+
+ digraph {
+ rankdir="LR";
+ github [
+ label="<git> git_repo | <webhooks> webhooks | <api> api";
+ shape=record;
+ href="https://github.com/ceph/ceph";
+ ];
+ subgraph cluster_lab {
+ label="Sepia Lab";
+ href="https://wiki.sepia.ceph.com/doku.php";
+ shape=circle;
+ apt_mirror [
+ href="http://apt-mirror.front.sepia.ceph.com";
+ ];
+ shaman [
+ href="https://shaman.ceph.com";
+ ];
+ chacra [
+ peripheries=3;
+ href="https://chacra.ceph.com";
+ ];
+ subgraph cluster_jenkins {
+ label="jenkins";
+ href="https://jenkins.ceph.com";
+ jenkins_controller [ label = "controller" ];
+ jenkins_agents [ label = "agents", peripheries=3 ];
+ };
+ };
+ {
+ rank=same;
+ package_repos [ peripheries=3 ];
+ pypi;
+ npm;
+ }
+ github:webhooks -> jenkins_controller [ label = "notify", color = "crimson" ];
+ jenkins_controller -> jenkins_agents [ label = "schedule jobs" ];
+ jenkins_agents -> github:git [ label = "git pull" ];
+ jenkins_agents -> shaman [ label = "query for chacra repo URL" ];
+ jenkins_agents -> chacra [ label = "pull build dependencies" ];
+ jenkins_agents -> package_repos [ label = "pull build dependencies" ];
+ jenkins_agents -> pypi [ label = "pull Python packages" ];
+ jenkins_agents -> npm [ label = "pull JavaScript packages" ];
+ jenkins_agents -> apt_mirror [ label = "pull build dependencies" ];
+ jenkins_agents -> github:api [ label = "update", color = "crimson" ];
+ }
+
+Where
+
+Sepia Lab
+ `Sepia Lab`_ is a test lab used by the Ceph project. This lab offers
+ the storage and computing resources required by our CI infra.
+
+Jenkins agents
+ are a set of machines which perform the CI jobs. In this case, they
+
+ #. pull the git repo from GitHub and
+ #. rebase the pull request against the latest master
+ #. set necessary environment variables
+ #. run ``run-make-check.sh``
+
+Chacra
+ is a server offering RESTful API allowing the clients to store and
+ retrieve binary packages. It also creates the repo for uploaded
+ packages automatically. Once a certain repo is created on chacra, the
+ configured shaman server is updated as well, then we can query shaman
+ for the corresponding repo address. Chacra not only hosts Ceph packages,
+ it also hosts quite a few other packages like various build dependencies.
+
+Shaman
+ is a server offering RESTful API allowing the clients to query the
+ information of repos hosted by chacra nodes. Shaman is also known
+ for its `Web UI`_. But please note, shaman does not build the
+ packages, it just offers information on the builds.
+
+As the following shows, `chacra`_ manages multiple projects whose metadata
+are stored in a database. These metadata are exposed via Shaman as a web
+service. `chacractl`_ is a utility to interact with the `chacra`_ service.
+
+.. graphviz::
+
+ digraph {
+ libboost [
+ shape=cylinder;
+ ];
+ libzbd [
+ shape=cylinder;
+ ];
+ other_repos [
+ label="...";
+ shape=cylinder;
+ ];
+ postgresql [
+ shape=cylinder;
+ style=filled;
+ ]
+ shaman -> postgresql;
+ chacra -> postgresql;
+ chacractl -> chacra;
+ chacra -> libboost;
+ chacra -> libzbd;
+ chacra -> other_repos;
+ }
+
+.. _Sepia Lab: https://wiki.sepia.ceph.com/doku.php
+.. _Web UI: https://shaman.ceph.com
+
+build dependencies
+------------------
+
+Just like lots of other software projects, Ceph has both build-time and
+run-time dependencies. Most of time, we are inclined to use the packages
+prebuilt by the distro. But there are cases where
+
+- the necessary dependencies are either missing in the distro, or
+- their versions are too old, or
+- they are packaged without some important feature enabled.
+- we want to ensure that the version of a certain runtime dependency is
+ identical to the one we tested in our lab.
+
+No matter what the reason is, we either need to build them from source, or
+to package them as binary packages instead of using the ones shipped by the
+distro. Quite a few build-time dependencies are included as git submodules,
+but in order to avoid rebuilding these dependencies repeatedly, we pre-built
+some of them and uploaded them to our own repos. So, when performing
+``make check``, the building hosts in our CI just pull them from our internal
+repos hosting these packages instead of building them.
+
+So far, following packages are prebuilt for ubuntu focal, and then uploaded to
+`chacra`_:
+
+libboost
+ packages `boost`_. The packages' names are changed from ``libboost-*`` to
+ ``ceph-libboost-*``, and they are instead installed into ``/opt/ceph``, so
+ they don't interfere with the official ``libboost`` packages shipped by
+ distro. Its build scripts are hosted at https://github.com/ceph/ceph-boost.
+ See https://github.com/ceph/ceph-boost/commit/2a8ae02932b2a1fd6a68072da8ca0df2b99b805c
+ for an example of how to bump the version number. The commands used to
+ build 1.79 on a vanilla Ubuntu Focal OS are below.
+
+ .. prompt:: bash $
+
+ sudo apt install debhelper dctrl-tools chrpath libbz2-dev libicu-dev bison \
+ flex docbook-to-man help2man xsltproc doxygen dh-python python3-all-dev graphviz
+ wget http://download.ceph.com/qa/boost_1_79_0.tar.bz2
+ git clone https://github.com/ceph/ceph-boost
+ tar xjf boost_1_79_0.tar.bz2
+ cp -ra ceph-boost/debian boost_1_79_0/
+ pushd boost_1_79_0
+ export DEB_BUILD_OPTIONS='parallel=6 nodoc'
+ dpkg-buildpackage -us -uc -b
+ popd
+ BOOST_SHA=$(git ls-remote https://github.com/ceph/ceph-boost main | awk '{ print $1 }')
+ ls *.deb | chacractl binary create \
+ libboost/master/$BOOST_SHA/ubuntu/focal/amd64/flavors/default
+
+libzbd
+ packages `libzbd`_ . The upstream libzbd includes debian packaging already.
+
+libpmem
+ packages `pmdk`_ . Please note, ``ndctl`` is one of the build dependencies of
+ pmdk, for an updated debian packaging, please see
+ https://github.com/ceph/ceph-ndctl .
+
+.. note::
+
+ please ensure that the package version and the release number of the
+ packaging are properly updated when updating/upgrading the packaging,
+ otherwise it would be difficult to tell which version of the package
+ is installed. We check the package version before trying to upgrade
+ it in ``install-deps.sh``.
+
+.. _boost: https://www.boost.org
+.. _libzbd: https://github.com/westerndigitalcorporation/libzbd
+.. _pmdk: https://github.com/pmem/pmdk
+
+But in addition to these libraries, ``ceph-mgr-dashboard``'s frontend uses lots of
+JavaScript packages. Quite a few of them are not packaged by distros. Not to
+mention the trouble of testing different combination of versions of these
+packages. So we decided to include these JavaScript packages in our dist tarball
+using ``make-dist``.
+
+Also, because our downstream might not want to use the prepackaged binaries when
+redistributing the precompiled Ceph packages, we also need to include these
+libraries in our dist tarball. They are
+
+- boost
+- liburing
+- pmdk
+
+``make-dist`` is a script used by our CI pipeline to create dist tarball so the
+tarball can be used to build the Ceph packages in a clean room environment. When
+we need to upgrade these third party libraries, we should
+
+- update the CMake script
+- rebuild the prebuilt packages and
+- update this script to reflect the change.
+
+Uploading Dependencies
+----------------------
+
+To ensure that prebuilt packages are available by the jenkins agents, we need to
+upload them to either ``apt-mirror.front.sepia.ceph.com`` or `chacra`_. To upload
+packages to the former would require the help of our lab administrator, so if we
+want to maintain the package repositories on regular basis, a better choice would be
+to manage them using `chacractl`_. `chacra`_ represents packages repositories using
+a resource hierarchy, like::
+
+ <project>/<branch>/<ref>/<distro>/<distro-version>/<arch>
+
+In which:
+
+project
+ in general, it is used for denoting a set of related packages. For instance,
+ ``libboost``.
+
+branch
+ branch of project. This mirrors the concept of a Git repo.
+
+ref
+ a unique id of a given version of a set packages. This id is used to reference
+ the set packages under the ``<project>/<branch>``. It is a good practice to
+ version the packaging recipes, like the ``debian`` directory for building DEB
+ packages and the ``spec`` for building RPM packages, and use the SHA1 of the
+ packaging recipe for the ``ref``. But you could also use a random string for
+ ``ref``, like the tag name of the built source tree.
+
+distro
+ the distro name for which the packages are built. Currently, following distros are
+ supported:
+
+ - centos
+ - debian
+ - fedora
+ - rhel
+ - ubuntu
+
+distro-version
+ the version of the distro. For instance, if a package is built on ubuntu focal,
+ the ``distro-version`` should be ``20.04``.
+
+arch
+ the architecture of the packages. It could be:
+
+ - arm64
+ - amd64
+ - noarch
+
+So, for example, we can upload the prebuilt boost packages to chacra like
+
+.. prompt:: bash $
+
+ ls *.deb | chacractl binary create \
+ libboost/master/099c0fd56b4a54457e288a2eff8fffdc0d416f7a/ubuntu/focal/amd64/flavors/default
+
+.. _chacra: https://github.com/ceph/chacra
+.. _chacractl: https://github.com/ceph/chacractl
+
+Update ``install-deps.sh``
+--------------------------
+
+We also need to update ``install-deps.sh`` to point the built script to the new
+repo. Please refer to the `script <https://github.com/ceph/ceph/blob/master/install-deps.sh>`_,
+for more details.
diff --git a/doc/dev/corpus.rst b/doc/dev/corpus.rst
new file mode 100644
index 000000000..4005f70c0
--- /dev/null
+++ b/doc/dev/corpus.rst
@@ -0,0 +1,100 @@
+
+Corpus structure
+================
+
+ceph.git/ceph-object-corpus is a submodule.::
+
+ bin/ # misc scripts
+ archive/$version/objects/$type/$hash # a sample of encoded objects from a specific version
+
+You can also mark known or deliberate incompatibilities between versions with::
+
+ archive/$version/forward_incompat/$type
+
+The presence of a file indicates that new versions of code cannot
+decode old objects across that ``$version`` (this is normally the case).
+
+
+How to generate an object corpus
+--------------------------------
+
+.. highlight:: shell
+
+We can generate an object corpus for a particular version of ceph using the
+script of ``script/gen-corpus.sh``, or by following the instructions below:
+
+#. Checkout a clean repo (best not to do this where you normally work)::
+
+ git clone ceph.git
+ cd ceph
+ git submodule update --init --recursive
+
+#. Build with flag to dump objects to ``/tmp/foo``::
+
+ rm -rf /tmp/foo ; mkdir /tmp/foo
+ do_cmake.sh -DCMAKE_CXX_FLAGS="-DENCODE_DUMP_PATH=/tmp/foo"
+ cd build
+ make
+
+#. Start via vstart::
+
+ cd build
+ MON=3 MGR=2 OSD=3 MDS=3 RGW=1 ../src/vstart.sh -n -x
+
+#. Use as much functionality of the cluster as you can, to exercise as many object encoder methods as possible::
+
+ bin/ceph osd pool create mypool
+ bin/rados -p mypool bench 10 write -b 123
+ bin/ceph osd out 0
+ bin/ceph osd in 0
+ bin/init-ceph restart osd.1
+ for f in ../qa/workunits/cls/*.sh ; do PATH="bin:$PATH" $f ; done
+ PATH="bin:$PATH" ../qa/workunits/rados/test.sh
+ bin/ceph_test_librbd
+ bin/ceph_test_libcephfs
+ bin/init-ceph restart mds.a
+ ../qa/workunits/rgw/run-s3tests.sh
+
+#. Stop::
+
+ ../src/stop.sh
+
+#. Import the corpus (this will take a few minutes)::
+
+ ../src/test/encoding/import.sh /tmp/foo `bin/ceph-dencoder version` ../ceph-object-corpus/archive
+ ../src/test/encoding/import-generated.sh ../ceph-object-corpus/archive
+
+#. Prune it! There will be a bazillion copies of various objects, and we only want a representative sample.::
+
+ pushd ../ceph-object-corpus
+ bin/prune-archive.sh
+ popd
+
+#. Verify the tests pass::
+
+ ctest -R readable.sh
+
+#. Commit it to the corpus repo and push::
+
+ pushd ../ceph-object-corpus
+ git checkout -b wip-new
+ git add archive/`../build/bin/ceph-dencoder version`
+ git commit -m `../build/bin/ceph-dencoder version`
+ git remote add cc git@github.com:ceph/ceph-object-corpus.git
+ git push cc wip-new
+ popd
+
+#. Go test it out::
+
+ cd my/regular/tree
+ cd ceph-object-corpus
+ git fetch origin
+ git checkout wip-new
+ cd ../build
+ ctest -R readable.sh
+
+#. If everything looks good, update the submodule master branch, and commit the submodule in ceph.git.
+
+
+
+
diff --git a/doc/dev/cpu-profiler.rst b/doc/dev/cpu-profiler.rst
new file mode 100644
index 000000000..5b06a4791
--- /dev/null
+++ b/doc/dev/cpu-profiler.rst
@@ -0,0 +1,54 @@
+=====================
+ Installing Oprofile
+=====================
+
+The easiest way to profile Ceph's CPU consumption is to use the `oprofile`_
+system-wide profiler.
+
+.. _oprofile: http://oprofile.sourceforge.net/about/
+
+Installation
+============
+
+If you are using a Debian/Ubuntu distribution, you can install ``oprofile`` by
+executing the following::
+
+ sudo apt-get install oprofile oprofile-gui
+
+
+Compiling Ceph for Profiling
+============================
+
+To compile Ceph for profiling, first clean everything. ::
+
+ make distclean
+
+Then, export the following settings so that you can see callgraph output. ::
+
+ export CFLAGS="-fno-omit-frame-pointer -O2 -g"
+
+Finally, compile Ceph. ::
+
+ ./autogen.sh
+ ./configure
+ make
+
+You can use ``make -j`` to execute multiple jobs depending upon your system. For
+example::
+
+ make -j4
+
+
+Ceph Configuration
+==================
+
+Ensure that you disable ``lockdep``. Consider setting logging to
+levels appropriate for a production cluster. See `Ceph Logging and Debugging`_
+for details.
+
+.. _Ceph Logging and Debugging: ../../rados/troubleshooting/log-and-debug
+
+See the `CPU Profiling`_ section of the RADOS Troubleshooting documentation for details on using Oprofile.
+
+
+.. _CPU Profiling: ../../rados/troubleshooting/cpu-profiling \ No newline at end of file
diff --git a/doc/dev/crimson/crimson.rst b/doc/dev/crimson/crimson.rst
new file mode 100644
index 000000000..954b88a37
--- /dev/null
+++ b/doc/dev/crimson/crimson.rst
@@ -0,0 +1,284 @@
+=======
+crimson
+=======
+
+Crimson is the code name of crimson-osd, which is the next generation ceph-osd.
+It targets fast networking devices, fast storage devices by leveraging state of
+the art technologies like DPDK and SPDK, for better performance. And it will
+keep the support of HDDs and low-end SSDs via BlueStore. Crismon will try to
+be backward compatible with classic OSD.
+
+.. highlight:: console
+
+Building Crimson
+================
+
+Crismon is not enabled by default. To enable it::
+
+ $ WITH_SEASTAR=true ./install-deps.sh
+ $ mkdir build && cd build
+ $ cmake -DWITH_SEASTAR=ON ..
+
+Please note, `ASan`_ is enabled by default if crimson is built from a source
+cloned using git.
+
+Also, Seastar uses its own lockless allocator which does not play well with
+the alien threads. So, to use alienstore / bluestore backend, you might want to
+pass ``-DSeastar_CXX_FLAGS=-DSEASTAR_DEFAULT_ALLOCATOR`` to ``cmake`` when
+configuring this project to use the libc allocator, like::
+
+ $ cmake -DWITH_SEASTAR=ON -DSeastar_CXX_FLAGS=-DSEASTAR_DEFAULT_ALLOCATOR ..
+
+.. _ASan: https://github.com/google/sanitizers/wiki/AddressSanitizer
+
+Running Crimson
+===============
+
+As you might expect, crimson is not featurewise on par with its predecessor yet.
+
+object store backend
+--------------------
+
+At the moment ``crimson-osd`` offers two object store backends:
+
+- CyanStore: CyanStore is modeled after memstore in classic OSD.
+- AlienStore: AlienStore is short for Alienized BlueStore.
+
+Seastore is still under active development.
+
+daemonize
+---------
+
+Unlike ``ceph-osd``, ``crimson-osd`` does daemonize itself even if the
+``daemonize`` option is enabled. Because, to read this option, ``crimson-osd``
+needs to ready its config sharded service, but this sharded service lives
+in the seastar reactor. If we fork a child process and exit the parent after
+starting the Seastar engine, that will leave us with a single thread which is
+the replica of the thread calls `fork()`_. This would unnecessarily complicate
+the code, if we would have tackled this problem in crimson.
+
+Since a lot of GNU/Linux distros are using systemd nowadays, which is able to
+daemonize the application, there is no need to daemonize by ourselves. For
+those who are using sysvinit, they can use ``start-stop-daemon`` for daemonizing
+``crimson-osd``. If this is not acceptable, we can whip up a helper utility
+to do the trick.
+
+
+.. _fork(): http://pubs.opengroup.org/onlinepubs/9699919799/functions/fork.html
+
+logging
+-------
+
+Currently, ``crimson-osd`` uses the logging utility offered by Seastar. see
+``src/common/dout.h`` for the mapping between different logging levels to
+the severity levels in Seastar. For instance, the messages sent to ``derr``
+will be printed using ``logger::error()``, and the messages with debug level
+over ``20`` will be printed using ``logger::trace()``.
+
++---------+---------+
+| ceph | seastar |
++---------+---------+
+| < 0 | error |
++---------+---------+
+| 0 | warn |
++---------+---------+
+| [1, 5) | info |
++---------+---------+
+| [5, 20] | debug |
++---------+---------+
+| > 20 | trace |
++---------+---------+
+
+Please note, ``crimson-osd``
+does not send the logging message to specified ``log_file``. It writes
+the logging messages to stdout and/or syslog. Again, this behavior can be
+changed using ``--log-to-stdout`` and ``--log-to-syslog`` command line
+options. By default, ``log-to-stdout`` is enabled, and the latter disabled.
+
+
+vstart.sh
+---------
+
+To facilitate the development of crimson, following options would be handy when
+using ``vstart.sh``,
+
+``--crimson``
+ start ``crimson-osd`` instead of ``ceph-osd``
+
+``--nodaemon``
+ do not daemonize the service
+
+``--redirect-output``
+ redirect the stdout and stderr of service to ``out/$type.$num.stdout``.
+
+``--osd-args``
+ pass extra command line options to crimson-osd or ceph-osd. It's quite
+ useful for passing Seastar options to crimson-osd. For instance, you could
+ use ``--osd-args "--memory 2G"`` to set the memory to use. Please refer
+ the output of::
+
+ crimson-osd --help-seastar
+
+ for more Seastar specific command line options.
+
+``--memstore``
+ use the CyanStore as the object store backend.
+
+``--bluestore``
+ use the AlienStore as the object store backend. This is the default setting,
+ if not specified otherwise.
+
+So, a typical command to start a single-crimson-node cluster is::
+
+ $ MGR=1 MON=1 OSD=1 MDS=0 RGW=0 ../src/vstart.sh -n -x \
+ --without-dashboard --memstore \
+ --crimson --nodaemon --redirect-output \
+ --osd-args "--memory 4G"
+
+Where we assign 4 GiB memory, a single thread running on core-0 to crimson-osd.
+
+You could stop the vstart cluster using::
+
+ $ ../src/stop.sh --crimson
+
+
+CBT Based Testing
+=================
+
+We can use `cbt`_ for performing perf tests::
+
+ $ git checkout master
+ $ make crimson-osd
+ $ ../src/script/run-cbt.sh --cbt ~/dev/cbt -a /tmp/baseline ../src/test/crimson/cbt/radosbench_4K_read.yaml
+ $ git checkout yet-another-pr
+ $ make crimson-osd
+ $ ../src/script/run-cbt.sh --cbt ~/dev/cbt -a /tmp/yap ../src/test/crimson/cbt/radosbench_4K_read.yaml
+ $ ~/dev/cbt/compare.py -b /tmp/baseline -a /tmp/yap -v
+ 19:48:23 - INFO - cbt - prefill/gen8/0: bandwidth: (or (greater) (near 0.05)):: 0.183165/0.186155 => accepted
+ 19:48:23 - INFO - cbt - prefill/gen8/0: iops_avg: (or (greater) (near 0.05)):: 46.0/47.0 => accepted
+ 19:48:23 - WARNING - cbt - prefill/gen8/0: iops_stddev: (or (less) (near 0.05)):: 10.4403/6.65833 => rejected
+ 19:48:23 - INFO - cbt - prefill/gen8/0: latency_avg: (or (less) (near 0.05)):: 0.340868/0.333712 => accepted
+ 19:48:23 - INFO - cbt - prefill/gen8/1: bandwidth: (or (greater) (near 0.05)):: 0.190447/0.177619 => accepted
+ 19:48:23 - INFO - cbt - prefill/gen8/1: iops_avg: (or (greater) (near 0.05)):: 48.0/45.0 => accepted
+ 19:48:23 - INFO - cbt - prefill/gen8/1: iops_stddev: (or (less) (near 0.05)):: 6.1101/9.81495 => accepted
+ 19:48:23 - INFO - cbt - prefill/gen8/1: latency_avg: (or (less) (near 0.05)):: 0.325163/0.350251 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/0: bandwidth: (or (greater) (near 0.05)):: 1.24654/1.22336 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/0: iops_avg: (or (greater) (near 0.05)):: 319.0/313.0 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/0: iops_stddev: (or (less) (near 0.05)):: 0.0/0.0 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/0: latency_avg: (or (less) (near 0.05)):: 0.0497733/0.0509029 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/1: bandwidth: (or (greater) (near 0.05)):: 1.22717/1.11372 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/1: iops_avg: (or (greater) (near 0.05)):: 314.0/285.0 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/1: iops_stddev: (or (less) (near 0.05)):: 0.0/0.0 => accepted
+ 19:48:23 - INFO - cbt - seq/gen8/1: latency_avg: (or (less) (near 0.05)):: 0.0508262/0.0557337 => accepted
+ 19:48:23 - WARNING - cbt - 1 tests failed out of 16
+
+Where we compile and run the same test against two branches. One is ``master``, another is ``yet-another-pr`` branch.
+And then we compare the test results. Along with every test case, a set of rules is defined to check if we have
+performance regressions when comparing two set of test results. If a possible regression is found, the rule and
+corresponding test results are highlighted.
+
+.. _cbt: https://github.com/ceph/cbt
+
+Hacking Crimson
+===============
+
+
+Seastar Documents
+-----------------
+
+See `Seastar Tutorial <https://github.com/scylladb/seastar/blob/master/doc/tutorial.md>`_ .
+Or build a browsable version and start an HTTP server::
+
+ $ cd seastar
+ $ ./configure.py --mode debug
+ $ ninja -C build/debug docs
+ $ python3 -m http.server -d build/debug/doc/html
+
+You might want to install ``pandoc`` and other dependencies beforehand.
+
+Debugging Crimson
+=================
+
+Debugging with GDB
+------------------
+
+The `tips`_ for debugging Scylla also apply to Crimson.
+
+.. _tips: https://github.com/scylladb/scylla/blob/master/docs/debugging.md#tips-and-tricks
+
+Human-readable backtraces with addr2line
+----------------------------------------
+
+When a seastar application crashes, it leaves us with a serial of addresses, like::
+
+ Segmentation fault.
+ Backtrace:
+ 0x00000000108254aa
+ 0x00000000107f74b9
+ 0x00000000105366cc
+ 0x000000001053682c
+ 0x00000000105d2c2e
+ 0x0000000010629b96
+ 0x0000000010629c31
+ 0x00002a02ebd8272f
+ 0x00000000105d93ee
+ 0x00000000103eff59
+ 0x000000000d9c1d0a
+ /lib/x86_64-linux-gnu/libc.so.6+0x000000000002409a
+ 0x000000000d833ac9
+ Segmentation fault
+
+``seastar-addr2line`` offered by Seastar can be used to decipher these
+addresses. After running the script, it will be waiting for input from stdin,
+so we need to copy and paste the above addresses, then send the EOF by inputting
+``control-D`` in the terminal::
+
+ $ ../src/seastar/scripts/seastar-addr2line -e bin/crimson-osd
+
+ 0x00000000108254aa
+ 0x00000000107f74b9
+ 0x00000000105366cc
+ 0x000000001053682c
+ 0x00000000105d2c2e
+ 0x0000000010629b96
+ 0x0000000010629c31
+ 0x00002a02ebd8272f
+ 0x00000000105d93ee
+ 0x00000000103eff59
+ 0x000000000d9c1d0a
+ 0x00000000108254aa
+ [Backtrace #0]
+ seastar::backtrace_buffer::append_backtrace() at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:1136
+ seastar::print_with_backtrace(seastar::backtrace_buffer&) at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:1157
+ seastar::print_with_backtrace(char const*) at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:1164
+ seastar::sigsegv_action() at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:5119
+ seastar::install_oneshot_signal_handler<11, &seastar::sigsegv_action>()::{lambda(int, siginfo_t*, void*)#1}::operator()(int, siginfo_t*, void*) const at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:5105
+ seastar::install_oneshot_signal_handler<11, &seastar::sigsegv_action>()::{lambda(int, siginfo_t*, void*)#1}::_FUN(int, siginfo_t*, void*) at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:5101
+ ?? ??:0
+ seastar::smp::configure(boost::program_options::variables_map, seastar::reactor_config) at /home/kefu/dev/ceph/build/../src/seastar/src/core/reactor.cc:5418
+ seastar::app_template::run_deprecated(int, char**, std::function<void ()>&&) at /home/kefu/dev/ceph/build/../src/seastar/src/core/app-template.cc:173 (discriminator 5)
+ main at /home/kefu/dev/ceph/build/../src/crimson/osd/main.cc:131 (discriminator 1)
+
+Please note, ``seastar-addr2line`` is able to extract the addresses from
+the input, so you can also paste the log messages like::
+
+ 2020-07-22T11:37:04.500 INFO:teuthology.orchestra.run.smithi061.stderr:Backtrace:
+ 2020-07-22T11:37:04.500 INFO:teuthology.orchestra.run.smithi061.stderr: 0x0000000000e78dbc
+ 2020-07-22T11:37:04.501 INFO:teuthology.orchestra.run.smithi061.stderr: 0x0000000000e3e7f0
+ 2020-07-22T11:37:04.501 INFO:teuthology.orchestra.run.smithi061.stderr: 0x0000000000e3e8b8
+ 2020-07-22T11:37:04.501 INFO:teuthology.orchestra.run.smithi061.stderr: 0x0000000000e3e985
+ 2020-07-22T11:37:04.501 INFO:teuthology.orchestra.run.smithi061.stderr: /lib64/libpthread.so.0+0x0000000000012dbf
+
+Unlike classic OSD, crimson does not print a human-readable backtrace when it
+handles fatal signals like `SIGSEGV` or `SIGABRT`. And it is more complicated
+when it comes to a stripped binary. So before planting a signal handler for
+those signals in crimson, we could to use `script/ceph-debug-docker.sh` to parse
+the addresses in the backtrace::
+
+ # assuming you are under the source tree of ceph
+ $ ./src/script/ceph-debug-docker.sh --flavor crimson master:27e237c137c330ebb82627166927b7681b20d0aa centos:8
+ ....
+ [root@3deb50a8ad51 ~]# wget -q https://raw.githubusercontent.com/scylladb/seastar/master/scripts/seastar-addr2line
+ [root@3deb50a8ad51 ~]# dnf install -q -y file
+ [root@3deb50a8ad51 ~]# python3 seastar-addr2line -e /usr/bin/crimson-osd
+ # paste the backtrace here
diff --git a/doc/dev/crimson/error-handling.rst b/doc/dev/crimson/error-handling.rst
new file mode 100644
index 000000000..43017457d
--- /dev/null
+++ b/doc/dev/crimson/error-handling.rst
@@ -0,0 +1,158 @@
+==============
+error handling
+==============
+
+
+In Seastar, a ``future`` represents a value not yet available but that can become
+available later. ``future`` can have one of following states:
+
+* unavailable: value is not available yet,
+* value,
+* failed: an exception was thrown when computing the value. This exception has
+ been captured and stored in the ``future`` instance via ``std::exception_ptr``.
+
+In the last case, the exception can be processed using ``future::handle_exception()`` or
+``future::handle_exception_type()``. Seastar even provides ``future::or_terminate()`` to
+terminate the program if the future fails.
+
+But in Crimson, quite a few errors are not serious enough to fail the program entirely.
+For instance, if we try to look up an object by its object id, and that operation could
+fail because the object does not exist or it is corrupted, we need to recover that object
+for fulfilling the request instead of terminating the process.
+
+In other words, these errors are expected. Moreover, the performance of the unhappy path
+should also be on par with that of the happy path. Also, we want to have a way to ensure
+that all expected errors are handled. It should be something like the statical analysis
+performed by compiler to spit a warning if any enum value is not handled in a ``switch-case``
+statement.
+
+Unfortunately, ``seastar::future`` is not able to satisfy these two requirements.
+
+* Seastar imposes re-throwing an exception to dispatch between different types of
+ exceptions. This is not very performant nor even scalable as locking in the language's
+ runtime can occur.
+* Seastar does not encode the expected exception type in the type of the returned
+ ``seastar::future``. Only the type of the value is encoded. This imposes huge
+ mental load on programmers as ensuring that all intended errors are indeed handled
+ requires manual code audit.
+
+.. highlight:: c++
+
+So, "errorator" is created. It is a wrapper around the vanilla ``seastar::future``.
+It addresses the performance and scalability issues while embedding the information
+about all expected types-of-errors to the type-of-future.::
+
+ using ertr = crimson::errorator<crimson::ct_error::enoent,
+ crimson::ct_error::einval>;
+
+In above example we defined an errorator that allows for two error types:
+
+* ``crimson::ct_error::enoent`` and
+* ``crimson::ct_error::einval``.
+
+These (and other ones in the ``crimson::ct_error`` namespace) are basically
+unthrowable wrappers over ``std::error_code`` to exclude accidental throwing
+and ensure signaling errors in a way that enables compile-time checking.
+
+The most fundamental thing in an errorator is a descendant of ``seastar::future``
+which can be used as e.g. function's return type::
+
+ static ertr::future<int> foo(int bar) {
+ if (bar == 42) {
+ return crimson::ct_error::einval::make();
+ } else {
+ return ertr::make_ready_future(bar);
+ }
+ }
+
+It's worth to note that returning an error that is not a part the errorator's error set
+would result in a compile-time error::
+
+ static ertr::future<int> foo(int bar) {
+ // Oops, input_output_error is not allowed in `ertr`. static_assert() will
+ // terminate the compilation. This behaviour is absolutely fundamental for
+ // callers -- to figure out about all possible errors they need to worry
+ // about is enough to just take a look on the function's signature; reading
+ // through its implementation is not necessary anymore!
+ return crimson::ct_error::input_output_error::make();
+ }
+
+The errorator concept goes further. It not only provides callers with the information
+about all potential errors embedded in the function's type; it also ensures at the caller
+site that all these errors are handled. As the reader probably know, the main method
+in ``seastar::future`` is ``then()``. On errorated future it is available but only if errorator's
+error set is empty (literally: ``errorator<>::future``); otherwise callers have
+to use ``safe_then()`` instead::
+
+ seastar::future<> baz() {
+ return foo(42).safe_then(
+ [] (const int bar) {
+ std::cout << "the optimistic path! got bar=" << bar << std::endl
+ return ertr::now();
+ },
+ ertr::all_same_way(const std::error_code& err) {
+ // handling errors removes them from errorator's error set
+ std::cout << "the error path! got err=" << err << std::endl;
+ return ertr::now();
+ }).then([] {
+ // as all errors have been handled, errorator's error set became
+ // empty and the future instance returned from `safe_then()` has
+ // `then()` available!
+ return seastar::now();
+ });
+ }
+
+In the above example ``ertr::all_same_way`` has been used to handle all errors in the same
+manner. This is not obligatory -- a caller can handle each of them separately. Moreover,
+it can provide a handler for only a subset of errors. The price for that is the availability
+of ``then()``::
+
+ using einval_ertr = crimson::errorator<crimson::ct_error::einval>;
+
+ // we can't return seastar::future<> (aka errorator<>::future<>) as handling
+ // as this level deals only with enoent leaving einval without a handler.
+ // handling it becomes a responsibility of a caller of `baz()`.
+ einval_ertr::future<> baz() {
+ return foo(42).safe_then(
+ [] (const int bar) {
+ std::cout << "the optimistic path! got bar=" << bar << std::endl
+ return ertr::now();
+ },
+ // provide a handler only for crimson::ct_error::enoent.
+ // crimson::ct_error::einval stays unhandled!
+ crimson::ct_error::enoent::handle([] {
+ std::cout << "the enoent error path!" << std::endl;
+ return ertr::now();
+ }));
+ // .safe_then() above returned `errorator<crimson::ct_error::einval>::future<>`
+ // which lacks `then()`.
+ }
+
+That is, handling errors removes them from errorated future's error set. This works
+in the opposite direction too -- returning new errors in ``safe_then()`` appends them
+the error set. Of course, this set must be compliant with error set in the ``baz()``'s
+signature::
+
+ using broader_ertr = crimson::errorator<crimson::ct_error::enoent,
+ crimson::ct_error::einval,
+ crimson::ct_error::input_output_error>;
+
+ broader_ertr::future<> baz() {
+ return foo(42).safe_then(
+ [] (const int bar) {
+ std::cout << "oops, the optimistic path generates a new error!";
+ return crimson::ct_error::input_output_error::make();
+ },
+ // we have a special handler to delegate the handling up. For conveience,
+ // the same behaviour is available as single argument-taking variant of
+ // `safe_then()`.
+ ertr::pass_further{});
+ }
+
+As it can be seen, handling and signaling errors in ``safe_then()`` is basically
+an operation on the error set checked at compile-time.
+
+More details can be found in `the slides from ceph::errorator<> throw/catch-free,
+compile time-checked exceptions for seastar::future<>
+<https://www.slideshare.net/ScyllaDB/cepherrorator-throwcatchfree-compile-timechecked-exceptions-for-seastarfuture>`_
+presented at the Seastar Summit 2019.
diff --git a/doc/dev/crimson/index.rst b/doc/dev/crimson/index.rst
new file mode 100644
index 000000000..55f071825
--- /dev/null
+++ b/doc/dev/crimson/index.rst
@@ -0,0 +1,11 @@
+===============================
+Crimson developer documentation
+===============================
+
+.. rubric:: Contents
+
+.. toctree::
+ :glob:
+
+ *
+
diff --git a/doc/dev/crimson/poseidonstore.rst b/doc/dev/crimson/poseidonstore.rst
new file mode 100644
index 000000000..3fbefd04b
--- /dev/null
+++ b/doc/dev/crimson/poseidonstore.rst
@@ -0,0 +1,586 @@
+===============
+ PoseidonStore
+===============
+
+Key concepts and goals
+======================
+
+* As one of the pluggable backend stores for Crimson, PoseidonStore targets only
+ high-end NVMe SSDs (not concerned with ZNS devices).
+* Designed entirely for low CPU consumption
+
+ - Hybrid update strategies for different data types (in-place, out-of-place) to
+ minimize CPU consumption by reducing host-side GC.
+ - Remove a black-box component like RocksDB and a file abstraction layer in BlueStore
+ to avoid unnecessary overheads (e.g., data copy and serialization/deserialization)
+ - Utilize NVMe feature (atomic large write command, Atomic Write Unit Normal).
+ Make use of io_uring, new kernel asynchronous I/O interface, to selectively use the interrupt
+ driven mode for CPU efficiency (or polled mode for low latency).
+* Sharded data/processing model
+
+Background
+----------
+
+Both in-place and out-of-place update strategies have their pros and cons.
+
+* Log-structured store
+
+ Log-structured based storage system is a typical example that adopts an update-out-of-place approach.
+ It never modifies the written data. Writes always go to the end of the log. It enables I/O sequentializing.
+
+ * Pros
+
+ - Without a doubt, one sequential write is enough to store the data
+ - It naturally supports transaction (this is no overwrite, so the store can rollback
+ previous stable state)
+ - Flash friendly (it mitigates GC burden on SSDs)
+ * Cons
+
+ - There is host-side GC that induces overheads
+
+ - I/O amplification (host-side)
+ - More host-CPU consumption
+
+ - Slow metadata lookup
+ - Space overhead (live and unused data co-exist)
+
+* In-place update store
+
+ The update-in-place strategy has been used widely for conventional file systems such as ext4 and xfs.
+ Once a block has been placed in a given disk location, it doesn't move.
+ Thus, writes go to the corresponding location in the disk.
+
+ * Pros
+
+ - Less host-CPU consumption (No host-side GC is required)
+ - Fast lookup
+ - No additional space for log-structured, but there is internal fragmentation
+ * Cons
+
+ - More writes occur to record the data (metadata and data section are separated)
+ - It cannot support transaction. Some form of WAL required to ensure update atomicity
+ in the general case
+ - Flash unfriendly (Give more burdens on SSDs due to device-level GC)
+
+Motivation and Key idea
+-----------------------
+
+In modern distributed storage systems, a server node can be equipped with multiple
+NVMe storage devices. In fact, ten or more NVMe SSDs could be attached on a server.
+As a result, it is hard to achieve NVMe SSD's full performance due to the limited CPU resources
+available in a server node. In such environments, CPU tends to become a performance bottleneck.
+Thus, now we should focus on minimizing host-CPU consumption, which is the same as the Crimson's objective.
+
+Towards an object store highly optimized for CPU consumption, three design choices have been made.
+
+* **PoseidonStore does not have a black-box component like RocksDB in BlueStore.**
+
+ Thus, it can avoid unnecessary data copy and serialization/deserialization overheads.
+ Moreover, we can remove an unnecessary file abstraction layer, which was required to run RocksDB.
+ Object data and metadata is now directly mapped to the disk blocks.
+ Eliminating all these overheads will reduce CPU consumption (e.g., pre-allocation, NVME atomic feature).
+
+* **PoseidonStore uses hybrid update strategies for different data size, similar to BlueStore.**
+
+ As we discussed, both in-place and out-of-place update strategies have their pros and cons.
+ Since CPU is only bottlenecked under small I/O workloads, we chose update-in-place for small I/Os to mininize CPU consumption
+ while choosing update-out-of-place for large I/O to avoid double write. Double write for small data may be better than host-GC overhead
+ in terms of CPU consumption in the long run. Although it leaves GC entirely up to SSDs,
+
+* **PoseidonStore makes use of io_uring, new kernel asynchronous I/O interface to exploit interrupt-driven I/O.**
+
+ User-space driven I/O solutions like SPDK provide high I/O performance by avoiding syscalls and enabling zero-copy
+ access from the application. However, it does not support interrupt-driven I/O, which is only possible with kernel-space driven I/O.
+ Polling is good for low-latency but bad for CPU efficiency. On the other hand, interrupt is good for CPU efficiency and bad for
+ low-latency (but not that bad as I/O size increases). Note that network acceleration solutions like DPDK also excessively consume
+ CPU resources for polling. Using polling both for network and storage processing aggravates CPU consumption.
+ Since network is typically much faster and has a higher priority than storage, polling should be applied only to network processing.
+
+high-end NVMe SSD has enough powers to handle more works. Also, SSD lifespan is not a practical concern these days
+(there is enough program-erase cycle limit [#f1]_). On the other hand, for large I/O workloads, the host can afford process host-GC.
+Also, the host can garbage collect invalid objects more effectively when their size is large
+
+Observation
+-----------
+
+Two data types in Ceph
+
+* Data (object data)
+
+ - The cost of double write is high
+ - The best mehod to store this data is in-place update
+
+ - At least two operations required to store the data: 1) data and 2) location of
+ data. Nevertheless, a constant number of operations would be better than out-of-place
+ even if it aggravates WAF in SSDs
+
+* Metadata or small data (e.g., object_info_t, snapset, pg_log, and collection)
+
+ - Multiple small-sized metadata entries for an object
+ - The best solution to store this data is WAL + Using cache
+
+ - The efficient way to store metadata is to merge all metadata related to data
+ and store it though a single write operation even though it requires background
+ flush to update the data partition
+
+
+Design
+======
+.. ditaa::
+
+ +-WAL partition-|----------------------Data partition-------------------------------+
+ | Sharded partition |
+ +-----------------------------------------------------------------------------------+
+ | WAL -> | | Super block | Freelist info | Onode radix tree info| Data blocks |
+ +-----------------------------------------------------------------------------------+
+ | Sharded partition 2
+ +-----------------------------------------------------------------------------------+
+ | WAL -> | | Super block | Freelist info | Onode radix tree info| Data blocks |
+ +-----------------------------------------------------------------------------------+
+ | Sharded partition N
+ +-----------------------------------------------------------------------------------+
+ | WAL -> | | Super block | Freelist info | Onode radix tree info| Data blocks |
+ +-----------------------------------------------------------------------------------+
+ | Global information (in reverse order)
+ +-----------------------------------------------------------------------------------+
+ | Global WAL -> | | SB | Freelist | |
+ +-----------------------------------------------------------------------------------+
+
+
+* WAL
+
+ - Log, metadata and small data are stored in the WAL partition
+ - Space within the WAL partition is continually reused in a circular manner
+ - Flush data to trim WAL as necessary
+* Disk layout
+
+ - Data blocks are metadata blocks or data blocks
+ - Freelist manages the root of free space B+tree
+ - Super block contains management info for a data partition
+ - Onode radix tree info contains the root of onode radix tree
+
+
+I/O procedure
+-------------
+* Write
+
+ For incoming writes, data is handled differently depending on the request size;
+ data is either written twice (WAL) or written in a log-structured manner.
+
+ #. If Request Size ≤ Threshold (similar to minimum allocation size in BlueStore)
+
+ Write data and metadata to [WAL] —flush—> Write them to [Data section (in-place)] and
+ [Metadata section], respectively.
+
+ Since the CPU becomes the bottleneck for small I/O workloads, in-place update scheme is used.
+ Double write for small data may be better than host-GC overhead in terms of CPU consumption
+ in the long run
+ #. Else if Request Size > Threshold
+
+ Append data to [Data section (log-structure)] —> Write the corresponding metadata to [WAL]
+ —flush—> Write the metadata to [Metadata section]
+
+ For large I/O workloads, the host can afford process host-GC
+ Also, the host can garbage collect invalid objects more effectively when their size is large
+
+ Note that Threshold can be configured to a very large number so that only the scenario (1) occurs.
+ With this design, we can control the overall I/O procedure with the optimizations for crimson
+ as described above.
+
+ * Detailed flow
+
+ We make use of a NVMe write command which provides atomicity guarantees (Atomic Write Unit Power Fail)
+ For example, 512 Kbytes of data can be atomically written at once without fsync().
+
+ * stage 1
+
+ - if the data is small
+ WAL (written) --> | TxBegin A | Log Entry | TxEnd A |
+ Append a log entry that contains pg_log, snapset, object_infot_t and block allocation
+ using NVMe atomic write command on the WAL
+ - if the data is large
+ Data partition (written) --> | Data blocks |
+ * stage 2
+
+ - if the data is small
+ No need.
+ - if the data is large
+ Then, append the metadata to WAL.
+ WAL --> | TxBegin A | Log Entry | TxEnd A |
+
+* Read
+
+ - Use the cached object metadata to find out the data location
+ - If not cached, need to search WAL after checkpoint and Object meta partition to find the
+ latest meta data
+
+* Flush (WAL --> Data partition)
+
+ - Flush WAL entries that have been committed. There are two conditions
+ (1. the size of WAL is close to full, 2. a signal to flush).
+ We can mitigate the overhead of frequent flush via batching processing, but it leads to
+ delaying completion.
+
+
+Crash consistency
+------------------
+
+* Large case
+
+ #. Crash occurs right after writing Data blocks
+
+ - Data partition --> | Data blocks |
+ - We don't need to care this case. Data is not alloacted yet in reality. The blocks will be reused.
+ #. Crash occurs right after WAL
+
+ - Data partition --> | Data blocks |
+ - WAL --> | TxBegin A | Log Entry | TxEnd A |
+ - Write procedure is completed, so there is no data loss or inconsistent state
+
+* Small case
+
+ #. Crash occurs right after writing WAL
+
+ - WAL --> | TxBegin A | Log Entry| TxEnd A |
+ - All data has been written
+
+
+Comparison
+----------
+
+* Best case (pre-allocation)
+
+ - Only need writes on both WAL and Data partition without updating object metadata (for the location).
+* Worst case
+
+ - At least three writes are required additionally on WAL, object metadata, and data blocks.
+ - If the flush from WAL to the data parition occurs frequently, radix tree onode structure needs to be update
+ in many times. To minimize such overhead, we can make use of batch processing to minimize the update on the tree
+ (the data related to the object has a locality because it will have the same parent node, so updates can be minimized)
+
+* WAL needs to be flushed if the WAL is close to full or a signal to flush.
+
+ - The premise behind this design is OSD can manage the latest metadata as a single copy. So,
+ appended entries are not to be read
+* Either best of the worst case does not produce severe I/O amplification (it produce I/Os, but I/O rate is constant)
+ unlike LSM-tree DB (the proposed design is similar to LSM-tree which has only level-0)
+
+
+Detailed Design
+===============
+
+* Onode lookup
+
+ * Radix tree
+ Our design is entirely based on the prefix tree. Ceph already makes use of the characteristic of OID's prefix to split or search
+ the OID (e.g., pool id + hash + oid). So, the prefix tree fits well to store or search the object. Our scheme is designed
+ to lookup the prefix tree efficiently.
+
+ * Sharded partition
+ A few bits (leftmost bits of the hash) of the OID determine a sharded partition where the object is located.
+ For example, if the number of partitions is configured as four, The entire space of the hash in hobject_t
+ can be divided into four domains (0x0xxx ~ 0x3xxx, 0x4xxx ~ 0x7xxx, 0x8xxx ~ 0xBxxx and 0xCxxx ~ 0xFxxx).
+
+ * Ondisk onode
+
+ .. code-block:: c
+
+ stuct onode {
+ extent_tree block_maps;
+ b+_tree omaps;
+ map xattrs;
+ }
+
+ onode contains the radix tree nodes for lookup, which means we can search for objects using tree node information in onode.
+ Also, if the data size is small, the onode can embed the data and xattrs.
+ The onode is fixed size (256 or 512 byte). On the other hands, omaps and block_maps are variable-length by using pointers in the onode.
+
+ .. ditaa::
+
+ +----------------+------------+--------+
+ | on\-disk onode | block_maps | omaps |
+ +----------+-----+------------+--------+
+ | ^ ^
+ | | |
+ +-----------+---------+
+
+
+ * Lookup
+ The location of the root of onode tree is specified on Onode radix tree info, so we can find out where the object
+ is located by using the root of prefix tree. For example, shared partition is determined by OID as described above.
+ Using the rest of the OID's bits and radix tree, lookup procedure find outs the location of the onode.
+ The extent tree (block_maps) contains where data chunks locate, so we finally figure out the data location.
+
+
+* Allocation
+
+ * Sharded partitions
+
+ The entire disk space is divided into several data chunks called sharded partition (SP).
+ Each SP has its own data structures to manage the partition.
+
+ * Data allocation
+
+ As we explained above, the management infos (e.g., super block, freelist info, onode radix tree info) are pre-allocated
+ in each shared partition. Given OID, we can map any data in Data block section to the extent tree in the onode.
+ Blocks can be allocated by searching the free space tracking data structure (we explain below).
+
+ ::
+
+ +-----------------------------------+
+ | onode radix tree root node block |
+ | (Per-SP Meta) |
+ | |
+ | # of records |
+ | left_sibling / right_sibling |
+ | +--------------------------------+|
+ | | keys[# of records] ||
+ | | +-----------------------------+||
+ | | | start onode ID |||
+ | | | ... |||
+ | | +-----------------------------+||
+ | +--------------------------------||
+ | +--------------------------------+|
+ | | ptrs[# of records] ||
+ | | +-----------------------------+||
+ | | | SP block number |||
+ | | | ... |||
+ | | +-----------------------------+||
+ | +--------------------------------+|
+ +-----------------------------------+
+
+ * Free space tracking
+ The freespace is tracked on a per-SP basis. We can use extent-based B+tree in XFS for free space tracking.
+ The freelist info contains the root of free space B+tree. Granularity is a data block in Data blocks partition.
+ The data block is the smallest and fixed size unit of data.
+
+ ::
+
+ +-----------------------------------+
+ | Free space B+tree root node block |
+ | (Per-SP Meta) |
+ | |
+ | # of records |
+ | left_sibling / right_sibling |
+ | +--------------------------------+|
+ | | keys[# of records] ||
+ | | +-----------------------------+||
+ | | | startblock / blockcount |||
+ | | | ... |||
+ | | +-----------------------------+||
+ | +--------------------------------||
+ | +--------------------------------+|
+ | | ptrs[# of records] ||
+ | | +-----------------------------+||
+ | | | SP block number |||
+ | | | ... |||
+ | | +-----------------------------+||
+ | +--------------------------------+|
+ +-----------------------------------+
+
+* Omap and xattr
+ In this design, omap and xattr data is tracked by b+tree in onode. The onode only has the root node of b+tree.
+ The root node contains entires which indicate where the key onode exists.
+ So, if we know the onode, omap can be found via omap b+tree.
+
+* Fragmentation
+
+ - Internal fragmentation
+
+ We pack different types of data/metadata in a single block as many as possible to reduce internal fragmentation.
+ Extent-based B+tree may help reduce this further by allocating contiguous blocks that best fit for the object
+
+ - External fragmentation
+
+ Frequent object create/delete may lead to external fragmentation
+ In this case, we need cleaning work (GC-like) to address this.
+ For this, we are referring the NetApp’s Continuous Segment Cleaning, which seems similar to the SeaStore’s approach
+ Countering Fragmentation in an Enterprise Storage System (NetApp, ACM TOS, 2020)
+
+.. ditaa::
+
+
+ +---------------+-------------------+-------------+
+ | Freelist info | Onode radix tree | Data blocks +-------+
+ +---------------+---------+---------+-+-----------+ |
+ | | |
+ +--------------------+ | |
+ | | |
+ | OID | |
+ | | |
+ +---+---+ | |
+ | Root | | |
+ +---+---+ | |
+ | | |
+ v | |
+ /-----------------------------\ | |
+ | Radix tree | | v
+ +---------+---------+---------+ | /---------------\
+ | onode | ... | ... | | | Num Chunk |
+ +---------+---------+---------+ | | |
+ +--+ onode | ... | ... | | | <Offset, len> |
+ | +---------+---------+---------+ | | <Offset, len> +-------+
+ | | | ... | |
+ | | +---------------+ |
+ | | ^ |
+ | | | |
+ | | | |
+ | | | |
+ | /---------------\ /-------------\ | | v
+ +->| onode | | onode |<---+ | /------------+------------\
+ +---------------+ +-------------+ | | Block0 | Block1 |
+ | OID | | OID | | +------------+------------+
+ | Omaps | | Omaps | | | Data | Data |
+ | Data Extent | | Data Extent +-----------+ +------------+------------+
+ +---------------+ +-------------+
+
+WAL
+---
+Each SP has a WAL.
+The datas written to the WAL are metadata updates, free space update and small data.
+Note that only data smaller than the predefined threshold needs to be written to the WAL.
+The larger data is written to the unallocated free space and its onode's extent_tree is updated accordingly
+(also on-disk extent tree). We statically allocate WAL partition aside from data partition pre-configured.
+
+
+Partition and Reactor thread
+----------------------------
+In early stage development, PoseidonStore will employ static allocation of partition. The number of sharded partitions
+is fixed and the size of each partition also should be configured before running cluster.
+But, the number of partitions can grow as below. We leave this as a future work.
+Also, each reactor thread has a static set of SPs.
+
+.. ditaa::
+
+ +------+------+-------------+------------------+
+ | SP 1 | SP N | --> <-- | global partition |
+ +------+------+-------------+------------------+
+
+
+
+Cache
+-----
+There are mainly two cache data structures; onode cache and block cache.
+It looks like below.
+
+#. Onode cache:
+ lru_map <OID, OnodeRef>;
+#. Block cache (data and omap):
+ Data cache --> lru_map <paddr, value>
+
+To fill the onode data structure, the target onode needs to be retrieved using the prefix tree.
+Block cache is used for caching a block contents. For a transaction, all the updates to blocks
+(including object meta block, data block) are first performed in the in-memory block cache.
+After writing a transaction to the WAL, the dirty blocks are flushed to their respective locations in the
+respective partitions.
+PoseidonStore can configure cache size for each type. Simple LRU cache eviction strategy can be used for both.
+
+
+Sharded partitions (with cross-SP transaction)
+----------------------------------------------
+The entire disk space is divided into a number of chunks called sharded partitions (SP).
+The prefixes of the parent collection ID (original collection ID before collection splitting. That is, hobject.hash)
+is used to map any collections to SPs.
+We can use BlueStore's approach for collection splitting, changing the number of significant bits for the collection prefixes.
+Because the prefixes of the parent collection ID do not change even after collection splitting, the mapping between
+the collection and SP are maintained.
+The number of SPs may be configured to match the number of CPUs allocated for each disk so that each SP can hold
+a number of objects large enough for cross-SP transaction not to occur.
+
+In case of need of cross-SP transaction, we could use the global WAL. The coordinator thread (mainly manages global partition) handles
+cross-SP transaction via acquire the source SP and target SP locks before processing the cross-SP transaction.
+Source and target probably are blocked.
+
+For the load unbalanced situation,
+Poseidonstore can create partitions to make full use of entire space efficiently and provide load balaning.
+
+
+CoW/Clone
+---------
+As for CoW/Clone, a clone has its own onode like other normal objects.
+
+Although each clone has its own onode, data blocks should be shared between the original object and clones
+if there are no changes on them to minimize the space overhead.
+To do so, the reference count for the data blocks is needed to manage those shared data blocks.
+
+To deal with the data blocks which has the reference count, poseidon store makes use of shared_blob
+which maintains the referenced data block.
+
+As shown the figure as below,
+the shared_blob tracks the data blocks shared between other onodes by using a reference count.
+The shared_blobs are managed by shared_blob_list in the superblock.
+
+
+.. ditaa::
+
+
+ /----------\ /----------\
+ | Object A | | Object B |
+ +----------+ +----------+
+ | Extent | | Extent |
+ +---+--+---+ +--+----+--+
+ | | | |
+ | | +----------+ |
+ | | | |
+ | +---------------+ |
+ | | | |
+ v v v v
+ +---------------+---------------+
+ | Data block 1 | Data block 2 |
+ +-------+-------+------+--------+
+ | |
+ v v
+ /---------------+---------------\
+ | shared_blob 1 | shared_blob 2 |
+ +---------------+---------------+ shared_blob_list
+ | refcount | refcount |
+ +---------------+---------------+
+
+Plans
+=====
+
+All PRs should contain unit tests to verify its minimal functionality.
+
+* WAL and block cache implementation
+
+ As a first step, we are going to build the WAL including the I/O procedure to read/write the WAL.
+ With WAL development, the block cache needs to be developed together.
+ Besides, we are going to add an I/O library to read/write from/to the NVMe storage to
+ utilize NVMe feature and the asynchronous interface.
+
+* Radix tree and onode
+
+ First, submit a PR against this file with a more detailed on disk layout and lookup strategy for the onode radix tree.
+ Follow up with implementation based on the above design once design PR is merged.
+ The second PR will be the implementation regarding radix tree which is the key structure to look up
+ objects.
+
+* Extent tree
+
+ This PR is the extent tree to manage data blocks in the onode. We build the extent tree, and
+ demonstrate how it works when looking up the object.
+
+* B+tree for omap
+
+ We will put together a simple key/value interface for omap. This probably will be a separate PR.
+
+* CoW/Clone
+
+ To support CoW/Clone, shared_blob and shared_blob_list will be added.
+
+* Integration to Crimson as to I/O interfaces
+
+ At this stage, interfaces for interacting with Crimson such as queue_transaction(), read(), clone_range(), etc.
+ should work right.
+
+* Configuration
+
+ We will define Poseidon store configuration in detail.
+
+* Stress test environment and integration to teuthology
+
+ We will add stress tests and teuthology suites.
+
+.. rubric:: Footnotes
+
+.. [#f1] Stathis Maneas, Kaveh Mahdaviani, Tim Emami, Bianca Schroeder: A Study of SSD Reliability in Large Scale Enterprise Storage Deployments. FAST 2020: 137-149
diff --git a/doc/dev/cxx.rst b/doc/dev/cxx.rst
new file mode 100644
index 000000000..e8ab9d4de
--- /dev/null
+++ b/doc/dev/cxx.rst
@@ -0,0 +1,27 @@
+C++17 and libstdc++ ABI
+=======================
+
+Ceph has switched over to C++17 in mimic. To build Ceph on old distros without
+GCC-7, it is required to install GCC-7 from additionary repos. On RHEL/CentOS,
+we are using devtoolset-7_ from SCLs_ for building Ceph. But devltoolset-7 is
+always using the old ABI_ even if ``_GLIBCXX_USE_CXX11_ABI=1`` is defined. So,
+on RHEL/CentOS, the old implementations of ``std::string`` and ``std::list``
+are still used. In other words, ``std::string`` is still copy-on-write, and
+``std::list::size()`` is still O(n) on these distros. But on Ubuntu Xenial,
+Ceph is built using the new ABI. So, because we are still using libstdc++ and
+devtoolset for building packages on RHEL/CentOS, please do not rely on the
+behavior of the new ABI or the old one.
+
+For those who argue that "GCC supports dual ABI!", here comes the long story.
+The problem is in the system shared library and ``libstdc++_nonshared.a`` model.
+If some symbol is exported from the system shared library, we must use that, and
+cannot override it. Also, the dual ABI support requires several of the system
+shared library symbols to behave differently (e.g. for locale facets, need
+to register twice as many, one set for old ABI, another for new ABI). So, this
+leaves us with no options but to stick with the old ABI, if we want to enable
+the built binaries to run on old distros where only the libstdc++ with the old
+ABI is available.
+
+.. _ABI: https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
+.. _devtoolset-7: https://www.softwarecollections.org/en/scls/rhscl/devtoolset-7/
+.. _SCLs: https://www.softwarecollections.org/
diff --git a/doc/dev/dashboard/ui_goals.rst b/doc/dev/dashboard/ui_goals.rst
new file mode 100644
index 000000000..4e68ec1f5
--- /dev/null
+++ b/doc/dev/dashboard/ui_goals.rst
@@ -0,0 +1,78 @@
+===========================
+Ceph Dashboard Design Goals
+===========================
+
+.. note:: This document is intended to provide a focal point for discussing the overall design
+ principles for mgr/dashboard
+
+Introduction
+============
+
+Most distributed storage architectures are inherently complex and can present a management challenge
+to Operations teams who are typically stretched across multiple product and platform disciplines. In
+general terms, the complexity of any solution can have a direct bearing on the operational costs
+incurred to manage it. The answer is simple...make it simple :)
+
+This document is intended to highlight Ceph Dashboard design goals which may help to
+
+* reduce complexity
+* increase productivity
+* improve time-to-value
+* increase observability
+
+
+Understanding the Persona of the Target User
+============================================
+
+Ceph has historically been administered from the CLI. The CLI has always and will always offer the
+richest, most flexible way to install and manage a Ceph cluster. Administrators who require and
+demand this level of control are unlikely to adopt a UI for anything more than a technical curiosity.
+
+The relevance of the UI is therefore more critical for a new SysAdmin, where it can help technology
+adoption and reduce the operational friction that is normally experienced when implementing a new
+solution.
+
+Understanding the target user persona is therefore a fundamental first step in design. Attempting to
+design a UI that meets the requirements of a 'seasoned' Ceph Administrator or Developer, and a
+relatively new SysAdmin is unlikely to satisfy either user group.
+
+Design Principles
+=================
+
+Key Principles
+______________
+
+
+#. **Clarity and consistency**. The UI should ensure the data shown is unambiguous and consistent across
+ different views
+#. **Data timeliness**. Data displayed in the UI must be timely. State information **must** be reasonably
+ recent for it to be relevant and acted upon with confidence. In addition, the age of the data should
+ be shown as age (e.g. 20s ago) rather than UTC timestamps to make it more immediately consumable by
+ the Administrator.
+#. **Automate through workflows**. If the admin has to follow a 'recipe' to perform a task, the goal of
+ the dashboard UI should be to implement the flow.
+#. **Provide a natural next step**. The UI **is** the *expert system*, so instead of expecting the user
+ to know where they go next, the UI should lead them. This means linking components together to
+ establish a flow and deeper integration between the alertmanager implementation and the dashboard
+ elements enabling an Admin to efficiently step from alert to affected component.
+#. **Platform visibility**. The platform (OS and hardware configuration) is a fundamental component of the
+ solution, so providing platform level insights can help deliver a more holistic view of the Ceph cluster.
+#. **Jargon Busting**. Jargon is an unavoidable component of most systems. However, a good system will
+ include inline help to support new and infrequent users of the UI.
+
+
+Common Pitfalls
+_______________
+
+* Don't re-implement CLI commands in the UI. The sysadmin will likely use the CLI primitives in scripts
+ to automate tasks, so by simply adding a CLI feature we miss the workflow and add complexity, which
+ potentially 'bloats' the UI.
+* Don't think like a developer...try and adopt the mindset of an Administrator, who only works with the
+ Ceph cluster part-time - this is the reality for today's Operations teams.
+
+
+Focus On User Experience
+========================
+Ultimately, the goal must be to move away from pushing complexity onto the GUI user through multi-step
+workflows like iSCSI configuration or setting specific cluster flags in defined sequences. Simplicity
+should be the goal for the UI...let's leave the complexity to the CLI.
diff --git a/doc/dev/delayed-delete.rst b/doc/dev/delayed-delete.rst
new file mode 100644
index 000000000..31f3e6b97
--- /dev/null
+++ b/doc/dev/delayed-delete.rst
@@ -0,0 +1,13 @@
+=========================
+ CephFS delayed deletion
+=========================
+
+The deletion of a file does not immediately remove its data. Each of the file's
+underlying objects must be removed independently. If these objects were removed
+immediately, the client would have to send ``size_of_file / stripe_size *
+replication_count`` messages. This would consume significant bandwith and would
+slow the client unacceptably. If snapshots exist, their existence can prevent
+the deletion of objects associated with them.
+
+In these cases, such files are (1) marked as deleted on the MDS and (2) deleted
+lazily.
diff --git a/doc/dev/dev_cluster_deployement.rst b/doc/dev/dev_cluster_deployement.rst
new file mode 100644
index 000000000..69185e7f0
--- /dev/null
+++ b/doc/dev/dev_cluster_deployement.rst
@@ -0,0 +1,169 @@
+=================================
+ Deploying a development cluster
+=================================
+
+In order to develop on ceph, a Ceph utility,
+*vstart.sh*, allows you to deploy fake local cluster for development purpose.
+
+Usage
+=====
+
+It allows to deploy a fake local cluster on your machine for development purpose. It starts rgw, mon, osd and/or mds, or all of them if not specified.
+
+To start your development cluster, type the following::
+
+ vstart.sh [OPTIONS]...
+
+In order to stop the cluster, you can type::
+
+ ./stop.sh
+
+Options
+=======
+
+.. option:: -b, --bluestore
+
+ Use bluestore as the objectstore backend for osds.
+
+.. option:: --cache <pool>
+
+ Set a cache-tier for the specified pool.
+
+.. option:: -d, --debug
+
+ Launch in debug mode.
+
+.. option:: -e
+
+ Create an erasure pool.
+
+.. option:: -f, --filestore
+
+ Use filestore as the osd objectstore backend.
+
+.. option:: --hitset <pool> <hit_set_type>
+
+ Enable hitset tracking.
+
+.. option:: -i ip_address
+
+ Bind to the specified *ip_address* instead of guessing and resolve from hostname.
+
+.. option:: -k
+
+ Keep old configuration files instead of overwritting theses.
+
+.. option:: -K, --kstore
+
+ Use kstore as the osd objectstore backend.
+
+.. option:: -l, --localhost
+
+ Use localhost instead of hostname.
+
+.. option:: -m ip[:port]
+
+ Specifies monitor *ip* address and *port*.
+
+.. option:: --memstore
+
+ Use memstore as the objectstore backend for osds
+
+.. option:: --multimds <count>
+
+ Allow multimds with maximum active count.
+
+.. option:: -n, --new
+
+ Create a new cluster.
+
+.. option:: -N, --not-new
+
+ Reuse existing cluster config (default).
+
+.. option:: --nodaemon
+
+ Use ceph-run as wrapper for mon/osd/mds.
+
+.. option:: --nolockdep
+
+ Disable lockdep
+
+.. option:: -o <config>
+
+ Add *config* to all sections in the ceph configuration.
+
+.. option:: --rgw_port <port>
+
+ Specify ceph rgw http listen port.
+
+.. option:: --rgw_frontend <frontend>
+
+ Specify the rgw frontend configuration (default is civetweb).
+
+.. option:: --rgw_compression <compression_type>
+
+ Specify the rgw compression plugin (default is disabled).
+
+.. option:: --smallmds
+
+ Configure mds with small limit cache size.
+
+.. option:: --short
+
+ Short object names only; necessary for ext4 dev
+
+.. option:: --valgrind[_{osd,mds,mon}] 'valgrind_toolname [args...]'
+
+ Launch the osd/mds/mon/all the ceph binaries using valgrind with the specified tool and arguments.
+
+.. option:: --without-dashboard
+
+ Do not run using mgr dashboard.
+
+.. option:: -x
+
+ Enable cephx (on by default).
+
+.. option:: -X
+
+ Disable cephx.
+
+
+Environment variables
+=====================
+
+{OSD,MDS,MON,RGW}
+
+Theses environment variables will contains the number of instances of the desired ceph process you want to start.
+
+Example: ::
+
+ OSD=3 MON=3 RGW=1 vstart.sh
+
+
+============================================================
+ Deploying multiple development clusters on the same machine
+============================================================
+
+In order to bring up multiple ceph clusters on the same machine, *mstart.sh* a
+small wrapper around the above *vstart* can help.
+
+Usage
+=====
+
+To start multiple clusters, you would run mstart for each cluster you would want
+to deploy, and it will start monitors, rgws for each cluster on different ports
+allowing you to run multiple mons, rgws etc. on the same cluster. Invoke it in
+the following way::
+
+ mstart.sh <cluster-name> <vstart options>
+
+For eg::
+
+ ./mstart.sh cluster1 -n
+
+
+For stopping the cluster, you do::
+
+ ./mstop.sh <cluster-name>
diff --git a/doc/dev/developer_guide/basic-workflow.rst b/doc/dev/developer_guide/basic-workflow.rst
new file mode 100644
index 000000000..5917b56be
--- /dev/null
+++ b/doc/dev/developer_guide/basic-workflow.rst
@@ -0,0 +1,515 @@
+.. _basic workflow dev guide:
+
+Basic Workflow
+==============
+
+The following chart illustrates the basic Ceph development workflow:
+
+.. ditaa::
+
+ Upstream Code Your Local Environment
+
+ /----------\ git clone /-------------\
+ | Ceph | -------------------------> | ceph/main |
+ \----------/ \-------------/
+ ^ |
+ | | git branch fix_1
+ | git merge |
+ | v
+ /----------------\ git commit --amend /-------------\
+ | make check |---------------------> | ceph/fix_1 |
+ | ceph--qa--suite| \-------------/
+ \----------------/ |
+ ^ | fix changes
+ | | test changes
+ | review | git commit
+ | |
+ | v
+ /--------------\ /-------------\
+ | github |<---------------------- | ceph/fix_1 |
+ | pull request | git push \-------------/
+ \--------------/
+
+This page assumes that you are a new contributor with an idea for a bugfix or
+an enhancement, but you do not know how to proceed. Watch the `Getting Started
+with Ceph Development <https://www.youtube.com/watch?v=t5UIehZ1oLs>`_ video for
+a practical summary of this workflow.
+
+Updating the tracker
+--------------------
+
+Find the :ref:`issue-tracker` (Redmine) number of the bug you intend to fix. If
+no tracker issue exists, create one. There is only one case in which you do not
+have to create a Redmine tracker issue: the case of minor documentation changes.
+
+Simple documentation cleanup does not require a corresponding tracker issue.
+Major documenatation changes do require a tracker issue. Major documentation
+changes include adding new documentation chapters or files, and making
+substantial changes to the structure or content of the documentation.
+
+A (Redmine) tracker ticket explains the issue (bug) to other Ceph developers to
+keep them informed as the bug nears resolution. Provide a useful, clear title
+and include detailed information in the description. When composing the title
+of the ticket, ask yourself "If I need to search for this ticket two years from
+now, which keywords am I likely to search for?" Then include those keywords in
+the title.
+
+If your tracker permissions are elevated, assign the bug to yourself by setting
+the ``Assignee`` field. If your tracker permissions have not been elevated,
+just add a comment with a short message that says "I am working on this issue".
+
+Ceph Workflow Overview
+----------------------
+
+Three repositories are involved in the Ceph workflow. They are:
+
+1. The upstream repository (ceph/ceph)
+2. Your fork of the upstream repository (your_github_id/ceph)
+3. Your local working copy of the repository (on your workstation)
+
+The procedure for making changes to the Ceph repository is as follows:
+
+#. Configure your local environment
+
+ #. :ref:`Create a fork<forking>` of the "upstream Ceph"
+ repository.
+
+ #. :ref:`Clone the fork<cloning>` to your local filesystem.
+
+#. Fix the bug
+
+ #. :ref:`Synchronize local main with upstream main<synchronizing>`.
+
+ #. :ref:`Create a bugfix branch<bugfix_branch>` in your local working copy.
+
+ #. :ref:`Make alterations to the local working copy of the repository in your
+ local filesystem<fixing_bug_locally>`.
+
+ #. :ref:`Push the changes in your local working copy to your fork<push_changes>`.
+
+#. Create a Pull Request to push the change upstream
+
+ #. Create a Pull Request that asks for your changes to be added into the
+ "upstream Ceph" repository.
+
+Preparing Your Local Working Copy of the Ceph Repository
+--------------------------------------------------------
+
+The procedures in this section, "Preparing Your Local Working Copy of the Ceph
+Repository", must be followed only when you are first setting up your local
+environment. If this is your first time working with the Ceph project, then
+these commands are necessary and are the first commands that you should run.
+
+.. _forking:
+
+Creating a Fork of the Ceph Repository
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+See the `GitHub documentation
+<https://help.github.com/articles/fork-a-repo/#platform-linux>`_ for
+detailed instructions on forking. In short, if your GitHub username is
+"mygithubaccount", your fork of the upstream repo will appear at
+``https://github.com/mygithubaccount/ceph``.
+
+.. _cloning:
+
+Cloning Your Fork
+^^^^^^^^^^^^^^^^^
+
+After you have created your fork, clone it by running the following command:
+
+.. prompt:: bash $
+
+ git clone https://github.com/mygithubaccount/ceph
+
+You must fork the Ceph repository before you clone it. If you fail to fork,
+you cannot open a `GitHub pull request
+<https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request>`_.
+
+For more information on using GitHub, refer to `GitHub Help
+<https://help.github.com/>`_.
+
+Configuring Your Local Environment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The commands in this section configure your local git environment so that it
+generates "Signed-off-by:" tags. These commands also set up your local
+environment so that it can stay synchronized with the upstream repository.
+
+These commands are necessary only during the initial setup of your local
+working copy. Another way to say that is "These commands are necessary
+only the first time that you are working with the Ceph repository. They are,
+however, unavoidable, and if you fail to run them then you will not be able
+to work on the Ceph repository.".
+
+1. Configure your local git environment with your name and email address.
+
+ .. note::
+ These commands will work only from within the ``ceph/`` directory
+ that was created when you cloned your fork.
+
+ .. prompt:: bash $
+
+ git config user.name "FIRST_NAME LAST_NAME"
+ git config user.email "MY_NAME@example.com"
+
+2. Add the upstream repo as a "remote" and fetch it:
+
+ .. prompt:: bash $
+
+ git remote add ceph https://github.com/ceph/ceph.git
+ git fetch ceph
+
+ These commands fetch all the branches and commits from ``ceph/ceph.git`` to
+ the local git repo as ``remotes/ceph/$BRANCH_NAME`` and can be referenced as
+ ``ceph/$BRANCH_NAME`` in local git commands.
+
+Fixing the Bug
+--------------
+
+.. _synchronizing:
+
+Synchronizing Local Main with Upstream Main
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In your local working copy, there is a copy of the ``main`` branch in
+``remotes/origin/main``. This is called "local main". This copy of the
+main branch (https://github.com/your_github_id/ceph.git) is "frozen in time"
+at the moment that you cloned it, but the upstream repo
+(https://github.com/ceph/ceph.git, typically abbreviated to ``ceph/ceph.git``)
+that it was forked from is not frozen in time: the upstream repo is still being
+updated by other contributors.
+
+Because upstream main is continually receiving updates from other
+contributors, your fork will drift farther and farther from the state of the
+upstream repo when you cloned it.
+
+Keep your fork's ``main`` branch synchronized with upstream main to reduce drift
+between your fork's main branch and the upstream main branch.
+
+Here are the commands for keeping your fork synchronized with the
+upstream repository:
+
+.. prompt:: bash $
+
+ git fetch ceph
+ git checkout main
+ git reset --hard ceph/main
+ git push -u origin main
+
+Follow this procedure often to keep your local ``main`` in sync with upstream
+``main``.
+
+If the command ``git status`` returns a line that reads "Untracked files", see
+:ref:`the procedure on updating submodules <update-submodules>`.
+
+.. _bugfix_branch:
+
+Creating a Bugfix branch
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Create a branch for your bugfix:
+
+.. prompt:: bash $
+
+ git checkout main
+ git checkout -b fix_1
+ git push -u origin fix_1
+
+The first command (git checkout main) makes sure that the bugfix branch
+"fix_1" is created from the most recent state of the main branch of the
+upstream repository.
+
+The second command (git checkout -b fix_1) creates a "bugfix branch" called
+"fix_1" in your local working copy of the repository. The changes that you make
+in order to fix the bug will be commited to this branch.
+
+The third command (git push -u origin fix_1) pushes the bugfix branch from
+your local working repository to your fork of the upstream repository.
+
+.. _fixing_bug_locally:
+
+Fixing the bug in the local working copy
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+#. **Updating the tracker**
+
+ In the `Ceph issue tracker <https://tracker.ceph.com>`_, change the status
+ of the tracker issue to "In progress". This communicates to other Ceph
+ contributors that you have begun working on a fix, which helps to avoid
+ duplication of effort. If you don't have permission to change that field,
+ just comment that you are working on the issue.
+
+#. **Fixing the bug itself**
+
+ This guide cannot tell you how to fix the bug that you have chosen to fix.
+ This guide assumes that you know what required improvement, and that you
+ know what to do to provide that improvement.
+
+ It might be that your fix is simple and requires only minimal testing. But
+ that's unlikely. It is more likely that the process of fixing your bug will
+ be iterative and will involve trial, error, skill, and patience.
+
+ For a detailed discussion of the tools available for validating bugfixes,
+ see the chapters on testing.
+
+Pushing the Fix to Your Fork
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You have finished work on the bugfix. You have tested the bugfix, and you
+believe that it works.
+
+#. Commit the changes to your local working copy.
+
+ Commit the changes to the `fix_1` branch of your local working copy by using
+ the ``--signoff`` option (here represented as the `s` portion of the `-as`
+ flag):
+
+ .. prompt:: bash $
+
+ git commit -as
+
+ .. _push_changes:
+
+#. Push the changes to your fork:
+
+ Push the changes from the `fix_1` branch of your local working copy to the
+ `fix_1` branch of your fork of the upstream repository:
+
+ .. prompt:: bash $
+
+ git push origin fix_1
+
+ .. note::
+
+ In the command ``git push origin fix_1``, ``origin`` is the name of your
+ fork of the upstream Ceph repository, and can be thought of as a nickname
+ for ``git@github.com:username/ceph.git``, where ``username`` is your
+ GitHub username.
+
+ It is possible that ``origin`` is not the name of your fork. Discover the
+ name of your fork by running ``git remote -v``, as shown here:
+
+ .. code-block:: bash
+
+ $ git remote -v
+ ceph https://github.com/ceph/ceph.git (fetch)
+ ceph https://github.com/ceph/ceph.git (push)
+ origin git@github.com:username/ceph.git (fetch)
+ origin git@github.com:username/ceph.git (push)
+
+ The line::
+
+ origin git@github.com:username/ceph.git (fetch)
+
+ and the line::
+
+ origin git@github.com:username/ceph.git (push)
+
+ provide the information that "origin" is the name of your fork of the
+ Ceph repository.
+
+
+Opening a GitHub pull request
+-----------------------------
+
+After you have pushed the bugfix to your fork, open a GitHub pull request
+(PR). This makes your bugfix visible to the community of Ceph contributors.
+They will review it. They may perform additional testing on your bugfix, and
+they might request changes to the bugfix.
+
+Be prepared to receive suggestions and constructive criticism in the form of
+comments within the PR.
+
+If you don't know how to create and manage pull requests, read `this GitHub
+pull request tutorial`_.
+
+.. _`this GitHub pull request tutorial`:
+ https://help.github.com/articles/using-pull-requests/
+
+To learn what constitutes a "good" pull request, see
+the `Git Commit Good Practice`_ article at the `OpenStack Project Wiki`_.
+
+.. _`Git Commit Good Practice`: https://wiki.openstack.org/wiki/GitCommitMessages
+.. _`OpenStack Project Wiki`: https://wiki.openstack.org/wiki/Main_Page
+
+See also our own `Submitting Patches
+<https://github.com/ceph/ceph/blob/main/SubmittingPatches.rst>`_ document.
+
+After your pull request (PR) has been opened, update the :ref:`issue-tracker`
+by adding a comment directing other contributors to your PR. The comment can be
+as simple as this::
+
+ *PR*: https://github.com/ceph/ceph/pull/$NUMBER_OF_YOUR_PULL_REQUEST
+
+Understanding Automated PR validation
+-------------------------------------
+
+When you create or update your PR, the Ceph project's `Continuous Integration
+(CI) <https://en.wikipedia.org/wiki/Continuous_integration>`_ infrastructure
+automatically tests it. At the time of this writing (May 2022), the automated
+CI testing included many tests. These five are among them:
+
+#. a test to check that the commits are properly signed (see :ref:`submitting-patches`):
+#. a test to check that the documentation builds
+#. a test to check that the submodules are unmodified
+#. a test to check that the API is in order
+#. a :ref:`make check<make-check>` test
+
+Additional tests may be run depending on which files your PR modifies.
+
+The :ref:`make check<make-check>` test builds the PR and runs it through a
+battery of tests. These tests run on servers that are operated by the Ceph
+Continuous Integration (CI) team. When the tests have completed their run, the
+result is shown on GitHub in the pull request itself.
+
+Test your modifications before you open a PR. Refer to the chapters
+on testing for details.
+
+Notes on PR make check test
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The GitHub :ref:`make check<make-check>` test is driven by a Jenkins instance.
+
+Jenkins merges your PR branch into the latest version of the base branch before
+it starts any tests. This means that you don't have to rebase the PR in order
+to pick up any fixes.
+
+You can trigger PR tests at any time by adding a comment to the PR - the
+comment should contain the string "test this please". Since a human who is
+subscribed to the PR might interpret that as a request for him or her to test
+the PR, you must address Jenkins directly. For example, write "jenkins retest
+this please". If you need to run only one of the tests, you can request it with
+a command like "jenkins test signed". A list of these requests is automatically
+added to the end of each new PR's description, so check there to find the
+single test you need.
+
+If there is a build failure and you aren't sure what caused it, check the
+:ref:`make check<make-check>` log. To access the make check log, click the
+"details" (next to the :ref:`make check<make-check>` test in the PR) link to
+enter the Jenkins web GUI. Then click "Console Output" (on the left).
+
+Jenkins is configured to search logs for strings that are known to have been
+associated with :ref:`make check<make-check>` failures in the past. However,
+there is no guarantee that these known strings are associated with any given
+:ref:`make check<make-check>` failure. You'll have to read through the log to
+determine the cause of your specific failure.
+
+Integration tests AKA ceph-qa-suite
+-----------------------------------
+
+Since Ceph is complex, it may be necessary to test your fix to
+see how it behaves on real clusters running on physical or virtual
+hardware. Tests designed for this purpose live in the `ceph/qa
+sub-directory`_ and are run via the `teuthology framework`_.
+
+.. _`ceph/qa sub-directory`: https://github.com/ceph/ceph/tree/main/qa/
+.. _`teuthology repository`: https://github.com/ceph/teuthology
+.. _`teuthology framework`: https://github.com/ceph/teuthology
+
+The Ceph community has access to the `Sepia lab
+<https://wiki.sepia.ceph.com/doku.php>`_ where :ref:`testing-integration-tests` can be
+run on physical hardware. Other developers may add tags like "needs-qa" to your
+PR. This allows PRs that need testing to be merged into a single branch and
+tested all at the same time. Since teuthology suites can take hours (even
+days in some cases) to run, this can save a lot of time.
+
+To request access to the Sepia lab, start `here <https://wiki.sepia.ceph.com/doku.php?id=vpnaccess>`_.
+
+Integration testing is discussed in more detail in the :ref:`testing-integration-tests`
+chapter.
+
+Code review
+-----------
+
+Once your bugfix has been thoroughly tested, or even during this process,
+it will be subjected to code review by other developers. This typically
+takes the form of comments in the PR itself, but can be supplemented
+by discussions on :ref:`irc` and the :ref:`mailing-list`.
+
+Amending your PR
+----------------
+
+While your PR is going through testing and `Code Review`_, you can
+modify it at any time by editing files in your local branch.
+
+After updates are committed locally (to the ``fix_1`` branch in our
+example), they need to be pushed to GitHub so they appear in the PR.
+
+Modifying the PR is done by adding commits to the ``fix_1`` branch upon
+which it is based, often followed by rebasing to modify the branch's git
+history. See `this tutorial
+<https://www.atlassian.com/git/tutorials/rewriting-history>`_ for a good
+introduction to rebasing. When you are done with your modifications, you
+will need to force push your branch with:
+
+.. prompt:: bash $
+
+ git push --force origin fix_1
+
+Why do we take these extra steps instead of simply adding additional commits
+the PR? It is best practice for a PR to consist of a single commit; this
+makes for clean history, eases peer review of your changes, and facilitates
+merges. In rare circumstances it also makes it easier to cleanly revert
+changes.
+
+Merging
+-------
+
+The bugfix process completes when a project lead merges your PR.
+
+When this happens, it is a signal for you (or the lead who merged the PR)
+to change the :ref:`issue-tracker` status to "Resolved". Some issues may be
+flagged for backporting, in which case the status should be changed to
+"Pending Backport" (see the :ref:`backporting` chapter for details).
+
+See also :ref:`merging` for more information on merging.
+
+Proper Merge Commit Format
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is the most basic form of a merge commit::
+
+ doc/component: title of the commit
+
+ Reviewed-by: Reviewer Name <rname@example.com>
+
+This consists of two parts:
+
+#. The title of the commit / PR to be merged.
+#. The name and email address of the reviewer. Enclose the reviewer's email
+ address in angle brackets.
+
+Using .githubmap to Find a Reviewer's Email Address
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If you cannot find the email address of the reviewer on his or her GitHub
+page, you can look it up in the **.githubmap** file, which can be found in
+the repository at **/ceph/.githubmap**.
+
+Using "git log" to find a Reviewer's Email Address
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If you cannot find a reviewer's email address by using the above methods, you
+can search the git log for their email address. Reviewers are likely to have
+committed something before. If they have made previous contributions, the git
+log will probably contain their email address.
+
+Use the following command
+
+.. prompt:: bash [branch-under-review]$
+
+ git log
+
+Using ptl-tool to Generate Merge Commits
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Another method of generating merge commits involves using Patrick Donnelly's
+**ptl-tool** pull commits. This tool can be found at
+**/ceph/src/script/ptl-tool.py**. Merge commits that have been generated by
+the **ptl-tool** have the following form::
+
+ Merge PR #36257 into main
+ * refs/pull/36257/head:
+ client: move client_lock to _unmount()
+ client: add timer_lock support
+ Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
+
diff --git a/doc/dev/developer_guide/dash-devel.rst b/doc/dev/developer_guide/dash-devel.rst
new file mode 100644
index 000000000..29974555a
--- /dev/null
+++ b/doc/dev/developer_guide/dash-devel.rst
@@ -0,0 +1,2590 @@
+.. _dashdevel:
+
+Ceph Dashboard Developer Documentation
+======================================
+
+.. contents:: Table of Contents
+
+Feature Design
+--------------
+
+To promote collaboration on new Ceph Dashboard features, the first step is
+the definition of a design document. These documents then form the basis of
+implementation scope and permit wider participation in the evolution of the
+Ceph Dashboard UI.
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Design Documents:
+
+ UI Design Goals <../dashboard/ui_goals>
+
+
+Preliminary Steps
+-----------------
+
+The following documentation chapters expect a running Ceph cluster and at
+least a running ``dashboard`` manager module (with few exceptions). This
+chapter gives an introduction on how to set up such a system for development,
+without the need to set up a full-blown production environment. All options
+introduced in this chapter are based on a so called ``vstart`` environment.
+
+.. note::
+
+ Every ``vstart`` environment needs Ceph `to be compiled`_ from its Github
+ repository, though Docker environments simplify that step by providing a
+ shell script that contains those instructions.
+
+ One exception to this rule are the `build-free`_ capabilities of
+ `ceph-dev`_. See below for more information.
+
+.. _to be compiled: https://docs.ceph.com/docs/master/install/build-ceph/
+
+vstart
+~~~~~~
+
+"vstart" is actually a shell script in the ``src/`` directory of the Ceph
+repository (``src/vstart.sh``). It is used to start a single node Ceph
+cluster on the machine where it is executed. Several required and some
+optional Ceph internal services are started automatically when it is used to
+start a Ceph cluster. vstart is the basis for the three most commonly used
+development environments in Ceph Dashboard.
+
+You can read more about vstart in `Deploying a development cluster`_.
+Additional information for developers can also be found in the `Developer
+Guide`_.
+
+.. _Deploying a development cluster: https://docs.ceph.com/docs/master/dev/dev_cluster_deployement/
+.. _Developer Guide: https://docs.ceph.com/docs/master/dev/quick_guide/
+
+Host-based vs Docker-based Development Environments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This document introduces you to three different development environments, all
+based on vstart. Those are:
+
+- vstart running on your host system
+
+- vstart running in a Docker environment
+
+ * ceph-dev-docker_
+ * ceph-dev_
+
+ Besides their independent development branches and sometimes slightly
+ different approaches, they also differ with respect to their underlying
+ operating systems.
+
+ ========= ====================== ========
+ Release ceph-dev-docker ceph-dev
+ ========= ====================== ========
+ Mimic openSUSE Leap 15 CentOS 7
+ Nautilus openSUSE Leap 15 CentOS 7
+ Octopus openSUSE Leap 15.2 CentOS 8
+ --------- ---------------------- --------
+ Master openSUSE Tumbleweed CentOS 8
+ ========= ====================== ========
+
+.. note::
+
+ Independently of which of these environments you will choose, you need to
+ compile Ceph in that environment. If you compiled Ceph on your host system,
+ you would have to recompile it on Docker to be able to switch to a Docker
+ based solution. The same is true vice versa. If you previously used a
+ Docker development environment and compiled Ceph there and you now want to
+ switch to your host system, you will also need to recompile Ceph (or
+ compile Ceph using another separate repository).
+
+ `ceph-dev`_ is an exception to this rule as one of the options it provides
+ is `build-free`_. This is accomplished through a Ceph installation using
+ RPM system packages. You will still be able to work with a local Github
+ repository like you are used to.
+
+
+Development environment on your host system
+...........................................
+
+- No need to learn or have experience with Docker, jump in right away.
+
+- Limited amount of scripts to support automation (like Ceph compilation).
+
+- No pre-configured easy-to-start services (Prometheus, Grafana, etc).
+
+- Limited amount of host operating systems supported, depending on which
+ Ceph version is supposed to be used.
+
+- Dependencies need to be installed on your host.
+
+- You might find yourself in the situation where you need to upgrade your
+ host operating system (for instance due to a change of the GCC version used
+ to compile Ceph).
+
+
+Development environments based on Docker
+........................................
+
+- Some overhead in learning Docker if you are not used to it yet.
+
+- Both Docker projects provide you with scripts that help you getting started
+ and automate recurring tasks.
+
+- Both Docker environments come with partly pre-configured external services
+ which can be used to attach to or complement Ceph Dashboard features, like
+
+ - Prometheus
+ - Grafana
+ - Node-Exporter
+ - Shibboleth
+ - HAProxy
+
+- Works independently of the operating system you use on your host.
+
+
+.. _build-free: https://github.com/rhcs-dashboard/ceph-dev#quick-install-rpm-based
+
+vstart on your host system
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The vstart script is usually called from your `build/` directory like so:
+
+.. code::
+
+ ../src/vstart.sh -n -d
+
+In this case ``-n`` ensures that a new vstart cluster is created and that a
+possibly previously created cluster isn't re-used. ``-d`` enables debug
+messages in log files. There are several more options to chose from. You can
+get a list using the ``--help`` argument.
+
+At the end of the output of vstart, there should be information about the
+dashboard and its URLs::
+
+ vstart cluster complete. Use stop.sh to stop. See out/* (e.g. 'tail -f out/????') for debug output.
+
+ dashboard urls: https://192.168.178.84:41259, https://192.168.178.84:43259, https://192.168.178.84:45259
+ w/ user/pass: admin / admin
+ restful urls: https://192.168.178.84:42259, https://192.168.178.84:44259, https://192.168.178.84:46259
+ w/ user/pass: admin / 598da51f-8cd1-4161-a970-b2944d5ad200
+
+During development (especially in backend development), you also want to
+check on occasions if the dashboard manager module is still running. To do so
+you can call `./bin/ceph mgr services` manually. It will list all the URLs of
+successfully enabled services. Only URLs of services which are available over
+HTTP(S) will be listed there. Ceph Dashboard is one of these services. It
+should look similar to the following output:
+
+.. code::
+
+ $ ./bin/ceph mgr services
+ {
+ "dashboard": "https://home:41931/",
+ "restful": "https://home:42931/"
+ }
+
+By default, this environment uses a randomly chosen port for Ceph Dashboard
+and you need to use this command to find out which one it has become.
+
+Docker
+~~~~~~
+
+Docker development environments usually ship with a lot of useful scripts.
+``ceph-dev-docker`` for instance contains a file called `start-ceph.sh`,
+which cleans up log files, always starts a Rados Gateway service, sets some
+Ceph Dashboard configuration options and automatically runs a frontend proxy,
+all before or after starting up your vstart cluster.
+
+Instructions on how to use those environments are contained in their
+respective repository README files.
+
+- ceph-dev-docker_
+- ceph-dev_
+
+.. _ceph-dev-docker: https://github.com/ricardoasmarques/ceph-dev-docker
+.. _ceph-dev: https://github.com/rhcs-dashboard/ceph-dev
+
+Frontend Development
+--------------------
+
+Before you can start the dashboard from within a development environment, you
+will need to generate the frontend code and either use a compiled and running
+Ceph cluster (e.g. started by ``vstart.sh``) or the standalone development web
+server.
+
+The build process is based on `Node.js <https://nodejs.org/>`_ and requires the
+`Node Package Manager <https://www.npmjs.com/>`_ ``npm`` to be installed.
+
+Prerequisites
+~~~~~~~~~~~~~
+
+ * Node 12.18.2 or higher
+ * NPM 6.13.4 or higher
+
+nodeenv:
+ During Ceph's build we create a virtualenv with ``node`` and ``npm``
+ installed, which can be used as an alternative to installing node/npm in your
+ system.
+
+ If you want to use the node installed in the virtualenv you just need to
+ activate the virtualenv before you run any npm commands. To activate it run
+ ``. build/src/pybind/mgr/dashboard/node-env/bin/activate``.
+
+ Once you finish, you can simply run ``deactivate`` and exit the virtualenv.
+
+Angular CLI:
+ If you do not have the `Angular CLI <https://github.com/angular/angular-cli>`_
+ installed globally, then you need to execute ``ng`` commands with an
+ additional ``npm run`` before it.
+
+Package installation
+~~~~~~~~~~~~~~~~~~~~
+
+Run ``npm ci`` in directory ``src/pybind/mgr/dashboard/frontend`` to
+install the required packages locally.
+
+Adding or updating packages
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Run the following commands to add/update a package::
+
+ npm install <PACKAGE_NAME>
+ npm ci
+
+Setting up a Development Server
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Create the ``proxy.conf.json`` file based on ``proxy.conf.json.sample``.
+
+Run ``npm start`` for a dev server.
+Navigate to ``http://localhost:4200/``. The app will automatically
+reload if you change any of the source files.
+
+Code Scaffolding
+~~~~~~~~~~~~~~~~
+
+Run ``ng generate component component-name`` to generate a new
+component. You can also use
+``ng generate directive|pipe|service|class|guard|interface|enum|module``.
+
+Build the Project
+~~~~~~~~~~~~~~~~~
+
+Run ``npm run build`` to build the project. The build artifacts will be
+stored in the ``dist/`` directory. Use the ``--prod`` flag for a
+production build (``npm run build -- --prod``). Navigate to ``https://localhost:8443``.
+
+Build the Code Documentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Run ``npm run doc-build`` to generate code docs in the ``documentation/``
+directory. To make them accessible locally for a web browser, run
+``npm run doc-serve`` and they will become available at ``http://localhost:8444``.
+With ``npm run compodoc -- <opts>`` you may
+`fully configure it <https://compodoc.app/guides/usage.html>`_.
+
+Code linting and formatting
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use the following tools to lint and format the code in all our TS, SCSS and
+HTML files:
+
+- `codelyzer <http://codelyzer.com/>`_
+- `html-linter <https://github.com/chinchiheather/html-linter>`_
+- `htmllint-cli <https://github.com/htmllint/htmllint-cli>`_
+- `Prettier <https://prettier.io/>`_
+- `TSLint <https://palantir.github.io/tslint/>`_
+- `stylelint <https://stylelint.io/>`_
+
+We added 2 npm scripts to help run these tools:
+
+- ``npm run lint``, will check frontend files against all linters
+- ``npm run fix``, will try to fix all the detected linting errors
+
+Ceph Dashboard and Bootstrap
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Currently we are using Bootstrap on the Ceph Dashboard as a CSS framework. This means that most of our SCSS and HTML
+code can make use of all the utilities and other advantages Bootstrap is offering. In the past we often have used our
+own custom styles and this lead to more and more variables with a single use and double defined variables which
+sometimes are forgotten to be removed or it led to styling be inconsistent because people forgot to change a color or to
+adjust a custom SCSS class.
+
+To get the current version of Bootstrap used inside Ceph please refer to the ``package.json`` and search for:
+
+- ``bootstrap``: For the Bootstrap version used.
+- ``@ng-bootstrap``: For the version of the Angular bindings which we are using.
+
+So for the future please do the following when visiting a component:
+
+- Does this HTML/SCSS code use custom code? - If yes: Is it needed? --> Clean it up before changing the things you want
+ to fix or change.
+- If you are creating a new component: Please make use of Bootstrap as much as reasonably possible! Don't try to
+ reinvent the wheel.
+- If possible please look up if Bootstrap has guidelines on how to extend it properly to do achieve what you want to
+ achieve.
+
+The more bootstrap alike our code is the easier it is to theme, to maintain and the less bugs we will have. Also since
+Bootstrap is a framework which tries to have usability and user experience in mind we increase both points
+exponentially. The biggest benefit of all is that there is less code for us to maintain which makes it easier to read
+for beginners and even more easy for people how are already familiar with the code.
+
+Writing Unit Tests
+~~~~~~~~~~~~~~~~~~
+
+To write unit tests most efficient we have a small collection of tools,
+we use within test suites.
+
+Those tools can be found under
+``src/pybind/mgr/dashboard/frontend/src/testing/``, especially take
+a look at ``unit-test-helper.ts``.
+
+There you will be able to find:
+
+``configureTestBed`` that replaces the initial ``TestBed``
+methods. It takes the same arguments as ``TestBed.configureTestingModule``.
+Using it will run your tests a lot faster in development, as it doesn't
+recreate everything from scratch on every test. To use the default behaviour
+pass ``true`` as the second argument.
+
+``PermissionHelper`` to help determine if
+the correct actions are shown based on the current permissions and selection
+in a list.
+
+``FormHelper`` which makes testing a form a lot easier
+with a few simple methods. It allows you to set a control or multiple
+controls, expect if a control is valid or has an error or just do both with
+one method. Additional you can expect a template element or multiple elements
+to be visible in the rendered template.
+
+Running Unit Tests
+~~~~~~~~~~~~~~~~~~
+
+Run ``npm run test`` to execute the unit tests via `Jest
+<https://facebook.github.io/jest/>`_.
+
+If you get errors on all tests, it could be because `Jest
+<https://facebook.github.io/jest/>`__ or something else was updated.
+There are a few ways how you can try to resolve this:
+
+- Remove all modules with ``rm -rf dist node_modules`` and run ``npm install``
+ again in order to reinstall them
+- Clear the cache of jest by running ``npx jest --clearCache``
+
+Running End-to-End (E2E) Tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use `Cypress <https://www.cypress.io/>`__ to run our frontend E2E tests.
+
+E2E Prerequisites
+.................
+
+You need to previously build the frontend.
+
+In some environments, depending on your user permissions and the CYPRESS_CACHE_FOLDER,
+you might need to run ``npm ci`` with the ``--unsafe-perm`` flag.
+
+You might need to install additional packages to be able to run Cypress.
+Please run ``npx cypress verify`` to verify it.
+
+run-frontend-e2e-tests.sh
+.........................
+
+Our ``run-frontend-e2e-tests.sh`` script is the go to solution when you wish to
+do a full scale e2e run.
+It will verify if everything needed is installed, start a new vstart cluster
+and run the full test suite.
+
+Start all frontend E2E tests by running::
+
+ $ ./run-frontend-e2e-tests.sh
+
+Report:
+ You can follow the e2e report on the terminal and you can find the screenshots
+ of failed test cases by opening the following directory::
+
+ src/pybind/mgr/dashboard/frontend/cypress/screenshots/
+
+Device:
+ You can force the script to use a specific device with the ``-d`` flag::
+
+ $ ./run-frontend-e2e-tests.sh -d <chrome|chromium|electron|docker>
+
+Remote:
+ By default this script will stop and start a new vstart cluster.
+ If you want to run the tests outside the ceph environment, you will need to
+ manually define the dashboard url using ``-r`` and, optionally, credentials
+ (``-u``, ``-p``)::
+
+ $ ./run-frontend-e2e-tests.sh -r <DASHBOARD_URL> -u <E2E_LOGIN_USER> -p <E2E_LOGIN_PWD>
+
+Note:
+ When using docker, as your device, you might need to run the script with sudo
+ permissions.
+
+run-cephadm-e2e-tests.sh
+.........................
+
+``run-cephadm-e2e-tests.sh`` runs a subset of E2E tests to verify that the Dashboard and cephadm as
+Orchestrator backend behave correctly.
+
+Prerequisites: you need to install `KCLI
+<https://kcli.readthedocs.io/en/latest/>`_ and Node.js in your local machine.
+
+Configure KCLI plan requirements::
+
+ $ sudo chown -R $(id -un) /var/lib/libvirt/images
+ $ mkdir -p /var/lib/libvirt/images/ceph-dashboard dashboard
+ $ kcli create pool -p /var/lib/libvirt/images/ceph-dashboard dashboard
+ $ kcli create network -c 192.168.100.0/24 dashboard
+
+Note:
+ This script is aimed to be run as jenkins job so the cleanup is triggered only in a jenkins
+ environment. In local, the user will shutdown the cluster when desired (i.e. after debugging).
+
+Start E2E tests by running::
+
+ $ cd <your/ceph/repo/dir>
+ $ sudo chown -R $(id -un) src/pybind/mgr/dashboard/frontend/{dist,node_modules,src/environments}
+ $ ./src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
+
+Note:
+ In fedora 35, there can occur a permission error when trying to mount the shared_folders. This can be
+ fixed by running::
+
+ $ sudo setfacl -R -m u:qemu:rwx <abs-path-to-your-user-home>
+
+ or also by setting the appropriate permission to your $HOME directory
+
+You can also start a cluster in development mode (so the frontend build starts in watch mode and you
+only have to reload the page for the changes to be reflected) by running::
+
+ $ ./src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh --dev-mode
+
+Note:
+ Add ``--expanded`` if you need a cluster ready to deploy services (one with enough monitor
+ daemons spread across different hosts and enough OSDs).
+
+Test your changes by running:
+
+ $ ./src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
+
+Shutdown the cluster by running:
+
+ $ kcli delete plan -y ceph
+ $ # In development mode, also kill the npm build watch process (e.g., pkill -f "ng build")
+
+Other running options
+.....................
+
+During active development, it is not recommended to run the previous script,
+as it is not prepared for constant file changes.
+Instead you should use one of the following commands:
+
+- ``npm run e2e`` - This will run ``ng serve`` and open the Cypress Test Runner.
+- ``npm run e2e:ci`` - This will run ``ng serve`` and run the Cypress Test Runner once.
+- ``npx cypress run`` - This calls cypress directly and will run the Cypress Test Runner.
+ You need to have a running frontend server.
+- ``npx cypress open`` - This calls cypress directly and will open the Cypress Test Runner.
+ You need to have a running frontend server.
+
+Calling Cypress directly has the advantage that you can use any of the available
+`flags <https://docs.cypress.io/guides/guides/command-line.html#cypress-run>`__
+to customize your test run and you don't need to start a frontend server each time.
+
+Using one of the ``open`` commands, will open a cypress application where you
+can see all the test files you have and run each individually.
+This is going to be run in watch mode, so if you make any changes to test files,
+it will retrigger the test run.
+This cannot be used inside docker, as it requires X11 environment to be able to open.
+
+By default Cypress will look for the web page at ``https://localhost:4200/``.
+If you are serving it in a different URL you will need to configure it by
+exporting the environment variable CYPRESS_BASE_URL with the new value.
+E.g.: ``CYPRESS_BASE_URL=https://localhost:41076/ npx cypress open``
+
+CYPRESS_CACHE_FOLDER
+.....................
+
+When installing cypress via npm, a binary of the cypress app will also be
+downloaded and stored in a cache folder.
+This removes the need to download it every time you run ``npm ci`` or even when
+using cypress in a separate project.
+
+By default Cypress uses ~/.cache to store the binary.
+To prevent changes to the user home directory, we have changed this folder to
+``/ceph/build/src/pybind/mgr/dashboard/cypress``, so when you build ceph or run
+``run-frontend-e2e-tests.sh`` this is the directory Cypress will use.
+
+When using any other command to install or run cypress,
+it will go back to the default directory. It is recommended that you export the
+CYPRESS_CACHE_FOLDER environment variable with a fixed directory, so you always
+use the same directory no matter which command you use.
+
+
+Writing End-to-End Tests
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The PagerHelper class
+.....................
+
+The ``PageHelper`` class is supposed to be used for general purpose code that
+can be used on various pages or suites.
+
+Examples are
+
+- ``navigateTo()`` - Navigates to a specific page and waits for it to load
+- ``getFirstTableCell()`` - returns the first table cell. You can also pass a
+ string with the desired content and it will return the first cell that
+ contains it.
+- ``getTabsCount()`` - returns the amount of tabs
+
+Every method that could be useful on several pages belongs there. Also, methods
+which enhance the derived classes of the PageHelper belong there. A good
+example for such a case is the ``restrictTo()`` decorator. It ensures that a
+method implemented in a subclass of PageHelper is called on the correct page.
+It will also show a developer-friendly warning if this is not the case.
+
+Subclasses of PageHelper
+........................
+
+Helper Methods
+""""""""""""""
+
+In order to make code reusable which is specific for a particular suite, make
+sure to put it in a derived class of the ``PageHelper``. For instance, when
+talking about the pool suite, such methods would be ``create()``, ``exist()``
+and ``delete()``. These methods are specific to a pool but are useful for other
+suites.
+
+Methods that return HTML elements which can only be found on a specific page,
+should be either implemented in the helper methods of the subclass of PageHelper
+or as own methods of the subclass of PageHelper.
+
+Using PageHelpers
+"""""""""""""""""
+
+In any suite, an instance of the specific ``Helper`` class should be
+instantiated and called directly.
+
+.. code:: TypeScript
+
+ const pools = new PoolPageHelper();
+
+ it('should create a pool', () => {
+ pools.exist(poolName, false);
+ pools.navigateTo('create');
+ pools.create(poolName, 8);
+ pools.exist(poolName, true);
+ });
+
+Code Style
+..........
+
+Please refer to the official `Cypress Core Concepts
+<https://docs.cypress.io/guides/core-concepts/introduction-to-cypress.html#Cypress-Can-Be-Simple-Sometimes>`__
+for a better insight on how to write and structure tests.
+
+``describe()`` vs ``it()``
+""""""""""""""""""""""""""
+
+Both ``describe()`` and ``it()`` are function blocks, meaning that any
+executable code necessary for the test can be contained in either block.
+However, Typescript scoping rules still apply, therefore any variables declared
+in a ``describe`` are available to the ``it()`` blocks inside of it.
+
+``describe()`` typically are containers for tests, allowing you to break tests
+into multiple parts. Likewise, any setup that must be made before your tests are
+run can be initialized within the ``describe()`` block. Here is an example:
+
+.. code:: TypeScript
+
+ describe('create, edit & delete image test', () => {
+ const poolName = 'e2e_images_pool';
+
+ before(() => {
+ cy.login();
+ pools.navigateTo('create');
+ pools.create(poolName, 8, 'rbd');
+ pools.exist(poolName, true);
+ });
+
+ beforeEach(() => {
+ cy.login();
+ images.navigateTo();
+ });
+
+ //...
+
+ });
+
+As shown, we can initiate the variable ``poolName`` as well as run commands
+before our test suite begins (creating a pool). ``describe()`` block messages
+should include what the test suite is.
+
+``it()`` blocks typically are parts of an overarching test. They contain the
+functionality of the test suite, each performing individual roles.
+Here is an example:
+
+.. code:: TypeScript
+
+ describe('create, edit & delete image test', () => {
+ //...
+
+ it('should create image', () => {
+ images.createImage(imageName, poolName, '1');
+ images.getFirstTableCell(imageName).should('exist');
+ });
+
+ it('should edit image', () => {
+ images.editImage(imageName, poolName, newImageName, '2');
+ images.getFirstTableCell(newImageName).should('exist');
+ });
+
+ //...
+ });
+
+As shown from the previous example, our ``describe()`` test suite is to create,
+edit and delete an image. Therefore, each ``it()`` completes one of these steps,
+one for creating, one for editing, and so on. Likewise, every ``it()`` blocks
+message should be in lowercase and written so long as "it" can be the prefix of
+the message. For example, ``it('edits the test image' () => ...)`` vs.
+``it('image edit test' () => ...)``. As shown, the first example makes
+grammatical sense with ``it()`` as the prefix whereas the second message does
+not. ``it()`` should describe what the individual test is doing and what it
+expects to happen.
+
+Differences between Frontend Unit Tests and End-to-End (E2E) Tests / FAQ
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+General introduction about testing and E2E/unit tests
+
+
+What are E2E/unit tests designed for?
+.....................................
+
+E2E test:
+
+It requires a fully functional system and tests the interaction of all components
+of the application (Ceph, back-end, front-end).
+E2E tests are designed to mimic the behavior of the user when interacting with the application
+- for example when it comes to workflows like creating/editing/deleting an item.
+Also the tests should verify that certain items are displayed as a user would see them
+when clicking through the UI (for example a menu entry or a pool that has been
+created during a test and the pool and its properties should be displayed in the table).
+
+Angular Unit Tests:
+
+Unit tests, as the name suggests, are tests for smaller units of the code.
+Those tests are designed for testing all kinds of Angular components (e.g. services, pipes etc.).
+They do not require a connection to the backend, hence those tests are independent of it.
+The expected data of the backend is mocked in the frontend and by using this data
+the functionality of the frontend can be tested without having to have real data from the backend.
+As previously mentioned, data is either mocked or, in a simple case, contains a static input,
+a function call and an expected static output.
+More complex examples include the state of a component (attributes of the component class),
+that define how the output changes according to the given input.
+
+Which E2E/unit tests are considered to be valid?
+................................................
+
+This is not easy to answer, but new tests that are written in the same way as already existing
+dashboard tests should generally be considered valid.
+Unit tests should focus on the component to be tested.
+This is either an Angular component, directive, service, pipe, etc.
+
+E2E tests should focus on testing the functionality of the whole application.
+Approximately a third of the overall E2E tests should verify the correctness
+of user visible elements.
+
+How should an E2E/unit test look like?
+......................................
+
+Unit tests should focus on the described purpose
+and shouldn't try to test other things in the same `it` block.
+
+E2E tests should contain a description that either verifies
+the correctness of a user visible element or a complete process
+like for example the creation/validation/deletion of a pool.
+
+What should an E2E/unit test cover?
+...................................
+
+E2E tests should mostly, but not exclusively, cover interaction with the backend.
+This way the interaction with the backend is utilized to write integration tests.
+
+A unit test should mostly cover critical or complex functionality
+of a component (Angular Components, Services, Pipes, Directives, etc).
+
+What should an E2E/unit test NOT cover?
+.......................................
+
+Avoid duplicate testing: do not write E2E tests for what's already
+been covered as frontend-unit tests and vice versa.
+It may not be possible to completely avoid an overlap.
+
+Unit tests should not be used to extensively click through components and E2E tests
+shouldn't be used to extensively test a single component of Angular.
+
+Best practices/guideline
+........................
+
+As a general guideline we try to follow the 70/20/10 approach - 70% unit tests,
+20% integration tests and 10% end-to-end tests.
+For further information please refer to `this document
+<https://testing.googleblog.com/2015/04/just-say-no-to-more-end-to-end-tests.html>`__
+and the included "Testing Pyramid".
+
+Further Help
+~~~~~~~~~~~~
+
+To get more help on the Angular CLI use ``ng help`` or go check out the
+`Angular CLI
+README <https://github.com/angular/angular-cli/blob/master/README.md>`__.
+
+Example of a Generator
+~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ # Create module 'Core'
+ src/app> ng generate module core -m=app --routing
+
+ # Create module 'Auth' under module 'Core'
+ src/app/core> ng generate module auth -m=core --routing
+ or, alternatively:
+ src/app> ng generate module core/auth -m=core --routing
+
+ # Create component 'Login' under module 'Auth'
+ src/app/core/auth> ng generate component login -m=core/auth
+ or, alternatively:
+ src/app> ng generate component core/auth/login -m=core/auth
+
+Frontend Typescript Code Style Guide Recommendations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Group the imports based on its source and separate them with a blank
+line.
+
+The source groups can be either from Angular, external or internal.
+
+Example:
+
+.. code:: javascript
+
+ import { Component } from '@angular/core';
+ import { Router } from '@angular/router';
+
+ import { ToastrManager } from 'ngx-toastr';
+
+ import { Credentials } from '../../../shared/models/credentials.model';
+ import { HostService } from './services/host.service';
+
+Frontend components
+~~~~~~~~~~~~~~~~~~~
+
+There are several components that can be reused on different pages.
+This components are declared on the components module:
+`src/pybind/mgr/dashboard/frontend/src/app/shared/components`.
+
+Helper
+~~~~~~
+
+This component should be used to provide additional information to the user.
+
+Example:
+
+.. code:: html
+
+ <cd-helper>
+ Some <strong>helper</strong> html text
+ </cd-helper>
+
+Terminology and wording
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Instead of using the Ceph component names, the approach
+suggested is to use the logical/generic names (Block over RBD, Filesystem over
+CephFS, Object over RGW). Nevertheless, as Ceph-Dashboard cannot completely hide
+the Ceph internals, some Ceph-specific names might remain visible.
+
+Regarding the wording for action labels and other textual elements (form titles,
+buttons, etc.), the chosen approach is to follow `these guidelines
+<https://www.patternfly.org/styles/terminology-and-wording/#terminology-and-wording-for-action-labels>`_.
+As a rule of thumb, 'Create' and 'Delete' are the proper wording for most forms,
+instead of 'Add' and 'Remove', unless some already created item is either added
+or removed to/from a set of items (e.g.: 'Add permission' to a user vs. 'Create
+(new) permission').
+
+In order to enforce the use of this wording, a service ``ActionLabelsI18n`` has
+been created, which provides translated labels for use in UI elements.
+
+Frontend branding
+~~~~~~~~~~~~~~~~~
+
+Every vendor can customize the 'Ceph dashboard' to his needs. No matter if
+logo, HTML-Template or TypeScript, every file inside the frontend folder can be
+replaced.
+
+To replace files, open ``./frontend/angular.json`` and scroll to the section
+``fileReplacements`` inside the production configuration. Here you can add the
+files you wish to brand. We recommend to place the branded version of a file in
+the same directory as the original one and to add a ``.brand`` to the file
+name, right in front of the file extension. A ``fileReplacement`` could for
+example look like this:
+
+.. code:: javascript
+
+ {
+ "replace": "src/app/core/auth/login/login.component.html",
+ "with": "src/app/core/auth/login/login.component.brand.html"
+ }
+
+To serve or build the branded user interface run:
+
+ $ npm run start -- --prod
+
+or
+
+ $ npm run build -- --prod
+
+Unfortunately it's currently not possible to use multiple configurations when
+serving or building the UI at the same time. That means a configuration just
+for the branding ``fileReplacements`` is not an option, because you want to use
+the production configuration anyway
+(https://github.com/angular/angular-cli/issues/10612).
+Furthermore it's also not possible to use glob expressions for
+``fileReplacements``. As long as the feature hasn't been implemented, you have
+to add the file replacements manually to the angular.json file
+(https://github.com/angular/angular-cli/issues/12354).
+
+Nevertheless you should stick to the suggested naming scheme because it makes
+it easier for you to use glob expressions once it's supported in the future.
+
+To change the variable defaults or add your own ones you can overwrite them in
+``./frontend/src/styles/vendor/_variables.scss``.
+Just reassign the variable you want to change, for example ``$color-primary: teal;``
+To overwrite or extend the default CSS, you can add your own styles in
+``./frontend/src/styles/vendor/_style-overrides.scss``.
+
+UI Style Guide
+~~~~~~~~~~~~~~
+
+The style guide is created to document Ceph Dashboard standards and maintain
+consistency across the project. Its an effort to make it easier for
+contributors to process designing and deciding mockups and designs for
+Dashboard.
+
+The development environment for Ceph Dashboard has live reloading enabled so
+any changes made in UI are reflected in open browser windows. Ceph Dashboard
+uses Bootstrap as the main third-party CSS library.
+
+Avoid duplication of code. Be consistent with the existing UI by reusing
+existing SCSS declarations as much as possible.
+
+Always check for existing code similar to what you want to write.
+You should always try to keep the same look-and-feel as the existing code.
+
+Colors
+......
+
+All the colors used in Ceph Dashboard UI are listed in
+`frontend/src/styles/defaults/_bootstrap-defaults.scss`. If using new color
+always define color variables in the `_bootstrap-defaults.scss` and
+use the variable instead of hard coded color values so that changes to the
+color are reflected in similar UI elements.
+
+The main color for the Ceph Dashboard is `$primary`. The primary color is
+used in navigation components and as the `$border-color` for input components of
+form.
+
+The secondary color is `$secondary` and is the background color for Ceph
+Dashboard.
+
+Buttons
+.......
+
+Buttons are used for performing actions such as: “Submit”, “Edit, “Create" and
+“Update”.
+
+**Forms:** When using to submit forms anywhere in the Dashboard, the main action
+button should use the `cd-submit-button` component and the secondary button should
+use `cd-back-button` component. The text on the action button should be same as the
+form title and follow a title case. The text on the secondary button should be
+`Cancel`. `Perform action` button should always be on right while `Cancel`
+button should always be on left.
+
+**Modals**: The main action button should use the `cd-submit-button` component and
+the secondary button should use `cd-back-button` component. The text on the action
+button should follow a title case and correspond to the action to be performed.
+The text on the secondary button should be `Close`.
+
+**Disclosure Button:** Disclosure buttons should be used to allow users to
+display and hide additional content in the interface.
+
+**Action Button**: Use the action button to perform actions such as edit or update
+a component. All action button should have an icon corresponding to the actions they
+perform and button text should follow title case. The button color should be the
+same as the form's main button color.
+
+**Drop Down Buttons:** Use dropdown buttons to display predefined lists of
+actions. All drop down buttons have icons corresponding to the action they
+perform.
+
+Links
+.....
+
+Use text hyperlinks as navigation to guide users to a new page in the application
+or to anchor users to a section within a page. The color of the hyperlinks
+should be `$primary`.
+
+Forms
+.....
+
+Mark invalid form fields with red outline and show a meaningful error message.
+Use red as font color for message and be as specific as possible.
+`This field is required.` should be the exact error message for required fields.
+Mark valid forms with a green outline and a green tick at the end of the form.
+Sections should not have a bigger header than the parent.
+
+Modals
+......
+
+Blur any interface elements in the background to bring the modal content into
+focus. The heading of the modal should reflect the action it can perform and
+should be clearly mentioned at the top of the modal. Use `cd-back-button`
+component in the footer for closing the modal.
+
+Icons
+.....
+
+We use `Fork Awesome <https://forkaweso.me/Fork-Awesome/>`_ classes for icons.
+We have a list of used icons in `src/app/shared/enum/icons.enum.ts`, these
+should be referenced in the HTML, so its easier to change them later. When
+icons are next to text, they should be center-aligned horizontally. If icons
+are stacked, they should also be center-aligned vertically. Use small icons
+with buttons. For notifications use large icons.
+
+Navigation
+..........
+
+For local navigation use tabs. For overall navigation use expandable vertical
+navigation to collapse and expand items as needed.
+
+Alerts and notifications
+........................
+
+Default notification should have `text-info` color. Success notification should
+have `text-success` color. Failure notification should have `text-danger` color.
+
+Error Handling
+~~~~~~~~~~~~~~
+
+For handling front-end errors, there is a generic Error Component which can be
+found in ``./src/pybind/mgr/dashboard/frontend/src/app/core/error``. For
+reporting a new error, you can simply extend the ``DashboardError`` class
+in ``error.ts`` file and add specific header and message for the new error. Some
+generic error classes are already in place such as ``DashboardNotFoundError``
+and ``DashboardForbiddenError`` which can be called and reused in different
+scenarios.
+
+For example - ``throw new DashboardNotFoundError()``.
+
+I18N
+----
+
+How to extract messages from source code?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To extract the I18N messages from the templates and the TypeScript files just
+run the following command in ``src/pybind/mgr/dashboard/frontend``::
+
+ $ npm run i18n:extract
+
+This will extract all marked messages from the HTML templates first and then
+add all marked strings from the TypeScript files to the translation template.
+Since the extraction from TypeScript files is still not supported by Angular
+itself, we are using the
+`ngx-translator <https://github.com/ngx-translate/i18n-polyfill>`_ extractor to
+parse the TypeScript files.
+
+When the command ran successfully, it should have created or updated the file
+``src/locale/messages.xlf``.
+
+The file isn't tracked by git, you can just use it to start with the
+translation offline or add/update the resource files on transifex.
+
+Supported languages
+~~~~~~~~~~~~~~~~~~~
+
+All our supported languages should be registered in both exports in
+``supported-languages.enum.ts`` and have a corresponding test in
+``language-selector.component.spec.ts``.
+
+The ``SupportedLanguages`` enum will provide the list for the default language selection.
+
+Translating process
+~~~~~~~~~~~~~~~~~~~
+
+To facilitate the translation process of the dashboard we are using a web tool
+called `transifex <https://www.transifex.com/>`_.
+
+If you wish to help translating to any language just go to our `transifex
+project page <https://www.transifex.com/ceph/ceph-dashboard/>`_, join the
+project and you can start translating immediately.
+
+All translations will then be reviewed and later pushed upstream.
+
+Updating translated messages
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Any time there are new messages translated and reviewed in a specific language
+we should update the translation file upstream.
+
+To do that, check the settings in the i18n config file
+``src/pybind/mgr/dashboard/frontend/i18n.config.json``:: and make sure that the
+organization is *ceph*, the project is *ceph-dashboard* and the resource is
+the one you want to pull from and push to e.g. *Master:master*. To find a list
+of available resources visit `<https://www.transifex.com/ceph/ceph-dashboard/content/>`_.
+
+After you checked the config go to the directory ``src/pybind/mgr/dashboard/frontend`` and run::
+
+ $ npm run i18n
+
+This command will extract all marked messages from the HTML templates and
+TypeScript files. Once the source file has been created it will push it to
+transifex and pull the latest translations. It will also fill all the
+untranslated strings with the source string.
+The tool will ask you for an api token, unless you added it by running:
+
+ $ npm run i18n:token
+
+To create a transifex api token visit `<https://www.transifex.com/user/settings/api/>`_.
+
+After the command ran successfully, build the UI and check if everything is
+working as expected. You also might want to run the frontend tests.
+
+Suggestions
+~~~~~~~~~~~
+
+Strings need to start and end in the same line as the element:
+
+.. code-block:: html
+
+ <!-- avoid -->
+ <span i18n>
+ Foo
+ </span>
+
+ <!-- recommended -->
+ <span i18n>Foo</span>
+
+
+ <!-- avoid -->
+ <span i18n>
+ Foo bar baz.
+ Foo bar baz.
+ </span>
+
+ <!-- recommended -->
+ <span i18n>Foo bar baz.
+ Foo bar baz.</span>
+
+Isolated interpolations should not be translated:
+
+.. code-block:: html
+
+ <!-- avoid -->
+ <span i18n>{{ foo }}</span>
+
+ <!-- recommended -->
+ <span>{{ foo }}</span>
+
+Interpolations used in a sentence should be kept in the translation:
+
+.. code-block:: html
+
+ <!-- recommended -->
+ <span i18n>There are {{ x }} OSDs.</span>
+
+Remove elements that are outside the context of the translation:
+
+.. code-block:: html
+
+ <!-- avoid -->
+ <label i18n>
+ Profile
+ <span class="required"></span>
+ </label>
+
+ <!-- recommended -->
+ <label>
+ <ng-container i18n>Profile<ng-container>
+ <span class="required"></span>
+ </label>
+
+Keep elements that affect the sentence:
+
+.. code-block:: html
+
+ <!-- recommended -->
+ <span i18n>Profile <b>foo</b> will be removed.</span>
+
+Backend Development
+-------------------
+
+The Python backend code of this module requires a number of Python modules to be
+installed. They are listed in file ``requirements.txt``. Using `pip
+<https://pypi.python.org/pypi/pip>`_ you may install all required dependencies
+by issuing ``pip install -r requirements.txt`` in directory
+``src/pybind/mgr/dashboard``.
+
+If you're using the `ceph-dev-docker development environment
+<https://github.com/ricardoasmarques/ceph-dev-docker/>`_, simply run
+``./install_deps.sh`` from the toplevel directory to install them.
+
+Unit Testing
+~~~~~~~~~~~~
+
+In dashboard we have two different kinds of backend tests:
+
+1. Unit tests based on ``tox``
+2. API tests based on Teuthology.
+
+Unit tests based on tox
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+We included a ``tox`` configuration file that will run the unit tests under
+Python 3, as well as linting tools to guarantee the uniformity of code.
+
+You need to install ``tox`` and ``coverage`` before running it. To install the
+packages in your system, either install it via your operating system's package
+management tools, e.g. by running ``dnf install python-tox python-coverage`` on
+Fedora Linux.
+
+Alternatively, you can use Python's native package installation method::
+
+ $ pip install tox
+ $ pip install coverage
+
+To run the tests, run ``src/script/run_tox.sh`` in the dashboard directory (where
+``tox.ini`` is located)::
+
+ ## Run Python 3 tests+lint commands:
+ $ ../../../script/run_tox.sh --tox-env py3,lint,check
+
+ ## Run Python 3 arbitrary command (e.g. 1 single test):
+ $ ../../../script/run_tox.sh --tox-env py3 "" tests/test_rgw_client.py::RgwClientTest::test_ssl_verify
+
+You can also run tox instead of ``run_tox.sh``::
+
+ ## Run Python 3 tests command:
+ $ tox -e py3
+
+ ## Run Python 3 arbitrary command (e.g. 1 single test):
+ $ tox -e py3 tests/test_rgw_client.py::RgwClientTest::test_ssl_verify
+
+Python files can be automatically fixed and formatted according to PEP8
+standards by using ``run_tox.sh --tox-env fix`` or ``tox -e fix``.
+
+We also collect coverage information from the backend code when you run tests. You can check the
+coverage information provided by the tox output, or by running the following
+command after tox has finished successfully::
+
+ $ coverage html
+
+This command will create a directory ``htmlcov`` with an HTML representation of
+the code coverage of the backend.
+
+API tests based on Teuthology
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+How to run existing API tests:
+ To run the API tests against a real Ceph cluster, we leverage the Teuthology
+ framework. This has the advantage of catching bugs originated from changes in
+ the internal Ceph code.
+
+ Our ``run-backend-api-tests.sh`` script will start a ``vstart`` Ceph cluster
+ before running the Teuthology tests, and then it stops the cluster after the
+ tests are run. Of course this implies that you have built/compiled Ceph
+ previously.
+
+ Start all dashboard tests by running::
+
+ $ ./run-backend-api-tests.sh
+
+ Or, start one or multiple specific tests by specifying the test name::
+
+ $ ./run-backend-api-tests.sh tasks.mgr.dashboard.test_pool.PoolTest
+
+ Or, ``source`` the script and run the tests manually::
+
+ $ source run-backend-api-tests.sh
+ $ run_teuthology_tests [tests]...
+ $ cleanup_teuthology
+
+How to write your own tests:
+ There are two possible ways to write your own API tests:
+
+ The first is by extending one of the existing test classes in the
+ ``qa/tasks/mgr/dashboard`` directory.
+
+ The second way is by adding your own API test module if you're creating a new
+ controller for example. To do so you'll just need to add the file containing
+ your new test class to the ``qa/tasks/mgr/dashboard`` directory and implement
+ all your tests here.
+
+ .. note:: Don't forget to add the path of the newly created module to
+ ``modules`` section in ``qa/suites/rados/mgr/tasks/dashboard.yaml``.
+
+ Short example: Let's assume you created a new controller called
+ ``my_new_controller.py`` and the related test module
+ ``test_my_new_controller.py``. You'll need to add
+ ``tasks.mgr.dashboard.test_my_new_controller`` to the ``modules`` section in
+ the ``dashboard.yaml`` file.
+
+ Also, if you're removing test modules please keep in mind to remove the
+ related section. Otherwise the Teuthology test run will fail.
+
+ Please run your API tests on your dev environment (as explained above)
+ before submitting a pull request. Also make sure that a full QA run in
+ Teuthology/sepia lab (based on your changes) has completed successfully
+ before it gets merged. You don't need to schedule the QA run yourself, just
+ add the 'needs-qa' label to your pull request as soon as you think it's ready
+ for merging (e.g. make check was successful, the pull request is approved and
+ all comments have been addressed). One of the developers who has access to
+ Teuthology/the sepia lab will take care of it and report the result back to
+ you.
+
+
+How to add a new controller?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A controller is a Python class that extends from the ``BaseController`` class
+and is decorated with either the ``@Controller``, ``@ApiController`` or
+``@UiApiController`` decorators. The Python class must be stored inside a Python
+file located under the ``controllers`` directory. The Dashboard module will
+automatically load your new controller upon start.
+
+``@ApiController`` and ``@UiApiController`` are both specializations of the
+``@Controller`` decorator.
+
+The ``@ApiController`` should be used for controllers that provide an API-like
+REST interface and the ``@UiApiController`` should be used for endpoints consumed
+by the UI but that are not part of the 'public' API. For any other kinds of
+controllers the ``@Controller`` decorator should be used.
+
+A controller has a URL prefix path associated that is specified in the
+controller decorator, and all endpoints exposed by the controller will share
+the same URL prefix path.
+
+A controller's endpoint is exposed by implementing a method on the controller
+class decorated with the ``@Endpoint`` decorator.
+
+For example create a file ``ping.py`` under ``controllers`` directory with the
+following code:
+
+.. code-block:: python
+
+ from ..tools import Controller, ApiController, UiApiController, BaseController, Endpoint
+
+ @Controller('/ping')
+ class Ping(BaseController):
+ @Endpoint()
+ def hello(self):
+ return {'msg': "Hello"}
+
+ @ApiController('/ping')
+ class ApiPing(BaseController):
+ @Endpoint()
+ def hello(self):
+ return {'msg': "Hello"}
+
+ @UiApiController('/ping')
+ class UiApiPing(BaseController):
+ @Endpoint()
+ def hello(self):
+ return {'msg': "Hello"}
+
+The ``hello`` endpoint of the ``Ping`` controller can be reached by the
+following URL: https://mgr_hostname:8443/ping/hello using HTTP GET requests.
+As you can see the controller URL path ``/ping`` is concatenated to the
+method name ``hello`` to generate the endpoint's URL.
+
+In the case of the ``ApiPing`` controller, the ``hello`` endpoint can be
+reached by the following URL: https://mgr_hostname:8443/api/ping/hello using a
+HTTP GET request.
+The API controller URL path ``/ping`` is prefixed by the ``/api`` path and then
+concatenated to the method name ``hello`` to generate the endpoint's URL.
+Internally, the ``@ApiController`` is actually calling the ``@Controller``
+decorator by passing an additional decorator parameter called ``base_url``::
+
+ @ApiController('/ping') <=> @Controller('/ping', base_url="/api")
+
+``UiApiPing`` works in a similar way than the ``ApiPing``, but the URL will be
+prefixed by ``/ui-api``: https://mgr_hostname:8443/ui-api/ping/hello. ``UiApiPing`` is
+also a ``@Controller`` extension::
+
+ @UiApiController('/ping') <=> @Controller('/ping', base_url="/ui-api")
+
+The ``@Endpoint`` decorator also supports many parameters to customize the
+endpoint:
+
+* ``method="GET"``: the HTTP method allowed to access this endpoint.
+* ``path="/<method_name>"``: the URL path of the endpoint, excluding the
+ controller URL path prefix.
+* ``path_params=[]``: list of method parameter names that correspond to URL
+ path parameters. Can only be used when ``method in ['POST', 'PUT']``.
+* ``query_params=[]``: list of method parameter names that correspond to URL
+ query parameters.
+* ``json_response=True``: indicates if the endpoint response should be
+ serialized in JSON format.
+* ``proxy=False``: indicates if the endpoint should be used as a proxy.
+
+An endpoint method may have parameters declared. Depending on the HTTP method
+defined for the endpoint the method parameters might be considered either
+path parameters, query parameters, or body parameters.
+
+For ``GET`` and ``DELETE`` methods, the method's non-optional parameters are
+considered path parameters by default. Optional parameters are considered
+query parameters. By specifying the ``query_parameters`` in the endpoint
+decorator it is possible to make a non-optional parameter to be a query
+parameter.
+
+For ``POST`` and ``PUT`` methods, all method parameters are considered
+body parameters by default. To override this default, one can use the
+``path_params`` and ``query_params`` to specify which method parameters are
+path and query parameters respectively.
+Body parameters are decoded from the request body, either from a form format, or
+from a dictionary in JSON format.
+
+Let's use an example to better understand the possible ways to customize an
+endpoint:
+
+.. code-block:: python
+
+ from ..tools import Controller, BaseController, Endpoint
+
+ @Controller('/ping')
+ class Ping(BaseController):
+
+ # URL: /ping/{key}?opt1=...&opt2=...
+ @Endpoint(path="/", query_params=['opt1'])
+ def index(self, key, opt1, opt2=None):
+ """..."""
+
+ # URL: /ping/{key}?opt1=...&opt2=...
+ @Endpoint(query_params=['opt1'])
+ def __call__(self, key, opt1, opt2=None):
+ """..."""
+
+ # URL: /ping/post/{key1}/{key2}
+ @Endpoint('POST', path_params=['key1', 'key2'])
+ def post(self, key1, key2, data1, data2=None):
+ """..."""
+
+
+In the above example we see how the ``path`` option can be used to override the
+generated endpoint URL in order to not use the method's name in the URL. In the
+``index`` method we set the ``path`` to ``"/"`` to generate an endpoint that is
+accessible by the root URL of the controller.
+
+An alternative approach to generate an endpoint that is accessible through just
+the controller's path URL is by using the ``__call__`` method, as we show in
+the above example.
+
+From the third method you can see that the path parameters are collected from
+the URL by parsing the list of values separated by slashes ``/`` that come
+after the URL path ``/ping`` for ``index`` method case, and ``/ping/post`` for
+the ``post`` method case.
+
+Defining path parameters in endpoints's URLs using python methods's parameters
+is very easy but it is still a bit strict with respect to the position of these
+parameters in the URL structure.
+Sometimes we may want to explicitly define a URL scheme that
+contains path parameters mixed with static parts of the URL.
+Our controller infrastructure also supports the declaration of URL paths with
+explicit path parameters at both the controller level and method level.
+
+Consider the following example:
+
+.. code-block:: python
+
+ from ..tools import Controller, BaseController, Endpoint
+
+ @Controller('/ping/{node}/stats')
+ class Ping(BaseController):
+
+ # URL: /ping/{node}/stats/{date}/latency?unit=...
+ @Endpoint(path="/{date}/latency")
+ def latency(self, node, date, unit="ms"):
+ """ ..."""
+
+In this example we explicitly declare a path parameter ``{node}`` in the
+controller URL path, and a path parameter ``{date}`` in the ``latency``
+method. The endpoint for the ``latency`` method is then accessible through
+the URL: https://mgr_hostname:8443/ping/{node}/stats/{date}/latency .
+
+For a full set of examples on how to use the ``@Endpoint``
+decorator please check the unit test file: ``tests/test_controllers.py``.
+There you will find many examples of how to customize endpoint methods.
+
+
+Implementing Proxy Controller
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sometimes you might need to relay some requests from the Dashboard frontend
+directly to an external service.
+For that purpose we provide a decorator called ``@Proxy``.
+(As a concrete example, check the ``controllers/rgw.py`` file where we
+implemented an RGW Admin Ops proxy.)
+
+
+The ``@Proxy`` decorator is a wrapper of the ``@Endpoint`` decorator that
+already customizes the endpoint for working as a proxy.
+A proxy endpoint works by capturing the URL path that follows the controller
+URL prefix path, and does not do any decoding of the request body.
+
+Example:
+
+.. code-block:: python
+
+ from ..tools import Controller, BaseController, Proxy
+
+ @Controller('/foo/proxy')
+ class FooServiceProxy(BaseController):
+
+ @Proxy()
+ def proxy(self, path, **params):
+ """
+ if requested URL is "/foo/proxy/access/service?opt=1"
+ then path is "access/service" and params is {'opt': '1'}
+ """
+
+
+How does the RESTController work?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We also provide a simple mechanism to create REST based controllers using the
+``RESTController`` class. Any class which inherits from ``RESTController`` will,
+by default, return JSON.
+
+The ``RESTController`` is basically an additional abstraction layer which eases
+and unifies the work with collections. A collection is just an array of objects
+with a specific type. ``RESTController`` enables some default mappings of
+request types and given parameters to specific method names. This may sound
+complicated at first, but it's fairly easy. Lets have look at the following
+example:
+
+.. code-block:: python
+
+ import cherrypy
+ from ..tools import ApiController, RESTController
+
+ @ApiController('ping')
+ class Ping(RESTController):
+ def list(self):
+ return {"msg": "Hello"}
+
+ def get(self, id):
+ return self.objects[id]
+
+In this case, the ``list`` method is automatically used for all requests to
+``api/ping`` where no additional argument is given and where the request type
+is ``GET``. If the request is given an additional argument, the ID in our
+case, it won't map to ``list`` anymore but to ``get`` and return the element
+with the given ID (assuming that ``self.objects`` has been filled before). The
+same applies to other request types:
+
++--------------+------------+----------------+-------------+
+| Request type | Arguments | Method | Status Code |
++==============+============+================+=============+
+| GET | No | list | 200 |
++--------------+------------+----------------+-------------+
+| PUT | No | bulk_set | 200 |
++--------------+------------+----------------+-------------+
+| POST | No | create | 201 |
++--------------+------------+----------------+-------------+
+| DELETE | No | bulk_delete | 204 |
++--------------+------------+----------------+-------------+
+| GET | Yes | get | 200 |
++--------------+------------+----------------+-------------+
+| PUT | Yes | set | 200 |
++--------------+------------+----------------+-------------+
+| DELETE | Yes | delete | 204 |
++--------------+------------+----------------+-------------+
+
+To use a custom endpoint for the above listed methods, you can
+use ``@RESTController.MethodMap``
+
+.. code-block:: python
+
+ import cherrypy
+ from ..tools import ApiController, RESTController
+
+ @RESTController.MethodMap(version='0.1')
+ def create(self):
+ return {"msg": "Hello"}
+
+This decorator supports three parameters to customize the
+endpoint:
+
+* ``resource"``: resource id.
+* ``status=200``: set the HTTP status response code
+* ``version``: version
+
+How to use a custom API endpoint in a RESTController?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you don't have any access restriction you can use ``@Endpoint``. If you
+have set a permission scope to restrict access to your endpoints,
+``@Endpoint`` will fail, as it doesn't know which permission property should be
+used. To use a custom endpoint inside a restricted ``RESTController`` use
+``@RESTController.Collection`` instead. You can also choose
+``@RESTController.Resource`` if you have set a ``RESOURCE_ID`` in your
+``RESTController`` class.
+
+.. code-block:: python
+
+ import cherrypy
+ from ..tools import ApiController, RESTController
+
+ @ApiController('ping', Scope.Ping)
+ class Ping(RESTController):
+ RESOURCE_ID = 'ping'
+
+ @RESTController.Resource('GET')
+ def some_get_endpoint(self):
+ return {"msg": "Hello"}
+
+ @RESTController.Collection('POST')
+ def some_post_endpoint(self, **data):
+ return {"msg": data}
+
+Both decorators also support five parameters to customize the
+endpoint:
+
+* ``method="GET"``: the HTTP method allowed to access this endpoint.
+* ``path="/<method_name>"``: the URL path of the endpoint, excluding the
+ controller URL path prefix.
+* ``status=200``: set the HTTP status response code
+* ``query_params=[]``: list of method parameter names that correspond to URL
+ query parameters.
+* ``version``: version
+
+How to restrict access to a controller?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All controllers require authentication by default.
+If you require that the controller can be accessed without authentication,
+then you can add the parameter ``secure=False`` to the controller decorator.
+
+Example:
+
+.. code-block:: python
+
+ import cherrypy
+ from . import ApiController, RESTController
+
+
+ @ApiController('ping', secure=False)
+ class Ping(RESTController):
+ def list(self):
+ return {"msg": "Hello"}
+
+How to create a dedicated UI endpoint which uses the 'public' API?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sometimes we want to combine multiple calls into one single call
+to save bandwidth or for other performance reasons.
+In order to achieve that, we first have to create an ``@UiApiController`` which
+is used for endpoints consumed by the UI but that are not part of the
+'public' API. Let the ui class inherit from the REST controller class.
+Now you can use all methods from the api controller.
+
+Example:
+
+.. code-block:: python
+
+ import cherrypy
+ from . import UiApiController, ApiController, RESTController
+
+
+ @ApiController('ping', secure=False) # /api/ping
+ class Ping(RESTController):
+ def list(self):
+ return self._list()
+
+ def _list(self): # To not get in conflict with the JSON wrapper
+ return [1,2,3]
+
+
+ @UiApiController('ping', secure=False) # /ui-api/ping
+ class PingUi(Ping):
+ def list(self):
+ return self._list() + [4, 5, 6]
+
+How to access the manager module instance from a controller?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We provide the manager module instance as a global variable that can be
+imported in any module.
+
+Example:
+
+.. code-block:: python
+
+ import logging
+ import cherrypy
+ from .. import mgr
+ from ..tools import ApiController, RESTController
+
+ logger = logging.getLogger(__name__)
+
+ @ApiController('servers')
+ class Servers(RESTController):
+ def list(self):
+ logger.debug('Listing available servers')
+ return {'servers': mgr.list_servers()}
+
+
+How to write a unit test for a controller?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We provide a test helper class called ``ControllerTestCase`` to easily create
+unit tests for your controller.
+
+If we want to write a unit test for the above ``Ping`` controller, create a
+``test_ping.py`` file under the ``tests`` directory with the following code:
+
+.. code-block:: python
+
+ from .helper import ControllerTestCase
+ from .controllers.ping import Ping
+
+
+ class PingTest(ControllerTestCase):
+ @classmethod
+ def setup_test(cls):
+ cp_config = {'tools.authenticate.on': True}
+ cls.setup_controllers([Ping], cp_config=cp_config)
+
+ def test_ping(self):
+ self._get("/api/ping")
+ self.assertStatus(200)
+ self.assertJsonBody({'msg': 'Hello'})
+
+The ``ControllerTestCase`` class starts by initializing a CherryPy webserver.
+Then it will call the ``setup_test()`` class method where we can explicitly
+load the controllers that we want to test. In the above example we are only
+loading the ``Ping`` controller. We can also provide ``cp_config`` in order to
+update the controller's cherrypy config (e.g. enable authentication as shown in the example).
+
+How to update or create new dashboards in grafana?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We are using ``jsonnet`` and ``grafonnet-lib`` to write code for the grafana dashboards.
+All the dashboards are written inside ``grafana_dashboards.jsonnet`` file in the
+monitoring/grafana/dashboards/jsonnet directory.
+
+We generate the dashboard json files directly from this jsonnet file by running this
+command in the grafana/dashboards directory:
+``jsonnet -m . jsonnet/grafana_dashboards.jsonnet``.
+(For the above command to succeed we need ``jsonnet`` package installed and ``grafonnet-lib``
+directory cloned in our machine. Please refer -
+``https://grafana.github.io/grafonnet-lib/getting-started/`` in case you have some trouble.)
+
+To update an existing grafana dashboard or to create a new one, we need to update
+the ``grafana_dashboards.jsonnet`` file and generate the new/updated json files using the
+above mentioned command. For people who are not familiar with grafonnet or jsonnet implementation
+can follow this doc - ``https://grafana.github.io/grafonnet-lib/``.
+
+Example grafana dashboard in jsonnet format:
+
+To specify the grafana dashboard properties such as title, uid etc we can create a local function -
+
+::
+
+ local dashboardSchema(title, uid, time_from, refresh, schemaVersion, tags,timezone, timepicker)
+
+To add a graph panel we can spcify the graph schema in a local function such as -
+
+::
+
+ local graphPanelSchema(title, nullPointMode, stack, formatY1, formatY2, labelY1, labelY2, min, fill, datasource)
+
+and then use these functions inside the dashboard definition like -
+
+::
+
+ {
+ radosgw-sync-overview.json: //json file name to be generated
+
+ dashboardSchema(
+ 'RGW Sync Overview', 'rgw-sync-overview', 'now-1h', '15s', .., .., ..
+ )
+
+ .addPanels([
+ graphPanelSchema(
+ 'Replication (throughput) from Source Zone', 'Bps', null, .., .., ..)
+ ])
+ }
+
+The valid grafonnet-lib attributes can be found here - ``https://grafana.github.io/grafonnet-lib/api-docs/``.
+
+
+How to listen for manager notifications in a controller?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The manager notifies the modules of several types of cluster events, such
+as cluster logging event, etc...
+
+Each module has a "global" handler function called ``notify`` that the manager
+calls to notify the module. But this handler function must not block or spend
+too much time processing the event notification.
+For this reason we provide a notification queue that controllers can register
+themselves with to receive cluster notifications.
+
+The example below represents a controller that implements a very simple live
+log viewer page:
+
+.. code-block:: python
+
+ from __future__ import absolute_import
+
+ import collections
+
+ import cherrypy
+
+ from ..tools import ApiController, BaseController, NotificationQueue
+
+
+ @ApiController('livelog')
+ class LiveLog(BaseController):
+ log_buffer = collections.deque(maxlen=1000)
+
+ def __init__(self):
+ super(LiveLog, self).__init__()
+ NotificationQueue.register(self.log, 'clog')
+
+ def log(self, log_struct):
+ self.log_buffer.appendleft(log_struct)
+
+ @cherrypy.expose
+ def default(self):
+ ret = '<html><meta http-equiv="refresh" content="2" /><body>'
+ for l in self.log_buffer:
+ ret += "{}<br>".format(l)
+ ret += "</body></html>"
+ return ret
+
+As you can see above, the ``NotificationQueue`` class provides a register
+method that receives the function as its first argument, and receives the
+"notification type" as the second argument.
+You can omit the second argument of the ``register`` method, and in that case
+you are registering to listen all notifications of any type.
+
+Here is an list of notification types (these might change in the future) that
+can be used:
+
+* ``clog``: cluster log notifications
+* ``command``: notification when a command issued by ``MgrModule.send_command``
+ completes
+* ``perf_schema_update``: perf counters schema update
+* ``mon_map``: monitor map update
+* ``fs_map``: cephfs map update
+* ``osd_map``: OSD map update
+* ``service_map``: services (RGW, RBD-Mirror, etc.) map update
+* ``mon_status``: monitor status regular update
+* ``health``: health status regular update
+* ``pg_summary``: regular update of PG status information
+
+
+How to write a unit test when a controller accesses a Ceph module?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Consider the following example that implements a controller that retrieves the
+list of RBD images of the ``rbd`` pool:
+
+.. code-block:: python
+
+ import rbd
+ from .. import mgr
+ from ..tools import ApiController, RESTController
+
+
+ @ApiController('rbdimages')
+ class RbdImages(RESTController):
+ def __init__(self):
+ self.ioctx = mgr.rados.open_ioctx('rbd')
+ self.rbd = rbd.RBD()
+
+ def list(self):
+ return [{'name': n} for n in self.rbd.list(self.ioctx)]
+
+In the example above, we want to mock the return value of the ``rbd.list``
+function, so that we can test the JSON response of the controller.
+
+The unit test code will look like the following:
+
+.. code-block:: python
+
+ import mock
+ from .helper import ControllerTestCase
+
+
+ class RbdImagesTest(ControllerTestCase):
+ @mock.patch('rbd.RBD.list')
+ def test_list(self, rbd_list_mock):
+ rbd_list_mock.return_value = ['img1', 'img2']
+ self._get('/api/rbdimages')
+ self.assertJsonBody([{'name': 'img1'}, {'name': 'img2'}])
+
+
+
+How to add a new configuration setting?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to store some configuration setting for a new feature, we already
+provide an easy mechanism for you to specify/use the new config setting.
+
+For instance, if you want to add a new configuration setting to hold the
+email address of the dashboard admin, just add a setting name as a class
+attribute to the ``Options`` class in the ``settings.py`` file::
+
+ # ...
+ class Options(object):
+ # ...
+
+ ADMIN_EMAIL_ADDRESS = ('admin@admin.com', str)
+
+The value of the class attribute is a pair composed by the default value for that
+setting, and the python type of the value.
+
+By declaring the ``ADMIN_EMAIL_ADDRESS`` class attribute, when you restart the
+dashboard module, you will automatically gain two additional CLI commands to
+get and set that setting::
+
+ $ ceph dashboard get-admin-email-address
+ $ ceph dashboard set-admin-email-address <value>
+
+To access, or modify the config setting value from your Python code, either
+inside a controller or anywhere else, you just need to import the ``Settings``
+class and access it like this:
+
+.. code-block:: python
+
+ from settings import Settings
+
+ # ...
+ tmp_var = Settings.ADMIN_EMAIL_ADDRESS
+
+ # ....
+ Settings.ADMIN_EMAIL_ADDRESS = 'myemail@admin.com'
+
+The settings management implementation will make sure that if you change a
+setting value from the Python code you will see that change when accessing
+that setting from the CLI and vice-versa.
+
+
+How to run a controller read-write operation asynchronously?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some controllers might need to execute operations that alter the state of the
+Ceph cluster. These operations might take some time to execute and to maintain
+a good user experience in the Web UI, we need to run those operations
+asynchronously and return immediately to frontend some information that the
+operations are running in the background.
+
+To help in the development of the above scenario we added the support for
+asynchronous tasks. To trigger the execution of an asynchronous task we must
+use the following class method of the ``TaskManager`` class::
+
+ from ..tools import TaskManager
+ # ...
+ TaskManager.run(name, metadata, func, args, kwargs)
+
+* ``name`` is a string that can be used to group tasks. For instance
+ for RBD image creation tasks we could specify ``"rbd/create"`` as the
+ name, or similarly ``"rbd/remove"`` for RBD image removal tasks.
+
+* ``metadata`` is a dictionary where we can store key-value pairs that
+ characterize the task. For instance, when creating a task for creating
+ RBD images we can specify the metadata argument as
+ ``{'pool_name': "rbd", image_name': "test-img"}``.
+
+* ``func`` is the python function that implements the operation code, which
+ will be executed asynchronously.
+
+* ``args`` and ``kwargs`` are the positional and named arguments that will be
+ passed to ``func`` when the task manager starts its execution.
+
+The ``TaskManager.run`` method triggers the asynchronous execution of function
+``func`` and returns a ``Task`` object.
+The ``Task`` provides the public method ``Task.wait(timeout)``, which can be
+used to wait for the task to complete up to a timeout defined in seconds and
+provided as an argument. If no argument is provided the ``wait`` method
+blocks until the task is finished.
+
+The ``Task.wait`` is very useful for tasks that usually are fast to execute but
+that sometimes may take a long time to run.
+The return value of the ``Task.wait`` method is a pair ``(state, value)``
+where ``state`` is a string with following possible values:
+
+* ``VALUE_DONE = "done"``
+* ``VALUE_EXECUTING = "executing"``
+
+The ``value`` will store the result of the execution of function ``func`` if
+``state == VALUE_DONE``. If ``state == VALUE_EXECUTING`` then
+``value == None``.
+
+The pair ``(name, metadata)`` should unequivocally identify the task being
+run, which means that if you try to trigger a new task that matches the same
+``(name, metadata)`` pair of the currently running task, then the new task
+is not created and you get the task object of the current running task.
+
+For instance, consider the following example:
+
+.. code-block:: python
+
+ task1 = TaskManager.run("dummy/task", {'attr': 2}, func)
+ task2 = TaskManager.run("dummy/task", {'attr': 2}, func)
+
+If the second call to ``TaskManager.run`` executes while the first task is
+still executing then it will return the same task object:
+``assert task1 == task2``.
+
+
+How to get the list of executing and finished asynchronous tasks?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The list of executing and finished tasks is included in the ``Summary``
+controller, which is already polled every 5 seconds by the dashboard frontend.
+But we also provide a dedicated controller to get the same list of executing
+and finished tasks.
+
+The ``Task`` controller exposes the ``/api/task`` endpoint that returns the
+list of executing and finished tasks. This endpoint accepts the ``name``
+parameter that accepts a glob expression as its value.
+For instance, an HTTP GET request of the URL ``/api/task?name=rbd/*``
+will return all executing and finished tasks which name starts with ``rbd/``.
+
+To prevent the finished tasks list from growing unbounded, we will always
+maintain the 10 most recent finished tasks, and the remaining older finished
+tasks will be removed when reaching a TTL of 1 minute. The TTL is calculated
+using the timestamp when the task finished its execution. After a minute, when
+the finished task information is retrieved, either by the summary controller or
+by the task controller, it is automatically deleted from the list and it will
+not be included in further task queries.
+
+Each executing task is represented by the following dictionary::
+
+ {
+ 'name': "name", # str
+ 'metadata': { }, # dict
+ 'begin_time': "2018-03-14T15:31:38.423605Z", # str (ISO 8601 format)
+ 'progress': 0 # int (percentage)
+ }
+
+Each finished task is represented by the following dictionary::
+
+ {
+ 'name': "name", # str
+ 'metadata': { }, # dict
+ 'begin_time': "2018-03-14T15:31:38.423605Z", # str (ISO 8601 format)
+ 'end_time': "2018-03-14T15:31:39.423605Z", # str (ISO 8601 format)
+ 'duration': 0.0, # float
+ 'progress': 0 # int (percentage)
+ 'success': True, # bool
+ 'ret_value': None, # object, populated only if 'success' == True
+ 'exception': None, # str, populated only if 'success' == False
+ }
+
+
+How to use asynchronous APIs with asynchronous tasks?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``TaskManager.run`` method as described in a previous section, is well
+suited for calling blocking functions, as it runs the function inside a newly
+created thread. But sometimes we want to call some function of an API that is
+already asynchronous by nature.
+
+For these cases we want to avoid creating a new thread for just running a
+non-blocking function, and want to leverage the asynchronous nature of the
+function. The ``TaskManager.run`` is already prepared to be used with
+non-blocking functions by passing an object of the type ``TaskExecutor`` as an
+additional parameter called ``executor``. The full method signature of
+``TaskManager.run``::
+
+ TaskManager.run(name, metadata, func, args=None, kwargs=None, executor=None)
+
+
+The ``TaskExecutor`` class is responsible for code that executes a given task
+function, and defines three methods that can be overridden by
+subclasses::
+
+ def init(self, task)
+ def start(self)
+ def finish(self, ret_value, exception)
+
+The ``init`` method is called before the running the task function, and
+receives the task object (of class ``Task``).
+
+The ``start`` method runs the task function. The default implementation is to
+run the task function in the current thread context.
+
+The ``finish`` method should be called when the task function finishes with
+either the ``ret_value`` populated with the result of the execution, or with
+an exception object in the case that execution raised an exception.
+
+To leverage the asynchronous nature of a non-blocking function, the developer
+should implement a custom executor by creating a subclass of the
+``TaskExecutor`` class, and provide an instance of the custom executor class
+as the ``executor`` parameter of the ``TaskManager.run``.
+
+To better understand the expressive power of executors, we write a full example
+of use a custom executor to execute the ``MgrModule.send_command`` asynchronous
+function:
+
+.. code-block:: python
+
+ import json
+ from mgr_module import CommandResult
+ from .. import mgr
+ from ..tools import ApiController, RESTController, NotificationQueue, \
+ TaskManager, TaskExecutor
+
+
+ class SendCommandExecutor(TaskExecutor):
+ def __init__(self):
+ super(SendCommandExecutor, self).__init__()
+ self.tag = None
+ self.result = None
+
+ def init(self, task):
+ super(SendCommandExecutor, self).init(task)
+
+ # we need to listen for 'command' events to know when the command
+ # finishes
+ NotificationQueue.register(self._handler, 'command')
+
+ # store the CommandResult object to retrieve the results
+ self.result = self.task.fn_args[0]
+ if len(self.task.fn_args) > 4:
+ # the user specified a tag for the command, so let's use it
+ self.tag = self.task.fn_args[4]
+ else:
+ # let's generate a unique tag for the command
+ self.tag = 'send_command_{}'.format(id(self))
+ self.task.fn_args.append(self.tag)
+
+ def _handler(self, data):
+ if data == self.tag:
+ # the command has finished, notifying the task with the result
+ self.finish(self.result.wait(), None)
+ # deregister listener to avoid memory leaks
+ NotificationQueue.deregister(self._handler, 'command')
+
+
+ @ApiController('test')
+ class Test(RESTController):
+
+ def _run_task(self, osd_id):
+ task = TaskManager.run("test/task", {}, mgr.send_command,
+ [CommandResult(''), 'osd', osd_id,
+ json.dumps({'prefix': 'perf histogram dump'})],
+ executor=SendCommandExecutor())
+ return task.wait(1.0)
+
+ def get(self, osd_id):
+ status, value = self._run_task(osd_id)
+ return {'status': status, 'value': value}
+
+
+The above ``SendCommandExecutor`` executor class can be used for any call to
+``MgrModule.send_command``. This means that we should need just one custom
+executor class implementation for each non-blocking API that we use in our
+controllers.
+
+The default executor, used when no executor object is passed to
+``TaskManager.run``, is the ``ThreadedExecutor``. You can check its
+implementation in the ``tools.py`` file.
+
+
+How to update the execution progress of an asynchronous task?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The asynchronous tasks infrastructure provides support for updating the
+execution progress of an executing task.
+The progress can be updated from within the code the task is executing, which
+usually is the place where we have the progress information available.
+
+To update the progress from within the task code, the ``TaskManager`` class
+provides a method to retrieve the current task object::
+
+ TaskManager.current_task()
+
+The above method is only available when using the default executor
+``ThreadedExecutor`` for executing the task.
+The ``current_task()`` method returns the current ``Task`` object. The
+``Task`` object provides two public methods to update the execution progress
+value: the ``set_progress(percentage)``, and the ``inc_progress(delta)``
+methods.
+
+The ``set_progress`` method receives as argument an integer value representing
+the absolute percentage that we want to set to the task.
+
+The ``inc_progress`` method receives as argument an integer value representing
+the delta we want to increment to the current execution progress percentage.
+
+Take the following example of a controller that triggers a new task and
+updates its progress:
+
+.. code-block:: python
+
+ from __future__ import absolute_import
+ import random
+ import time
+ import cherrypy
+ from ..tools import TaskManager, ApiController, BaseController
+
+
+ @ApiController('dummy_task')
+ class DummyTask(BaseController):
+ def _dummy(self):
+ top = random.randrange(100)
+ for i in range(top):
+ TaskManager.current_task().set_progress(i*100/top)
+ # or TaskManager.current_task().inc_progress(100/top)
+ time.sleep(1)
+ return "finished"
+
+ @cherrypy.expose
+ @cherrypy.tools.json_out()
+ def default(self):
+ task = TaskManager.run("dummy/task", {}, self._dummy)
+ return task.wait(5) # wait for five seconds
+
+
+How to deal with asynchronous tasks in the front-end?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All executing and most recently finished asynchronous tasks are displayed on
+"Background-Tasks" and if finished on "Recent-Notifications" in the menu bar.
+For each task a operation name for three states (running, success and failure),
+a function that tells who is involved and error descriptions, if any, have to
+be provided. This can be achieved by appending
+``TaskManagerMessageService.messages``. This has to be done to achieve
+consistency among all tasks and states.
+
+Operation Object
+ Ensures consistency among all tasks. It consists of three verbs for each
+ different state f.e.
+ ``{running: 'Creating', failure: 'create', success: 'Created'}``.
+
+#. Put running operations in present participle f.e. ``'Updating'``.
+#. Failed messages always start with ``'Failed to '`` and should be continued
+ with the operation in present tense f.e. ``'update'``.
+#. Put successful operations in past tense f.e. ``'Updated'``.
+
+Involves Function
+ Ensures consistency among all messages of a task, it resembles who's
+ involved by the operation. It's a function that returns a string which
+ takes the metadata from the task to return f.e.
+ ``"RBD 'somePool/someImage'"``.
+
+Both combined create the following messages:
+
+* Failure => ``"Failed to create RBD 'somePool/someImage'"``
+* Running => ``"Creating RBD 'somePool/someImage'"``
+* Success => ``"Created RBD 'somePool/someImage'"``
+
+For automatic task handling use ``TaskWrapperService.wrapTaskAroundCall``.
+
+If for some reason ``wrapTaskAroundCall`` is not working for you,
+you have to subscribe to your asynchronous task manually through
+``TaskManagerService.subscribe``, and provide it with a callback,
+in case of a success to notify the user. A notification can
+be triggered with ``NotificationService.notifyTask``. It will use
+``TaskManagerMessageService.messages`` to display a message based on the state
+of a task.
+
+Notifications of API errors are handled by ``ApiInterceptorService``.
+
+Usage example:
+
+.. code-block:: javascript
+
+ export class TaskManagerMessageService {
+ // ...
+ messages = {
+ // Messages for task 'rbd/create'
+ 'rbd/create': new TaskManagerMessage(
+ // Message prefixes
+ ['create', 'Creating', 'Created'],
+ // Message suffix
+ (metadata) => `RBD '${metadata.pool_name}/${metadata.image_name}'`,
+ (metadata) => ({
+ // Error code and description
+ '17': `Name is already used by RBD '${metadata.pool_name}/${
+ metadata.image_name}'.`
+ })
+ ),
+ // ...
+ };
+ // ...
+ }
+
+ export class RBDFormComponent {
+ // ...
+ createAction() {
+ const request = this.createRequest();
+ // Subscribes to 'call' with submitted 'task' and handles notifications
+ return this.taskWrapper.wrapTaskAroundCall({
+ task: new FinishedTask('rbd/create', {
+ pool_name: request.pool_name,
+ image_name: request.name
+ }),
+ call: this.rbdService.create(request)
+ });
+ }
+ // ...
+ }
+
+
+REST API documentation
+~~~~~~~~~~~~~~~~~~~~~~
+Ceph-Dashboard provides two types of documentation for the **Ceph RESTful API**:
+
+* **Static documentation**: available at :ref:`mgr-ceph-api`. This comes from a versioned specification located at ``src/pybind/mgr/dashboard/openapi.yaml``.
+* **Interactive documentation**: available from a running Ceph-Dashboard instance (top-right ``?`` icon > API Docs).
+
+If changes are made to the ``controllers/`` directory, it's very likely that
+they will result in changes to the generated OpenAPI specification. For that
+reason, a checker has been implemented to block unintended changes. This check
+is automatically triggered by the Pull Request CI (``make check``) and can be
+also manually invoked: ``tox -e openapi-check``.
+
+If that checker failed, it means that the current Pull Request is modifying the
+Ceph API and therefore:
+
+#. The versioned OpenAPI specification should be updated explicitly: ``tox -e openapi-fix``.
+#. The team @ceph/api will be requested for reviews (this is automated via Github CODEOWNERS), in order to asses the impact of changes.
+
+Additionally, Sphinx documentation can be generated from the OpenAPI
+specification with ``tox -e openapi-doc``.
+
+The Ceph RESTful OpenAPI specification is dynamically generated from the
+``Controllers`` in ``controllers/`` directory. However, by default it is not
+very detailed, so there are two decorators that can and should be used to add
+more information:
+
+* ``@EndpointDoc()`` for documentation of endpoints. It has four optional arguments
+ (explained below): ``description``, ``group``, ``parameters`` and
+ ``responses``.
+* ``@ControllerDoc()`` for documentation of controller or group associated with
+ the endpoints. It only takes the two first arguments: ``description`` and
+ ``group``.
+
+
+``description``: A a string with a short (1-2 sentences) description of the object.
+
+
+``group``: By default, an endpoint is grouped together with other endpoints
+within the same controller class. ``group`` is a string that can be used to
+assign an endpoint or all endpoints in a class to another controller or a
+conceived group name.
+
+
+``parameters``: A dict used to describe path, query or request body parameters.
+By default, all parameters for an endpoint are listed on the Swagger UI page,
+including information of whether the parameter is optional/required and default
+values. However, there will be no description of the parameter and the parameter
+type will only be displayed in some cases.
+When adding information, each parameters should be described as in the example
+below. Note that the parameter type should be expressed as a built-in python
+type and not as a string. Allowed values are ``str``, ``int``, ``bool``, ``float``.
+
+.. code-block:: python
+
+ @EndpointDoc(parameters={'my_string': (str, 'Description of my_string')})
+ def method(my_string): pass
+
+For body parameters, more complex cases are possible. If the parameter is a
+dictionary, the type should be replaced with a ``dict`` containing its nested
+parameters. When describing nested parameters, the same format as other
+parameters is used. However, all nested parameters are set as required by default.
+If the nested parameter is optional this must be specified as for ``item2`` in
+the example below. If a nested parameters is set to optional, it is also
+possible to specify the default value (this will not be provided automatically
+for nested parameters).
+
+.. code-block:: python
+
+ @EndpointDoc(parameters={
+ 'my_dictionary': ({
+ 'item1': (str, 'Description of item1'),
+ 'item2': (str, 'Description of item2', True), # item2 is optional
+ 'item3': (str, 'Description of item3', True, 'foo'), # item3 is optional with 'foo' as default value
+ }, 'Description of my_dictionary')})
+ def method(my_dictionary): pass
+
+If the parameter is a ``list`` of primitive types, the type should be
+surrounded with square brackets.
+
+.. code-block:: python
+
+ @EndpointDoc(parameters={'my_list': ([int], 'Description of my_list')})
+ def method(my_list): pass
+
+If the parameter is a ``list`` with nested parameters, the nested parameters
+should be placed in a dictionary and surrounded with square brackets.
+
+.. code-block:: python
+
+ @EndpointDoc(parameters={
+ 'my_list': ([{
+ 'list_item': (str, 'Description of list_item'),
+ 'list_item2': (str, 'Description of list_item2')
+ }], 'Description of my_list')})
+ def method(my_list): pass
+
+
+``responses``: A dict used for describing responses. Rules for describing
+responses are the same as for request body parameters, with one difference:
+responses also needs to be assigned to the related response code as in the
+example below:
+
+.. code-block:: python
+
+ @EndpointDoc(responses={
+ '400':{'my_response': (str, 'Description of my_response')}})
+ def method(): pass
+
+
+Error Handling in Python
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Good error handling is a key requirement in creating a good user experience
+and providing a good API.
+
+Dashboard code should not duplicate C++ code. Thus, if error handling in C++
+is sufficient to provide good feedback, a new wrapper to catch these errors
+is not necessary. On the other hand, input validation is the best place to
+catch errors and generate the best error messages. If required, generate
+errors as soon as possible.
+
+The backend provides few standard ways of returning errors.
+
+First, there is a generic Internal Server Error::
+
+ Status Code: 500
+ {
+ "version": <cherrypy version, e.g. 13.1.0>,
+ "detail": "The server encountered an unexpected condition which prevented it from fulfilling the request.",
+ }
+
+
+For errors generated by the backend, we provide a standard error
+format::
+
+ Status Code: 400
+ {
+ "detail": str(e), # E.g. "[errno -42] <some error message>"
+ "component": "rbd", # this can be null to represent a global error code
+ "code": "3", # Or a error name, e.g. "code": "some_error_key"
+ }
+
+
+In case, the API Endpoints uses @ViewCache to temporarily cache results,
+the error looks like so::
+
+ Status Code 400
+ {
+ "detail": str(e), # E.g. "[errno -42] <some error message>"
+ "component": "rbd", # this can be null to represent a global error code
+ "code": "3", # Or a error name, e.g. "code": "some_error_key"
+ 'status': 3, # Indicating the @ViewCache error status
+ }
+
+In case, the API Endpoints uses a task the error looks like so::
+
+ Status Code 400
+ {
+ "detail": str(e), # E.g. "[errno -42] <some error message>"
+ "component": "rbd", # this can be null to represent a global error code
+ "code": "3", # Or a error name, e.g. "code": "some_error_key"
+ "task": { # Information about the task itself
+ "name": "taskname",
+ "metadata": {...}
+ }
+ }
+
+
+Our WebUI should show errors generated by the API to the user. Especially
+field-related errors in wizards and dialogs or show non-intrusive notifications.
+
+Handling exceptions in Python should be an exception. In general, we
+should have few exception handlers in our project. Per default, propagate
+errors to the API, as it will take care of all exceptions anyway. In general,
+log the exception by adding ``logger.exception()`` with a description to the
+handler.
+
+We need to distinguish between user errors from internal errors and
+programming errors. Using different exception types will ease the
+task for the API layer and for the user interface:
+
+Standard Python errors, like ``SystemError``, ``ValueError`` or ``KeyError``
+will end up as internal server errors in the API.
+
+In general, do not ``return`` error responses in the REST API. They will be
+returned by the error handler. Instead, raise the appropriate exception.
+
+Plug-ins
+~~~~~~~~
+
+New functionality can be provided by means of a plug-in architecture. Among the
+benefits this approach brings in, loosely coupled development is one of the most
+notable. As the Ceph Dashboard grows in feature richness, its code-base becomes
+more and more complex. The hook-based nature of a plug-in architecture allows to
+extend functionality in a controlled manner, and isolate the scope of the
+changes.
+
+Ceph Dashboard relies on `Pluggy <https://pluggy.readthedocs.io>`_ to provide
+for plug-ing support. On top of pluggy, an interface-based approach has been
+implemented, with some safety checks (method override and abstract method
+checks).
+
+In order to create a new plugin, the following steps are required:
+
+#. Add a new file under ``src/pybind/mgr/dashboard/plugins``.
+#. Import the ``PLUGIN_MANAGER`` instance and the ``Interfaces``.
+#. Create a class extending the desired interfaces. The plug-in library will
+ check if all the methods of the interfaces have been properly overridden.
+#. Register the plugin in the ``PLUGIN_MANAGER`` instance.
+#. Import the plug-in from within the Ceph Dashboard ``module.py`` (currently no
+ dynamic loading is implemented).
+
+The available Mixins (helpers) are:
+
+- ``CanMgr``: provides the plug-in with access to the ``mgr`` instance under ``self.mgr``.
+
+The available Interfaces are:
+
+- ``Initializable``: requires overriding ``init()`` hook. This method is run at
+ the very beginning of the dashboard module, right after all imports have been
+ performed.
+- ``Setupable``: requires overriding ``setup()`` hook. This method is run in the
+ Ceph Dashboard ``serve()`` method, right after CherryPy has been configured,
+ but before it is started. It's a placeholder for the plug-in initialization
+ logic.
+- ``HasOptions``: requires overriding ``get_options()`` hook by returning a list
+ of ``Options()``. The options returned here are added to the
+ ``MODULE_OPTIONS``.
+- ``HasCommands``: requires overriding ``register_commands()`` hook by defining
+ the commands the plug-in can handle and decorating them with ``@CLICommand``.
+ The commands can be optionally returned, so that they can be invoked
+ externally (which makes unit testing easier).
+- ``HasControllers``: requires overriding ``get_controllers()`` hook by defining
+ and returning the controllers as usual.
+- ``FilterRequest.BeforeHandler``: requires overriding
+ ``filter_request_before_handler()`` hook. This method receives a
+ ``cherrypy.request`` object for processing. A usual implementation of this
+ method will allow some requests to pass or will raise a ``cherrypy.HTTPError``
+ based on the ``request`` metadata and other conditions.
+
+New interfaces and hooks should be added as soon as they are required to
+implement new functionality. The above list only comprises the hooks needed for
+the existing plugins.
+
+A sample plugin implementation would look like this:
+
+.. code-block:: python
+
+ # src/pybind/mgr/dashboard/plugins/mute.py
+
+ from . import PLUGIN_MANAGER as PM
+ from . import interfaces as I
+
+ from mgr_module import CLICommand, Option
+ import cherrypy
+
+ @PM.add_plugin
+ class Mute(I.CanMgr, I.Setupable, I.HasOptions, I.HasCommands,
+ I.FilterRequest.BeforeHandler, I.HasControllers):
+ @PM.add_hook
+ def get_options(self):
+ return [Option('mute', default=False, type='bool')]
+
+ @PM.add_hook
+ def setup(self):
+ self.mute = self.mgr.get_module_option('mute')
+
+ @PM.add_hook
+ def register_commands(self):
+ @CLICommand("dashboard mute")
+ def _(mgr):
+ self.mute = True
+ self.mgr.set_module_option('mute', True)
+ return 0
+
+ @PM.add_hook
+ def filter_request_before_handler(self, request):
+ if self.mute:
+ raise cherrypy.HTTPError(500, "I'm muted :-x")
+
+ @PM.add_hook
+ def get_controllers(self):
+ from ..controllers import ApiController, RESTController
+
+ @ApiController('/mute')
+ class MuteController(RESTController):
+ def get(_):
+ return self.mute
+
+ return [MuteController]
+
+
+Additionally, a helper for creating plugins ``SimplePlugin`` is provided. It
+facilitates the basic tasks (Options, Commands, and common Mixins). The previous
+plugin could be rewritten like this:
+
+.. code-block:: python
+
+ from . import PLUGIN_MANAGER as PM
+ from . import interfaces as I
+ from .plugin import SimplePlugin as SP
+
+ import cherrypy
+
+ @PM.add_plugin
+ class Mute(SP, I.Setupable, I.FilterRequest.BeforeHandler, I.HasControllers):
+ OPTIONS = [
+ SP.Option('mute', default=False, type='bool')
+ ]
+
+ def shut_up(self):
+ self.set_option('mute', True)
+ self.mute = True
+ return 0
+
+ COMMANDS = [
+ SP.Command("dashboard mute", handler=shut_up)
+ ]
+
+ @PM.add_hook
+ def setup(self):
+ self.mute = self.get_option('mute')
+
+ @PM.add_hook
+ def filter_request_before_handler(self, request):
+ if self.mute:
+ raise cherrypy.HTTPError(500, "I'm muted :-x")
+
+ @PM.add_hook
+ def get_controllers(self):
+ from ..controllers import ApiController, RESTController
+
+ @ApiController('/mute')
+ class MuteController(RESTController):
+ def get(_):
+ return self.mute
+
+ return [MuteController]
diff --git a/doc/dev/developer_guide/essentials.rst b/doc/dev/developer_guide/essentials.rst
new file mode 100644
index 000000000..2fe7a13cd
--- /dev/null
+++ b/doc/dev/developer_guide/essentials.rst
@@ -0,0 +1,338 @@
+Essentials (tl;dr)
+==================
+
+This chapter presents essential information that every Ceph developer needs
+to know.
+
+Leads
+-----
+
+The Ceph project is led by Sage Weil. In addition, each major project
+component has its own lead. The following table shows all the leads and
+their nicks on `GitHub`_:
+
+.. _github: https://github.com/
+
+========= ================ =============
+Scope Lead GitHub nick
+========= ================ =============
+Ceph Sage Weil liewegas
+RADOS Neha Ojha neha-ojha
+RGW Yehuda Sadeh yehudasa
+RGW Matt Benjamin mattbenjamin
+RBD Jason Dillaman dillaman
+CephFS Patrick Donnelly batrick
+Dashboard Lenz Grimmer LenzGr
+MON Joao Luis jecluis
+Build/Ops Ken Dreyer ktdreyer
+Docs Zac Dover zdover23
+========= ================ =============
+
+The Ceph-specific acronyms in the table are explained in
+:doc:`/architecture`.
+
+History
+-------
+
+See the `History chapter of the Wikipedia article`_.
+
+.. _`History chapter of the Wikipedia article`: https://en.wikipedia.org/wiki/Ceph_%28software%29#History
+
+Licensing
+---------
+
+Ceph is free software.
+
+Unless stated otherwise, the Ceph source code is distributed under the
+terms of the LGPL2.1 or LGPL3.0. For full details, see the file
+`COPYING`_ in the top-level directory of the source-code tree.
+
+.. _`COPYING`:
+ https://github.com/ceph/ceph/blob/master/COPYING
+
+Source code repositories
+------------------------
+
+The source code of Ceph lives on `GitHub`_ in a number of repositories below
+the `Ceph "organization"`_.
+
+.. _`Ceph "organization"`: https://github.com/ceph
+
+A working knowledge of git_ is essential to make a meaningful contribution to the project as a developer.
+
+.. _git: https://git-scm.com/doc
+
+Although the `Ceph "organization"`_ includes several software repositories,
+this document covers only one: https://github.com/ceph/ceph.
+
+Redmine issue tracker
+---------------------
+
+Although `GitHub`_ is used for code, Ceph-related issues (Bugs, Features,
+Backports, Documentation, etc.) are tracked at http://tracker.ceph.com,
+which is powered by `Redmine`_.
+
+.. _Redmine: http://www.redmine.org
+
+The tracker has a Ceph project with a number of subprojects loosely
+corresponding to the various architectural components (see
+:doc:`/architecture`).
+
+Mere `registration`_ in the tracker automatically grants permissions
+sufficient to open new issues and comment on existing ones.
+
+.. _registration: http://tracker.ceph.com/account/register
+
+To report a bug or propose a new feature, `jump to the Ceph project`_ and
+click on `New issue`_.
+
+.. _`jump to the Ceph project`: http://tracker.ceph.com/projects/ceph
+.. _`New issue`: http://tracker.ceph.com/projects/ceph/issues/new
+
+.. _mailing-list:
+
+Mailing lists
+-------------
+
+Ceph Development Mailing List
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The ``dev@ceph.io`` list is for discussion about the development of Ceph,
+its interoperability with other technology, and the operations of the
+project itself.
+
+The email discussion list for Ceph development is open to all. Subscribe by
+sending a message to ``dev-request@ceph.io`` with the following line in the
+body of the message::
+
+ subscribe ceph-devel
+
+
+Ceph Client Patch Review Mailing List
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The ``ceph-devel@vger.kernel.org`` list is for discussion and patch review
+for the Linux kernel Ceph client component. Note that this list used to
+be an all-encompassing list for developers. When searching the archives,
+remember that this list contains the generic devel-ceph archives before mid-2018.
+
+Subscribe to the list covering the Linux kernel Ceph client component by sending
+a message to ``majordomo@vger.kernel.org`` with the following line in the body
+of the message::
+
+ subscribe ceph-devel
+
+
+Other Ceph Mailing Lists
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are also `other Ceph-related mailing lists`_.
+
+.. _`other Ceph-related mailing lists`: https://ceph.com/irc/
+
+.. _irc:
+
+
+IRC
+---
+
+In addition to mailing lists, the Ceph community also communicates in real time
+using `Internet Relay Chat`_.
+
+.. _`Internet Relay Chat`: http://www.irchelp.org/
+
+The Ceph community gathers in the #ceph channel of the Open and Free Technology
+Community (OFTC) IRC network.
+
+Created in 1988, Internet Relay Chat (IRC) is a relay-based, real-time chat
+protocol. It is mainly designed for group (many-to-many) communication in
+discussion forums called channels, but also allows one-to-one communication via
+private message. On IRC you can talk to many other members using Ceph, on
+topics ranging from idle chit-chat to support questions. Though a channel might
+have many people in it at any one time, they might not always be at their
+keyboard; so if no-one responds, just wait around and someone will hopefully
+answer soon enough.
+
+Registration
+^^^^^^^^^^^^
+
+If you intend to use the IRC service on a continued basis, you are advised to
+register an account. Registering gives you a unique IRC identity and allows you
+to access channels where unregistered users have been locked out for technical
+reasons.
+
+See ``the official OFTC (Open and Free Technology Community) documentation's
+registration instructions
+<https://www.oftc.net/Services/#register-your-account>`` to learn how to
+register your IRC account.
+
+Channels
+~~~~~~~~
+
+To connect to the OFTC IRC network, download an IRC client and configure it to
+connect to ``irc.oftc.net``. Then join one or more of the channels. Discussions
+inside #ceph are logged and archives are available online.
+
+Here are the real-time discussion channels for the Ceph community:
+
+ - #ceph
+ - #ceph-devel
+ - #cephfs
+ - #ceph-dashboard
+ - #ceph-orchestrators
+ - #sepia
+
+.. _submitting-patches:
+
+Submitting patches
+------------------
+
+The canonical instructions for submitting patches are contained in the
+file `CONTRIBUTING.rst`_ in the top-level directory of the source-code
+tree. There may be some overlap between this guide and that file.
+
+.. _`CONTRIBUTING.rst`:
+ https://github.com/ceph/ceph/blob/main/CONTRIBUTING.rst
+
+All newcomers are encouraged to read that file carefully.
+
+Building from source
+--------------------
+
+See instructions at :doc:`/install/build-ceph`.
+
+Using ccache to speed up local builds
+-------------------------------------
+`ccache`_ can make the process of rebuilding the ceph source tree faster.
+
+Before you use `ccache`_ to speed up your rebuilds of the ceph source tree,
+make sure that your source tree is clean and will produce no build failures.
+When you have a clean source tree, you can confidently use `ccache`_, secure in
+the knowledge that you're not using a dirty tree.
+
+Old build artifacts can cause build failures. You might introduce these
+artifacts unknowingly when switching from one branch to another. If you see
+build errors when you attempt a local build, follow the procedure below to
+clean your source tree.
+
+Cleaning the Source Tree
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. prompt:: bash $
+
+ make clean
+
+.. note:: The following commands will remove everything in the source tree
+ that isn't tracked by git. Make sure to back up your log files
+ and configuration options before running these commands.
+
+.. prompt:: bash $
+
+ git clean -fdx; git submodule foreach git clean -fdx
+
+Building Ceph with ccache
+^^^^^^^^^^^^^^^^^^^^^^^^^
+``ccache`` is available as a package in most distros. To build ceph with
+ccache, run the following command.
+
+.. prompt:: bash $
+
+ cmake -DWITH_CCACHE=ON ..
+
+Using ccache to Speed Up Build Times
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+``ccache`` can be used for speeding up all builds of the system. For more
+details, refer to the `run modes`_ section of the ccache manual. The default
+settings of ``ccache`` can be displayed with the ``ccache -s`` command.
+
+.. note:: We recommend overriding the ``max_size``. The default is 10G.
+ Use a larger value, like 25G. Refer to the `configuration`_ section
+ of the ccache manual for more information.
+
+To further increase the cache hit rate and reduce compile times in a
+development environment, set the version information and build timestamps to
+fixed values. This makes it unnecessary to rebuild the binaries that contain
+this information.
+
+This can be achieved by adding the following settings to the ``ccache``
+configuration file ``ccache.conf``::
+
+ sloppiness = time_macros
+ run_second_cpp = true
+
+Now, set the environment variable ``SOURCE_DATE_EPOCH`` to a fixed value (a
+UNIX timestamp) and set ``ENABLE_GIT_VERSION`` to ``OFF`` when running
+``cmake``:
+
+.. prompt:: bash $
+
+ export SOURCE_DATE_EPOCH=946684800
+ cmake -DWITH_CCACHE=ON -DENABLE_GIT_VERSION=OFF ..
+
+.. note:: Binaries produced with these build options are not suitable for
+ production or debugging purposes, as they do not contain the correct build
+ time and git version information.
+
+.. _`ccache`: https://ccache.samba.org/
+.. _`run modes`: https://ccache.samba.org/manual.html#_run_modes
+.. _`configuration`: https://ccache.samba.org/manual.html#_configuration
+
+Development-mode cluster
+------------------------
+
+See :doc:`/dev/quick_guide`.
+
+Kubernetes/Rook development cluster
+-----------------------------------
+
+See :ref:`kubernetes-dev`
+
+.. _backporting:
+
+Backporting
+-----------
+
+All bugfixes should be merged to the ``main`` branch before being
+backported. To flag a bugfix for backporting, make sure it has a
+`tracker issue`_ associated with it and set the ``Backport`` field to a
+comma-separated list of previous releases (e.g. "hammer,jewel") that you think
+need the backport.
+The rest (including the actual backporting) will be taken care of by the
+`Stable Releases and Backports`_ team.
+
+.. _`tracker issue`: http://tracker.ceph.com/
+.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
+
+Dependabot
+----------
+
+Dependabot is a GitHub bot that scans the dependencies in the repositories for
+security vulnerabilities (CVEs). If a fix is available for a discovered CVE,
+Dependabot creates a pull request to update the dependency.
+
+Dependabot also indicates the compatibility score of the upgrade. This score is
+based on the number of CI failures that occur in other GitHub repositories
+where the fix was applied.
+
+With some configuration, Dependabot can perform non-security updates (for
+example, it can upgrade to the latest minor version or patch version).
+
+Dependabot supports `several languages and package managers
+<https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/about-dependabot-version-updates#supported-repositories-and-ecosystems>`_.
+As of July 2022, the Ceph project receives alerts only from pip (based on the
+`requirements.txt` files) and npm (`package*.json`). It is possible to extend
+these alerts to git submodules, Golang, and Java. As of July 2022, there is no
+support for C++ package managers such as vcpkg, conan, C++20 modules.
+
+Many of the dependencies discovered by Dependabot will best be updated
+elsewhere than the Ceph Github repository (distribution packages, for example,
+will be a better place to update some of the dependencies). Nonetheless, the
+list of new and existing vulnerabilities generated by Dependabot will be
+useful.
+
+`Here is an example of a Dependabot pull request.
+<https://github.com/ceph/ceph/pull/46998>`_
+
+Guidance for use of cluster log
+-------------------------------
+
+If your patches emit messages to the Ceph cluster log, please consult
+this: :doc:`/dev/logging`.
diff --git a/doc/dev/developer_guide/index.rst b/doc/dev/developer_guide/index.rst
new file mode 100644
index 000000000..30d5e5e22
--- /dev/null
+++ b/doc/dev/developer_guide/index.rst
@@ -0,0 +1,25 @@
+============================================
+Contributing to Ceph: A Guide for Developers
+============================================
+
+:Author: Loic Dachary
+:Author: Nathan Cutler
+:License: Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0)
+
+.. note:: You may also be interested in the :doc:`/dev/internals` documentation.
+
+.. toctree::
+ :maxdepth: 1
+
+ Introduction <intro>
+ Essentials <essentials>
+ What is Merged and When <merging>
+ Issue tracker <issue-tracker>
+ Basic workflow <basic-workflow>
+ Tests: Unit Tests <tests-unit-tests>
+ Tests: Integration Tests <tests-integration-tests>
+ Running Tests Locally <running-tests-locally>
+ Running Integration Tests using Teuthology <running-tests-using-teuth>
+ Running Tests in the Cloud <running-tests-in-cloud>
+ Ceph Dashboard Developer Documentation (formerly HACKING.rst) <dash-devel>
+ cephadm Developer Documentation <../cephadm/index>
diff --git a/doc/dev/developer_guide/intro.rst b/doc/dev/developer_guide/intro.rst
new file mode 100644
index 000000000..67b449c55
--- /dev/null
+++ b/doc/dev/developer_guide/intro.rst
@@ -0,0 +1,25 @@
+Introduction
+============
+
+This guide has two aims. First, it should lower the barrier to entry for
+software developers who wish to get involved in the Ceph project. Second,
+it should serve as a reference for Ceph developers.
+
+We assume that readers are already familiar with Ceph (the distributed
+object store and file system designed to provide excellent performance,
+reliability and scalability). If not, please refer to the `project website`_
+and especially the `publications list`_. Another way to learn about what's
+happening in Ceph is to check out our `youtube channel`_ , where we post Tech
+Talks, Code walk-throughs and Ceph Developer Monthly recordings.
+
+.. _`project website`: https://ceph.com
+.. _`publications list`: https://ceph.com/publications/
+.. _`youtube channel`: https://www.youtube.com/c/CephStorage
+
+Since this document is to be consumed by developers, who are assumed to
+have Internet access, topics covered elsewhere, either within the Ceph
+documentation or elsewhere on the web, are treated by linking. If you
+notice that a link is broken or if you know of a better link, please
+`report it as a bug`_.
+
+.. _`report it as a bug`: http://tracker.ceph.com/projects/ceph/issues/new
diff --git a/doc/dev/developer_guide/issue-tracker.rst b/doc/dev/developer_guide/issue-tracker.rst
new file mode 100644
index 000000000..eae68f3f0
--- /dev/null
+++ b/doc/dev/developer_guide/issue-tracker.rst
@@ -0,0 +1,39 @@
+.. _issue-tracker:
+
+Issue Tracker
+=============
+
+See `Redmine Issue Tracker`_ for a brief introduction to the Ceph Issue
+Tracker.
+
+Ceph developers use the issue tracker to
+
+1. keep track of issues - bugs, fix requests, feature requests, backport
+requests, etc.
+
+2. communicate with other developers and keep them informed as work
+on the issues progresses.
+
+Issue tracker conventions
+-------------------------
+
+When you start working on an existing issue, it's nice to let the other
+developers know this - to avoid duplication of labor. Typically, this is
+done by changing the :code:`Assignee` field (to yourself) and changing the
+:code:`Status` to *In progress*. Newcomers to the Ceph community typically do
+not have sufficient privileges to update these fields, however: they can
+simply update the issue with a brief note.
+
+.. table:: Meanings of some commonly used statuses
+
+ ================ ===========================================
+ Status Meaning
+ ================ ===========================================
+ New Initial status
+ In Progress Somebody is working on it
+ Need Review Pull request is open with a fix
+ Pending Backport Fix has been merged, backport(s) pending
+ Resolved Fix and backports (if any) have been merged
+ ================ ===========================================
+
+.. _Redmine issue tracker: https://tracker.ceph.com
diff --git a/doc/dev/developer_guide/merging.rst b/doc/dev/developer_guide/merging.rst
new file mode 100644
index 000000000..36e10fc84
--- /dev/null
+++ b/doc/dev/developer_guide/merging.rst
@@ -0,0 +1,138 @@
+.. _merging:
+
+Commit merging: scope and cadence
+==================================
+
+Commits are merged into branches according to criteria specific to each phase
+of the Ceph release lifecycle. This chapter codifies these criteria.
+
+Development releases (i.e. x.0.z)
+---------------------------------
+
+What ?
+^^^^^^
+
+* Features
+* Bug fixes
+
+Where ?
+^^^^^^^
+
+Features are merged to the *main* branch. Bug fixes should be merged to the
+corresponding named branch (e.g. *nautilus* for 14.0.z, *pacific* for 16.0.z,
+etc.). However, this is not mandatory - bug fixes and documentation
+enhancements can be merged to the *main* branch as well, since the *main*
+branch is itself occasionally merged to the named branch during the development
+releases phase. In either case, if a bug fix is important it can also be
+flagged for backport to one or more previous stable releases.
+
+When ?
+^^^^^^
+
+After each stable release, candidate branches for previous releases enter
+phase 2 (see below). For example: the *jewel* named branch was created when
+the *infernalis* release candidates entered phase 2. From this point on,
+*main* was no longer associated with *infernalis*. After he named branch of
+the next stable release is created, *main* will be occasionally merged into
+it.
+
+Branch merges
+^^^^^^^^^^^^^
+
+* The latest stable release branch is merged periodically into main.
+* The main branch is merged periodically into the branch of the stable release.
+* The main is merged into the stable release branch
+ immediately after each development (x.0.z) release.
+
+Stable release candidates (i.e. x.1.z) phase 1
+----------------------------------------------
+
+What ?
+^^^^^^
+
+* Bug fixes only
+
+Where ?
+^^^^^^^
+
+The stable release branch (e.g. *jewel* for 10.0.z, *luminous*
+for 12.0.z, etc.) or *main*. Bug fixes should be merged to the named
+branch corresponding to the stable release candidate (e.g. *jewel* for
+10.1.z) or to *main*. During this phase, all commits to *main* will be
+merged to the named branch, and vice versa. In other words, it makes
+no difference whether a commit is merged to the named branch or to
+*main* - it will make it into the next release candidate either way.
+
+When ?
+^^^^^^
+
+After the first stable release candidate is published, i.e. after the
+x.1.0 tag is set in the release branch.
+
+Branch merges
+^^^^^^^^^^^^^
+
+* The stable release branch is merged periodically into *main*.
+* The *main* branch is merged periodically into the stable release branch.
+* The *main* branch is merged into the stable release branch
+ immediately after each x.1.z release candidate.
+
+Stable release candidates (i.e. x.1.z) phase 2
+----------------------------------------------
+
+What ?
+^^^^^^
+
+* Bug fixes only
+
+Where ?
+^^^^^^^
+
+The stable release branch (e.g. *mimic* for 13.0.z, *octopus* for 15.0.z
+,etc.). During this phase, all commits to the named branch will be merged into
+*main*. Cherry-picking to the named branch during release candidate phase 2
+is performed manually since the official backporting process begins only when
+the release is pronounced "stable".
+
+When ?
+^^^^^^
+
+After Sage Weil announces that it is time for phase 2 to happen.
+
+Branch merges
+^^^^^^^^^^^^^
+
+* The stable release branch is occasionally merged into main.
+
+Stable releases (i.e. x.2.z)
+----------------------------
+
+What ?
+^^^^^^
+
+* Bug fixes
+* Features are sometime accepted
+* Commits should be cherry-picked from *main* when possible
+* Commits that are not cherry-picked from *main* must pertain to a bug unique to
+ the stable release
+* See also the `backport HOWTO`_ document
+
+.. _`backport HOWTO`:
+ http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO#HOWTO
+
+Where ?
+^^^^^^^
+
+The stable release branch (*hammer* for 0.94.x, *infernalis* for 9.2.x,
+etc.)
+
+When ?
+^^^^^^
+
+After the stable release is published, i.e. after the "vx.2.0" tag is set in
+the release branch.
+
+Branch merges
+^^^^^^^^^^^^^
+
+Never
diff --git a/doc/dev/developer_guide/running-tests-in-cloud.rst b/doc/dev/developer_guide/running-tests-in-cloud.rst
new file mode 100644
index 000000000..60118aefd
--- /dev/null
+++ b/doc/dev/developer_guide/running-tests-in-cloud.rst
@@ -0,0 +1,289 @@
+Running Tests in the Cloud
+==========================
+
+In this chapter, we will explain in detail how use an OpenStack
+tenant as an environment for Ceph `integration testing`_.
+
+Assumptions and caveat
+----------------------
+
+We assume that:
+
+1. you are the only person using the tenant
+2. you have the credentials
+3. the tenant supports the ``nova`` and ``cinder`` APIs
+
+Caveat: be aware that, as of this writing (July 2016), testing in
+OpenStack clouds is a new feature. Things may not work as advertised.
+If you run into trouble, ask for help on `IRC`_ or the `Mailing list`_, or
+open a bug report at the `ceph-workbench bug tracker`_.
+
+.. _`ceph-workbench bug tracker`: http://ceph-workbench.dachary.org/root/ceph-workbench/issues
+
+Prepare tenant
+--------------
+
+If you have not tried to use ``ceph-workbench`` with this tenant before,
+proceed to the next step.
+
+To start with a clean slate, login to your tenant via the Horizon dashboard
+and:
+
+* terminate the ``teuthology`` and ``packages-repository`` instances, if any
+* delete the ``teuthology`` and ``teuthology-worker`` security groups, if any
+* delete the ``teuthology`` and ``teuthology-myself`` key pairs, if any
+
+Also do the above if you ever get key-related errors ("invalid key", etc.)
+when trying to schedule suites.
+
+Getting ceph-workbench
+----------------------
+
+Since testing in the cloud is done using the ``ceph-workbench ceph-qa-suite``
+tool, you will need to install that first. It is designed
+to be installed via Docker, so if you don't have Docker running on your
+development machine, take care of that first. You can follow `the official
+tutorial <https://docs.docker.com/engine/installation/>`_ to install if
+you have not installed yet.
+
+Once Docker is up and running, install ``ceph-workbench`` by following the
+`Installation instructions in the ceph-workbench documentation
+<http://ceph-workbench.readthedocs.io/en/latest/#installation>`_.
+
+Linking ceph-workbench with your OpenStack tenant
+-------------------------------------------------
+
+Before you can trigger your first teuthology suite, you will need to link
+``ceph-workbench`` with your OpenStack account.
+
+First, download a ``openrc.sh`` file by clicking on the "Download OpenStack
+RC File" button, which can be found in the "API Access" tab of the "Access
+& Security" dialog of the OpenStack Horizon dashboard.
+
+Second, create a ``~/.ceph-workbench`` directory, set its permissions to
+700, and move the ``openrc.sh`` file into it. Make sure that the filename
+is exactly ``~/.ceph-workbench/openrc.sh``.
+
+Third, edit the file so it does not ask for your OpenStack password
+interactively. Comment out the relevant lines and replace them with
+something like::
+
+.. prompt:: bash $
+
+ export OS_PASSWORD="aiVeth0aejee3eep8rogho3eep7Pha6ek"
+
+When ``ceph-workbench ceph-qa-suite`` connects to your OpenStack tenant for
+the first time, it will generate two keypairs: ``teuthology-myself`` and
+``teuthology``.
+
+.. If this is not the first time you have tried to use
+.. ``ceph-workbench ceph-qa-suite`` with this tenant, make sure to delete any
+.. stale keypairs with these names!
+
+Run the dummy suite
+-------------------
+
+You are now ready to take your OpenStack teuthology setup for a test
+drive
+
+.. prompt:: bash $
+
+ ceph-workbench ceph-qa-suite --suite dummy
+
+Be forewarned that the first run of ``ceph-workbench ceph-qa-suite`` on a
+pristine tenant will take a long time to complete because it downloads a VM
+image and during this time the command may not produce any output.
+
+The images are cached in OpenStack, so they are only downloaded once.
+Subsequent runs of the same command will complete faster.
+
+Although ``dummy`` suite does not run any tests, in all other respects it
+behaves just like a teuthology suite and produces some of the same
+artifacts.
+
+The last bit of output should look something like this::
+
+ pulpito web interface: http://149.202.168.201:8081/
+ ssh access : ssh -i /home/smithfarm/.ceph-workbench/teuthology-myself.pem ubuntu@149.202.168.201 # logs in /usr/share/nginx/html
+
+What this means is that ``ceph-workbench ceph-qa-suite`` triggered the test
+suite run. It does not mean that the suite run has completed. To monitor
+progress of the run, check the Pulpito web interface URL periodically, or
+if you are impatient, ssh to the teuthology machine using the ssh command
+shown and do
+
+.. prompt:: bash $
+
+ tail -f /var/log/teuthology.*
+
+The `/usr/share/nginx/html` directory contains the complete logs of the
+test suite. If we had provided the ``--upload`` option to the
+``ceph-workbench ceph-qa-suite`` command, these logs would have been
+uploaded to http://teuthology-logs.public.ceph.com.
+
+Run a standalone test
+---------------------
+
+The standalone test explained in `Reading a standalone test`_ can be run
+with the following command
+
+.. prompt:: bash $
+
+ ceph-workbench ceph-qa-suite --suite rados/singleton/all/admin-socket.yaml
+
+This will run the suite shown on the current ``master`` branch of
+``ceph/ceph.git``. You can specify a different branch with the ``--ceph``
+option, and even a different git repo with the ``--ceph-git-url`` option. (Run
+``ceph-workbench ceph-qa-suite --help`` for an up-to-date list of available
+options.)
+
+The first run of a suite will also take a long time, because ceph packages
+have to be built, first. Again, the packages so built are cached and
+``ceph-workbench ceph-qa-suite`` will not build identical packages a second
+time.
+
+Interrupt a running suite
+-------------------------
+
+Teuthology suites take time to run. From time to time one may wish to
+interrupt a running suite. One obvious way to do this is::
+
+.. prompt:: bash $
+
+ ceph-workbench ceph-qa-suite --teardown
+
+This destroys all VMs created by ``ceph-workbench ceph-qa-suite`` and
+returns the OpenStack tenant to a "clean slate".
+
+Sometimes you may wish to interrupt the running suite, but keep the logs,
+the teuthology VM, the packages-repository VM, etc. To do this, you can
+``ssh`` to the teuthology VM (using the ``ssh access`` command reported
+when you triggered the suite -- see `Run the dummy suite`_) and, once
+there
+
+.. prompt:: bash $
+
+ sudo /etc/init.d/teuthology restart
+
+This will keep the teuthology machine, the logs and the packages-repository
+instance but nuke everything else.
+
+Upload logs to archive server
+-----------------------------
+
+Since the teuthology instance in OpenStack is only semi-permanent, with
+limited space for storing logs, ``teuthology-openstack`` provides an
+``--upload`` option which, if included in the ``ceph-workbench ceph-qa-suite``
+command, will cause logs from all failed jobs to be uploaded to the log
+archive server maintained by the Ceph project. The logs will appear at the
+URL::
+
+ http://teuthology-logs.public.ceph.com/$RUN
+
+where ``$RUN`` is the name of the run. It will be a string like this::
+
+ ubuntu-2016-07-23_16:08:12-rados-hammer-backports---basic-openstack
+
+Even if you don't providing the ``--upload`` option, however, all the logs can
+still be found on the teuthology machine in the directory
+``/usr/share/nginx/html``.
+
+Provision VMs ad hoc
+--------------------
+
+From the teuthology VM, it is possible to provision machines on an "ad hoc"
+basis, to use however you like. The magic incantation is::
+
+.. prompt:: bash $
+
+ teuthology-lock --lock-many $NUMBER_OF_MACHINES \
+ --os-type $OPERATING_SYSTEM \
+ --os-version $OS_VERSION \
+ --machine-type openstack \
+ --owner $EMAIL_ADDRESS
+
+The command must be issued from the ``~/teuthology`` directory. The possible
+values for ``OPERATING_SYSTEM`` AND ``OS_VERSION`` can be found by examining
+the contents of the directory ``teuthology/openstack/``. For example
+
+.. prompt:: bash $
+
+ teuthology-lock --lock-many 1 --os-type ubuntu --os-version 16.04 \
+ --machine-type openstack --owner foo@example.com
+
+When you are finished with the machine, find it in the list of machines
+
+.. prompt:: bash $
+
+ openstack server list
+
+to determine the name or ID, and then terminate it with
+
+.. prompt:: bash $
+
+ openstack server delete $NAME_OR_ID
+
+Deploy a cluster for manual testing
+-----------------------------------
+
+The `teuthology framework`_ and ``ceph-workbench ceph-qa-suite`` are
+versatile tools that automatically provision Ceph clusters in the cloud and
+run various tests on them in an automated fashion. This enables a single
+engineer, in a matter of hours, to perform thousands of tests that would
+keep dozens of human testers occupied for days or weeks if conducted
+manually.
+
+However, there are times when the automated tests do not cover a particular
+scenario and manual testing is desired. It turns out that it is simple to
+adapt a test to stop and wait after the Ceph installation phase, and the
+engineer can then ssh into the running cluster. Simply add the following
+snippet in the desired place within the test YAML and schedule a run with the
+test::
+
+ tasks:
+ - exec:
+ client.0:
+ - sleep 1000000000 # forever
+
+(Make sure you have a ``client.0`` defined in your ``roles`` stanza or adapt
+accordingly.)
+
+The same effect can be achieved using the ``interactive`` task::
+
+ tasks:
+ - interactive
+
+By following the test log, you can determine when the test cluster has entered
+the "sleep forever" condition. At that point, you can ssh to the teuthology
+machine and from there to one of the target VMs (OpenStack) or teuthology
+worker machines machine (Sepia) where the test cluster is running.
+
+The VMs (or "instances" in OpenStack terminology) created by
+``ceph-workbench ceph-qa-suite`` are named as follows:
+
+``teuthology`` - the teuthology machine
+
+``packages-repository`` - VM where packages are stored
+
+``ceph-*`` - VM where packages are built
+
+``target*`` - machines where tests are run
+
+The VMs named ``target*`` are used by tests. If you are monitoring the
+teuthology log for a given test, the hostnames of these target machines can
+be found out by searching for the string ``Locked targets``::
+
+ 2016-03-20T11:39:06.166 INFO:teuthology.task.internal:Locked targets:
+ target149202171058.teuthology: null
+ target149202171059.teuthology: null
+
+The IP addresses of the target machines can be found by running ``openstack
+server list`` on the teuthology machine, but the target VM hostnames (e.g.
+``target149202171058.teuthology``) are resolvable within the teuthology
+cluster.
+
+.. _Integration testing: ../tests-integration-tests
+.. _IRC: ../essentials/#irc
+.. _Mailing List: ../essentials/#mailing-list
+.. _Reading A Standalone Test: ../testing-integration-tests/#reading-a-standalone-test
+.. _teuthology framework: https://github.com/ceph/teuthology
diff --git a/doc/dev/developer_guide/running-tests-locally.rst b/doc/dev/developer_guide/running-tests-locally.rst
new file mode 100644
index 000000000..b786c12e8
--- /dev/null
+++ b/doc/dev/developer_guide/running-tests-locally.rst
@@ -0,0 +1,138 @@
+Running Unit Tests
+==================
+
+How to run s3-tests locally
+---------------------------
+
+RGW code can be tested by building Ceph locally from source, starting a vstart
+cluster, and running the "s3-tests" suite against it.
+
+The following instructions should work on jewel and above.
+
+Step 1 - build Ceph
+^^^^^^^^^^^^^^^^^^^
+
+Refer to :doc:`/install/build-ceph`.
+
+You can do step 2 separately while it is building.
+
+Step 2 - vstart
+^^^^^^^^^^^^^^^
+
+When the build completes, and still in the top-level directory of the git
+clone where you built Ceph, do the following, for cmake builds::
+
+ cd build/
+ RGW=1 ../src/vstart.sh -n
+
+This will produce a lot of output as the vstart cluster is started up. At the
+end you should see a message like::
+
+ started. stop.sh to stop. see out/* (e.g. 'tail -f out/????') for debug output.
+
+This means the cluster is running.
+
+
+Step 3 - run s3-tests
+^^^^^^^^^^^^^^^^^^^^^
+
+.. highlight:: console
+
+To run the s3tests suite do the following::
+
+ $ ../qa/workunits/rgw/run-s3tests.sh
+
+
+Running test using vstart_runner.py
+-----------------------------------
+CephFS and Ceph Manager code is be tested using `vstart_runner.py`_.
+
+Running your first test
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+The Python tests in Ceph repository can be executed on your local machine
+using `vstart_runner.py`_. To do that, you'd need `teuthology`_ installed::
+
+ $ virtualenv --python=python3 venv
+ $ source venv/bin/activate
+ $ pip install 'setuptools >= 12'
+ $ pip install git+https://github.com/ceph/teuthology#egg=teuthology[test]
+ $ deactivate
+
+The above steps installs teuthology in a virtual environment. Before running
+a test locally, build Ceph successfully from the source (refer
+:doc:`/install/build-ceph`) and do::
+
+ $ cd build
+ $ ../src/vstart.sh -n -d -l
+ $ source ~/path/to/teuthology/venv/bin/activate
+
+To run a specific test, say `test_reconnect_timeout`_ from
+`TestClientRecovery`_ in ``qa/tasks/cephfs/test_client_recovery``, you can
+do::
+
+ $ python ../qa/tasks/vstart_runner.py tasks.cephfs.test_client_recovery.TestClientRecovery.test_reconnect_timeout
+
+The above command runs vstart_runner.py and passes the test to be executed as
+an argument to vstart_runner.py. In a similar way, you can also run the group
+of tests in the following manner::
+
+ $ # run all tests in class TestClientRecovery
+ $ python ../qa/tasks/vstart_runner.py tasks.cephfs.test_client_recovery.TestClientRecovery
+ $ # run all tests in test_client_recovery.py
+ $ python ../qa/tasks/vstart_runner.py tasks.cephfs.test_client_recovery
+
+Based on the argument passed, vstart_runner.py collects tests and executes as
+it would execute a single test.
+
+vstart_runner.py can take the following options -
+
+--clear-old-log deletes old log file before running the test
+--create create Ceph cluster before running a test
+--create-cluster-only creates the cluster and quits; tests can be issued
+ later
+--interactive drops a Python shell when a test fails
+--log-ps-output logs ps output; might be useful while debugging
+--teardown tears Ceph cluster down after test(s) has finished
+ runnng
+--kclient use the kernel cephfs client instead of FUSE
+--brxnet=<net/mask> specify a new net/mask for the mount clients' network
+ namespace container (Default: 192.168.0.0/16)
+
+.. note:: If using the FUSE client, ensure that the fuse package is installed
+ and enabled on the system and that ``user_allow_other`` is added
+ to ``/etc/fuse.conf``.
+
+.. note:: If using the kernel client, the user must have the ability to run
+ commands with passwordless sudo access. A failure on the kernel
+ client may crash the host, so it's recommended to use this
+ functionality within a virtual machine.
+
+Internal working of vstart_runner.py -
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+vstart_runner.py primarily does three things -
+
+* collects and runs the tests
+ vstart_runner.py setups/teardowns the cluster and collects and runs the
+ test. This is implemented using methods ``scan_tests()``, ``load_tests()``
+ and ``exec_test()``. This is where all the options that vstart_runner.py
+ takes are implemented along with other features like logging and copying
+ the traceback to the bottom of the log.
+
+* provides an interface for issuing and testing shell commands
+ The tests are written assuming that the cluster exists on remote machines.
+ vstart_runner.py provides an interface to run the same tests with the
+ cluster that exists within the local machine. This is done using the class
+ ``LocalRemote``. Class ``LocalRemoteProcess`` can manage the process that
+ executes the commands from ``LocalRemote``, class ``LocalDaemon`` provides
+ an interface to handle Ceph daemons and class ``LocalFuseMount`` can
+ create and handle FUSE mounts.
+
+* provides an interface to operate Ceph cluster
+ ``LocalCephManager`` provides methods to run Ceph cluster commands with
+ and without admin socket and ``LocalCephCluster`` provides methods to set
+ or clear ``ceph.conf``.
+
+.. _test_reconnect_timeout: https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_client_recovery.py#L133
+.. _TestClientRecovery: https://github.com/ceph/ceph/blob/master/qa/tasks/cephfs/test_client_recovery.py#L86
+.. _teuthology: https://github.com/ceph/teuthology
+.. _vstart_runner.py: https://github.com/ceph/ceph/blob/master/qa/tasks/vstart_runner.py
diff --git a/doc/dev/developer_guide/running-tests-using-teuth.rst b/doc/dev/developer_guide/running-tests-using-teuth.rst
new file mode 100644
index 000000000..492b7790e
--- /dev/null
+++ b/doc/dev/developer_guide/running-tests-using-teuth.rst
@@ -0,0 +1,183 @@
+Running Integration Tests using Teuthology
+==========================================
+
+Getting binaries
+----------------
+To run integration tests using teuthology, you need to have Ceph binaries
+built for your branch. Follow these steps to initiate the build process -
+
+#. Push the branch to `ceph-ci`_ repository. This triggers the process of
+ building the binaries.
+
+#. To confirm that the build process has been initiated, spot the branch name
+ at `Shaman`_. Little after the build process has been initiated, the single
+ entry with your branch name would multiply, each new entry for a different
+ combination of distro and flavour.
+
+#. Wait until the packages are built and uploaded, and the repository offering
+ them are created. This is marked by colouring the entries for the branch
+ name green. Preferably, wait until each entry is coloured green. Usually,
+ it takes around 2-3 hours depending on the availability of the machines.
+
+.. note:: Branch to be pushed on ceph-ci can be any branch, it shouldn't
+ necessarily be a PR branch.
+
+.. note:: In case you are pushing master or any other standard branch, check
+ `Shaman`_ beforehand since it already might have builds ready for it.
+
+Triggering Tests
+----------------
+After building is complete, proceed to trigger tests -
+
+#. Log in to the teuthology machine::
+
+ ssh <username>@teuthology.front.sepia.ceph.com
+
+ This would require Sepia lab access. To know how to request it, see: https://ceph.github.io/sepia/adding_users/
+
+#. Next, get teuthology installed. Run the first set of commands in
+ `Running Your First Test`_ for that. After that, activate the virtual
+ environment in which teuthology is installed.
+
+#. Run the ``teuthology-suite`` command::
+
+ teuthology-suite -v -m smithi -c wip-devname-feature-x -s fs -p 110 --filter "cephfs-shell"
+
+ Following are the options used in above command with their meanings -
+ -v verbose
+ -m machine name
+ -c branch name, the branch that was pushed on ceph-ci
+ -s test-suite name
+ -p higher the number, lower the priority of the job
+ --filter filter tests in given suite that needs to run, the arg to
+ filter should be the test you want to run
+
+.. note:: The priority number present in the command above is just a
+ placeholder. It might be highly inappropriate for the jobs you may want to
+ trigger. See `Testing Priority`_ section to pick a priority number.
+
+.. note:: Don't skip passing a priority number, the default value is 1000
+ which way too high; the job probably might never run.
+
+#. Wait for the tests to run. ``teuthology-suite`` prints a link to the
+ `Pulpito`_ page created for the tests triggered.
+
+Other frequently used/useful options are ``-d`` (or ``--distro``),
+``--distroversion``, ``--filter-out``, ``--timeout``, ``flavor``, ``-rerun``,
+``-l`` (for limiting number of jobs) , ``-n`` (for how many times job would
+run) and ``-e`` (for email notifications). Run ``teuthology-suite --help``
+to read description of these and every other options available.
+
+Testing QA changes (without re-building binaires)
+-------------------------------------------------
+While writing a PR you might need to test your PR repeatedly using teuthology.
+If you are making non-QA changes, you need to follow the standard process of
+triggering builds, waiting for it to finish and then triggering tests and
+wait for the result. But if changes you made are purely changes in qa/,
+you don't need rebuild the binaries. Instead you can test binaries built for
+the ceph-ci branch and instruct ``teuthology-suite`` command to use a separate
+branch for running tests. The separate branch can be passed to the command
+by using ``--suite-repo`` and ``--suite-branch``. Pass the link to the GitHub
+fork where your PR branch exists to the first option and pass the PR branch
+name to the second option.
+
+For example, if you want to make changes in ``qa/`` after testing ``branch-x``
+(of which has ceph-ci branch is ``wip-username-branch-x``) by running
+following command::
+
+ teuthology-suite -v -m smithi -c wip-username-branch-x -s fs -p 50 --filter cephfs-shell
+
+You can make the modifications locally, update the PR branch and then
+trigger tests from your PR branch as follows::
+
+ teuthology-suite -v -m smithi -c wip-username-branch-x -s fs -p 50 --filter cephfs-shell --suite-repo https://github.com/username/ceph --suite-branch branch-x
+
+You can verify if the tests were run using this branch by looking at values
+for the keys ``suite_branch``, ``suite_repo`` and ``suite_sha1`` in the job
+config printed at the very beginning of the teuthology job.
+
+About Suites and Filters
+------------------------
+See `Suites Inventory`_ for a list of suites of integration tests present
+right now. Alternatively, each directory under ``qa/suites`` in Ceph
+repository is an integration test suite, so looking within that directory
+to decide an appropriate argument for ``-s`` also works.
+
+For picking an argument for ``--filter``, look within
+``qa/suites/<suite-name>/<subsuite-name>/tasks`` to get keywords for filtering
+tests. Each YAML file in there can trigger a bunch of tests; using the name of
+the file, without the extension part of the file name, as an argument to the
+``--filter`` will trigger those tests. For example, the sample command above
+uses ``cephfs-shell`` since there's a file named ``cephfs-shell.yaml`` in
+``qa/suites/fs/basic_functional/tasks/``. In case, the file name doesn't hint
+what bunch of tests it would trigger, look at the contents of the file for
+``modules`` attribute. For ``cephfs-shell.yaml`` the ``modules`` attribute
+is ``tasks.cephfs.test_cephfs_shell`` which means it'll trigger all tests in
+``qa/tasks/cephfs/test_cephfs_shell.py``.
+
+Killing Tests
+-------------
+Sometimes a teuthology job might not complete running for several minutes or
+even hours after tests that were trigged have completed running and other
+times wrong set of tests can be triggered is filter wasn't chosen carefully.
+To save resource it's better to termniate such a job. Following is the command
+to terminate a job::
+
+ teuthology-kill -r teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi
+
+Let's call the argument passed to ``-r`` as test ID. It can be found
+easily in the link to the Pulpito page for the tests you triggered. For
+example, for the above test ID, the link is - http://pulpito.front.sepia.ceph.com/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/
+
+Re-running Tests
+----------------
+Pass ``--rerun`` option, with test ID as an argument to it, to
+``teuthology-suite`` command::
+
+ teuthology-suite -v -m smithi -c wip-rishabh-fs-test_cephfs_shell-fix -p 50 --rerun teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi
+
+The meaning of rest of the options is already covered in `Triggering Tests`
+section.
+
+Teuthology Archives
+-------------------
+Once the tests have finished running, the log for the job can be obtained by
+clicking on job ID at the Pulpito page for your tests. It's more convenient to
+download the log and then view it rather than viewing it in an internet
+browser since these logs can easily be upto size of 1 GB. What's much more
+easier is to log in to the teuthology machine again
+(``teuthology.front.sepia.ceph.com``), and access the following path::
+
+ /ceph/teuthology-archive/<test-id>/<job-id>/teuthology.log
+
+For example, for above test ID path is::
+
+ /ceph/teuthology-archive/teuthology-2019-12-10_05:00:03-smoke-master-testing-basic-smithi/4588482/teuthology.log
+
+This way the log remotely can be viewed remotely without having to wait too
+much.
+
+Naming the ceph-ci branch
+-------------------------
+There are no hard conventions (except for the case of stable branch; see
+next paragraph) for how the branch pushed on ceph-ci is named. But, to make
+builds and tests easily identitifiable on Shaman and Pulpito respectively,
+prepend it with your name. For example branch ``feature-x`` can be named
+``wip-yourname-feature-x`` while pushing on ceph-ci.
+
+In case you are using one of the stable branches (e.g. nautilis, mimic,
+etc.), include the name of that stable branch in your ceph-ci branch name.
+For example, ``feature-x`` PR branch should be named as
+``wip-feature-x-nautilus``. *This is not just a matter of convention but this,
+more essentially, builds your branch in the correct environment.*
+
+Delete the branch from ceph-ci, once it's not required anymore. If you are
+logged in at GitHub, all your branches on ceph-ci can be easily found here -
+https://github.com/ceph/ceph-ci/branches.
+
+.. _ceph-ci: https://github.com/ceph/ceph-ci
+.. _Pulpito: http://pulpito.front.sepia.ceph.com/
+.. _Running Your First Test: ../running-tests-locally/#running-your-first-test
+.. _Shaman: https://shaman.ceph.com/builds/ceph/
+.. _Suites Inventory: ../tests-integration-tests/#suites-inventory
+.. _Testing Priority: ../tests-integration-tests/#testing-priority
diff --git a/doc/dev/developer_guide/tests-integration-tests.rst b/doc/dev/developer_guide/tests-integration-tests.rst
new file mode 100644
index 000000000..c8e6dbcd4
--- /dev/null
+++ b/doc/dev/developer_guide/tests-integration-tests.rst
@@ -0,0 +1,522 @@
+.. _testing-integration-tests:
+
+Testing - Integration Tests
+===========================
+
+Ceph has two types of tests: :ref:`make check <make-check>` tests and integration tests.
+When a test requires multiple machines, root access or lasts for a
+longer time (for example, to simulate a realistic Ceph deployment), it
+is deemed to be an integration test. Integration tests are organized into
+"suites", which are defined in the `ceph/qa sub-directory`_ and run with
+the ``teuthology-suite`` command.
+
+The ``teuthology-suite`` command is part of the `teuthology framework`_.
+In the sections that follow we attempt to provide a detailed introduction
+to that framework from the perspective of a beginning Ceph developer.
+
+Teuthology consumes packages
+----------------------------
+
+It may take some time to understand the significance of this fact, but it
+is `very` significant. It means that automated tests can be conducted on
+multiple platforms using the same packages (RPM, DEB) that can be
+installed on any machine running those platforms.
+
+Teuthology has a `list of platforms that it supports
+<https://github.com/ceph/ceph/tree/master/qa/distros/supported>`_ (as
+of September 2020 the list consisted of "RHEL/CentOS 8" and "Ubuntu 18.04"). It
+expects to be provided pre-built Ceph packages for these platforms.
+Teuthology deploys these platforms on machines (bare-metal or
+cloud-provisioned), installs the packages on them, and deploys Ceph
+clusters on them - all as called for by the test.
+
+The Nightlies
+-------------
+
+A number of integration tests are run on a regular basis in the `Sepia
+lab`_ against the official Ceph repositories (on the ``master`` development
+branch and the stable branches). Traditionally, these tests are called "the
+nightlies" because the Ceph core developers used to live and work in
+the same time zone and from their perspective the tests were run overnight.
+
+The results of the nightlies are published at http://pulpito.ceph.com/. The
+developer nick shows in the
+test results URL and in the first column of the Pulpito dashboard. The
+results are also reported on the `ceph-qa mailing list
+<https://ceph.com/irc/>`_ for analysis.
+
+Testing Priority
+----------------
+
+The ``teuthology-suite`` command includes an almost mandatory option ``-p <N>``
+which specifies the priority of the jobs submitted to the queue. The lower
+the value of ``N``, the higher the priority. The option is almost mandatory
+because the default is ``1000`` which matches the priority of the nightlies.
+Nightlies are often half-finished and cancelled due to the volume of testing
+done so your jobs may never finish. Therefore, it is common to select a
+priority less than 1000.
+
+Job priority should be selected based on the following recommendations:
+
+* **Priority < 10:** Use this if the sky is falling and some group of tests
+ must be run ASAP.
+
+* **10 <= Priority < 50:** Use this if your tests are urgent and blocking
+ other important development.
+
+* **50 <= Priority < 75:** Use this if you are testing a particular
+ feature/fix and running fewer than about 25 jobs. This range can also be
+ used for urgent release testing.
+
+* **75 <= Priority < 100:** Tech Leads will regularly schedule integration
+ tests with this priority to verify pull requests against master.
+
+* **100 <= Priority < 150:** This priority is to be used for QE validation of
+ point releases.
+
+* **150 <= Priority < 200:** Use this priority for 100 jobs or fewer of a
+ particular feature/fix that you'd like results on in a day or so.
+
+* **200 <= Priority < 1000:** Use this priority for large test runs that can
+ be done over the course of a week.
+
+In case you don't know how many jobs would be triggered by
+``teuthology-suite`` command, use ``--dry-run`` to get a count first and then
+issue ``teuthology-suite`` command again, this time without ``--dry-run`` and
+with ``-p`` and an appropriate number as an argument to it.
+
+To skip the priority check, use ``--force-priority``. In order to be sensitive
+to the runs of other developers who also need to do testing, please use it in
+emergency only.
+
+Suites Inventory
+----------------
+
+The ``suites`` directory of the `ceph/qa sub-directory`_ contains
+all the integration tests, for all the Ceph components.
+
+`ceph-deploy <https://github.com/ceph/ceph/tree/master/qa/suites/ceph-deploy>`_
+ install a Ceph cluster with ``ceph-deploy`` (:ref:`ceph-deploy man page <ceph-deploy>`)
+
+`dummy <https://github.com/ceph/ceph/tree/master/qa/suites/dummy>`_
+ get a machine, do nothing and return success (commonly used to
+ verify the :ref:`testing-integration-tests` infrastructure works as expected)
+
+`fs <https://github.com/ceph/ceph/tree/master/qa/suites/fs>`_
+ test CephFS mounted using kernel and FUSE clients, also with multiple MDSs.
+
+`krbd <https://github.com/ceph/ceph/tree/master/qa/suites/krbd>`_
+ test the RBD kernel module
+
+`powercycle <https://github.com/ceph/ceph/tree/master/qa/suites/powercycle>`_
+ verify the Ceph cluster behaves when machines are powered off
+ and on again
+
+`rados <https://github.com/ceph/ceph/tree/master/qa/suites/rados>`_
+ run Ceph clusters including OSDs and MONs, under various conditions of
+ stress
+
+`rbd <https://github.com/ceph/ceph/tree/master/qa/suites/rbd>`_
+ run RBD tests using actual Ceph clusters, with and without qemu
+
+`rgw <https://github.com/ceph/ceph/tree/master/qa/suites/rgw>`_
+ run RGW tests using actual Ceph clusters
+
+`smoke <https://github.com/ceph/ceph/tree/master/qa/suites/smoke>`_
+ run tests that exercise the Ceph API with an actual Ceph cluster
+
+`teuthology <https://github.com/ceph/ceph/tree/master/qa/suites/teuthology>`_
+ verify that teuthology can run integration tests, with and without OpenStack
+
+`upgrade <https://github.com/ceph/ceph/tree/master/qa/suites/upgrade>`_
+ for various versions of Ceph, verify that upgrades can happen
+ without disrupting an ongoing workload
+
+.. _`ceph-deploy man page`: ../../man/8/ceph-deploy
+
+teuthology-describe-tests
+-------------------------
+
+In February 2016, a new feature called ``teuthology-describe-tests`` was
+added to the `teuthology framework`_ to facilitate documentation and better
+understanding of integration tests (`feature announcement
+<http://article.gmane.org/gmane.comp.file-systems.ceph.devel/29287>`_).
+
+The upshot is that tests can be documented by embedding ``meta:``
+annotations in the yaml files used to define the tests. The results can be
+seen in the `ceph-qa-suite wiki
+<http://tracker.ceph.com/projects/ceph-qa-suite/wiki/>`_.
+
+Since this is a new feature, many yaml files have yet to be annotated.
+Developers are encouraged to improve the documentation, in terms of both
+coverage and quality.
+
+How integration tests are run
+-----------------------------
+
+Given that - as a new Ceph developer - you will typically not have access
+to the `Sepia lab`_, you may rightly ask how you can run the integration
+tests in your own environment.
+
+One option is to set up a teuthology cluster on bare metal. Though this is
+a non-trivial task, it `is` possible. Here are `some notes
+<http://docs.ceph.com/teuthology/docs/LAB_SETUP.html>`_ to get you started
+if you decide to go this route.
+
+If you have access to an OpenStack tenant, you have another option: the
+`teuthology framework`_ has an OpenStack backend, which is documented `here
+<https://github.com/dachary/teuthology/tree/openstack#openstack-backend>`__.
+This OpenStack backend can build packages from a given git commit or
+branch, provision VMs, install the packages and run integration tests
+on those VMs. This process is controlled using a tool called
+``ceph-workbench ceph-qa-suite``. This tool also automates publishing of
+test results at http://teuthology-logs.public.ceph.com.
+
+Running integration tests on your code contributions and publishing the
+results allows reviewers to verify that changes to the code base do not
+cause regressions, or to analyze test failures when they do occur.
+
+Every teuthology cluster, whether bare-metal or cloud-provisioned, has a
+so-called "teuthology machine" from which tests suites are triggered using the
+``teuthology-suite`` command.
+
+A detailed and up-to-date description of each `teuthology-suite`_ option is
+available by running the following command on the teuthology machine
+
+.. prompt:: bash $
+
+ teuthology-suite --help
+
+.. _teuthology-suite: http://docs.ceph.com/teuthology/docs/teuthology.suite.html
+
+How integration tests are defined
+---------------------------------
+
+Integration tests are defined by yaml files found in the ``suites``
+subdirectory of the `ceph/qa sub-directory`_ and implemented by python
+code found in the ``tasks`` subdirectory. Some tests ("standalone tests")
+are defined in a single yaml file, while other tests are defined by a
+directory tree containing yaml files that are combined, at runtime, into a
+larger yaml file.
+
+Reading a standalone test
+-------------------------
+
+Let us first examine a standalone test, or "singleton".
+
+Here is a commented example using the integration test
+`rados/singleton/all/admin-socket.yaml
+<https://github.com/ceph/ceph/blob/master/qa/suites/rados/singleton/all/admin-socket.yaml>`_
+
+.. code-block:: yaml
+
+ roles:
+ - - mon.a
+ - osd.0
+ - osd.1
+ tasks:
+ - install:
+ - ceph:
+ - admin_socket:
+ osd.0:
+ version:
+ git_version:
+ help:
+ config show:
+ config set filestore_dump_file /tmp/foo:
+ perf dump:
+ perf schema:
+
+The ``roles`` array determines the composition of the cluster (how
+many MONs, OSDs, etc.) on which this test is designed to run, as well
+as how these roles will be distributed over the machines in the
+testing cluster. In this case, there is only one element in the
+top-level array: therefore, only one machine is allocated to the
+test. The nested array declares that this machine shall run a MON with
+id ``a`` (that is the ``mon.a`` in the list of roles) and two OSDs
+(``osd.0`` and ``osd.1``).
+
+The body of the test is in the ``tasks`` array: each element is
+evaluated in order, causing the corresponding python file found in the
+``tasks`` subdirectory of the `teuthology repository`_ or
+`ceph/qa sub-directory`_ to be run. "Running" in this case means calling
+the ``task()`` function defined in that file.
+
+In this case, the `install
+<https://github.com/ceph/teuthology/blob/master/teuthology/task/install/__init__.py>`_
+task comes first. It installs the Ceph packages on each machine (as
+defined by the ``roles`` array). A full description of the ``install``
+task is `found in the python file
+<https://github.com/ceph/teuthology/blob/master/teuthology/task/install/__init__.py>`_
+(search for "def task").
+
+The ``ceph`` task, which is documented `here
+<https://github.com/ceph/ceph/blob/master/qa/tasks/ceph.py>`__ (again,
+search for "def task"), starts OSDs and MONs (and possibly MDSs as well)
+as required by the ``roles`` array. In this example, it will start one MON
+(``mon.a``) and two OSDs (``osd.0`` and ``osd.1``), all on the same
+machine. Control moves to the next task when the Ceph cluster reaches
+``HEALTH_OK`` state.
+
+The next task is ``admin_socket`` (`source code
+<https://github.com/ceph/ceph/blob/master/qa/tasks/admin_socket.py>`_).
+The parameter of the ``admin_socket`` task (and any other task) is a
+structure which is interpreted as documented in the task. In this example
+the parameter is a set of commands to be sent to the admin socket of
+``osd.0``. The task verifies that each of them returns on success (i.e.
+exit code zero).
+
+This test can be run with
+
+.. prompt:: bash $
+
+ teuthology-suite --machine-type smithi --suite rados/singleton/all/admin-socket.yaml fs/ext4.yaml
+
+Test descriptions
+-----------------
+
+Each test has a "test description", which is similar to a directory path,
+but not the same. In the case of a standalone test, like the one in
+`Reading a standalone test`_, the test description is identical to the
+relative path (starting from the ``suites/`` directory of the
+`ceph/qa sub-directory`_) of the yaml file defining the test.
+
+Much more commonly, tests are defined not by a single yaml file, but by a
+`directory tree of yaml files`. At runtime, the tree is walked and all yaml
+files (facets) are combined into larger yaml "programs" that define the
+tests. A full listing of the yaml defining the test is included at the
+beginning of every test log.
+
+In these cases, the description of each test consists of the
+subdirectory under `suites/
+<https://github.com/ceph/ceph/tree/master/qa/suites>`_ containing the
+yaml facets, followed by an expression in curly braces (``{}``) consisting of
+a list of yaml facets in order of concatenation. For instance the
+test description::
+
+ ceph-deploy/basic/{distros/centos_7.0.yaml tasks/ceph-deploy.yaml}
+
+signifies the concatenation of two files:
+
+* ceph-deploy/basic/distros/centos_7.0.yaml
+* ceph-deploy/basic/tasks/ceph-deploy.yaml
+
+How tests are built from directories
+------------------------------------
+
+As noted in the previous section, most tests are not defined in a single
+yaml file, but rather as a `combination` of files collected from a
+directory tree within the ``suites/`` subdirectory of the `ceph/qa sub-directory`_.
+
+The set of all tests defined by a given subdirectory of ``suites/`` is
+called an "integration test suite", or a "teuthology suite".
+
+Combination of yaml facets is controlled by special files (``%`` and
+``+``) that are placed within the directory tree and can be thought of as
+operators. The ``%`` file is the "convolution" operator and ``+``
+signifies concatenation.
+
+Convolution operator
+^^^^^^^^^^^^^^^^^^^^
+
+The convolution operator, implemented as an empty file called ``%``, tells
+teuthology to construct a test matrix from yaml facets found in
+subdirectories below the directory containing the operator.
+
+For example, the `ceph-deploy suite
+<https://github.com/ceph/ceph/tree/master/qa/suites/ceph-deploy/>`_ is
+defined by the ``suites/ceph-deploy/`` tree, which consists of the files and
+subdirectories in the following structure
+
+.. code-block:: none
+
+ qa/suites/ceph-deploy
+ ├── %
+ ├── distros
+ │   ├── centos_latest.yaml
+ │   └── ubuntu_latest.yaml
+ └── tasks
+ ├── ceph-admin-commands.yaml
+ └── rbd_import_export.yaml
+
+This is interpreted as a 2x1 matrix consisting of two tests:
+
+1. ceph-deploy/basic/{distros/centos_7.0.yaml tasks/ceph-deploy.yaml}
+2. ceph-deploy/basic/{distros/ubuntu_16.04.yaml tasks/ceph-deploy.yaml}
+
+i.e. the concatenation of centos_7.0.yaml and ceph-deploy.yaml and
+the concatenation of ubuntu_16.04.yaml and ceph-deploy.yaml, respectively.
+In human terms, this means that the task found in ``ceph-deploy.yaml`` is
+intended to run on both CentOS 7.0 and Ubuntu 16.04.
+
+Without the file percent, the ``ceph-deploy`` tree would be interpreted as
+three standalone tests:
+
+* ceph-deploy/basic/distros/centos_7.0.yaml
+* ceph-deploy/basic/distros/ubuntu_16.04.yaml
+* ceph-deploy/basic/tasks/ceph-deploy.yaml
+
+(which would of course be wrong in this case).
+
+Referring to the `ceph/qa sub-directory`_, you will notice that the
+``centos_7.0.yaml`` and ``ubuntu_16.04.yaml`` files in the
+``suites/ceph-deploy/basic/distros/`` directory are implemented as symlinks.
+By using symlinks instead of copying, a single file can appear in multiple
+suites. This eases the maintenance of the test framework as a whole.
+
+All the tests generated from the ``suites/ceph-deploy/`` directory tree
+(also known as the "ceph-deploy suite") can be run with
+
+.. prompt:: bash $
+
+ teuthology-suite --machine-type smithi --suite ceph-deploy
+
+An individual test from the `ceph-deploy suite`_ can be run by adding the
+``--filter`` option
+
+.. prompt:: bash $
+
+ teuthology-suite \
+ --machine-type smithi \
+ --suite ceph-deploy/basic \
+ --filter 'ceph-deploy/basic/{distros/ubuntu_16.04.yaml tasks/ceph-deploy.yaml}'
+
+.. note:: To run a standalone test like the one in `Reading a standalone
+ test`_, ``--suite`` alone is sufficient. If you want to run a single
+ test from a suite that is defined as a directory tree, ``--suite`` must
+ be combined with ``--filter``. This is because the ``--suite`` option
+ understands POSIX relative paths only.
+
+Concatenation operator
+^^^^^^^^^^^^^^^^^^^^^^
+
+For even greater flexibility in sharing yaml files between suites, the
+special file plus (``+``) can be used to concatenate files within a
+directory. For instance, consider the `suites/rbd/thrash
+<https://github.com/ceph/ceph/tree/master/qa/suites/rbd/thrash>`_
+tree
+
+.. code-block:: none
+
+ qa/suites/rbd/thrash
+ ├── %
+ ├── clusters
+ │   ├── +
+ │   ├── fixed-2.yaml
+ │   └── openstack.yaml
+ └── workloads
+ ├── rbd_api_tests_copy_on_read.yaml
+ ├── rbd_api_tests.yaml
+ └── rbd_fsx_rate_limit.yaml
+
+This creates two tests:
+
+* rbd/thrash/{clusters/fixed-2.yaml clusters/openstack.yaml workloads/rbd_api_tests_copy_on_read.yaml}
+* rbd/thrash/{clusters/fixed-2.yaml clusters/openstack.yaml workloads/rbd_api_tests.yaml}
+
+Because the ``clusters/`` subdirectory contains the special file plus
+(``+``), all the other files in that subdirectory (``fixed-2.yaml`` and
+``openstack.yaml`` in this case) are concatenated together
+and treated as a single file. Without the special file plus, they would
+have been convolved with the files from the workloads directory to create
+a 2x2 matrix:
+
+* rbd/thrash/{clusters/openstack.yaml workloads/rbd_api_tests_copy_on_read.yaml}
+* rbd/thrash/{clusters/openstack.yaml workloads/rbd_api_tests.yaml}
+* rbd/thrash/{clusters/fixed-2.yaml workloads/rbd_api_tests_copy_on_read.yaml}
+* rbd/thrash/{clusters/fixed-2.yaml workloads/rbd_api_tests.yaml}
+
+The ``clusters/fixed-2.yaml`` file is shared among many suites to
+define the following ``roles``
+
+.. code-block:: yaml
+
+ roles:
+ - [mon.a, mon.c, osd.0, osd.1, osd.2, client.0]
+ - [mon.b, osd.3, osd.4, osd.5, client.1]
+
+The ``rbd/thrash`` suite as defined above, consisting of two tests,
+can be run with
+
+.. prompt:: bash $
+
+ teuthology-suite --machine-type smithi --suite rbd/thrash
+
+A single test from the rbd/thrash suite can be run by adding the
+``--filter`` option
+
+.. prompt:: bash $
+
+ teuthology-suite \
+ --machine-type smithi \
+ --suite rbd/thrash \
+ --filter 'rbd/thrash/{clusters/fixed-2.yaml clusters/openstack.yaml workloads/rbd_api_tests_copy_on_read.yaml}'
+
+Filtering tests by their description
+------------------------------------
+
+When a few jobs fail and need to be run again, the ``--filter`` option
+can be used to select tests with a matching description. For instance, if the
+``rados`` suite fails the `all/peer.yaml <https://github.com/ceph/ceph/blob/master/qa/suites/rados/singleton/all/peer.yaml>`_ test, the following will only
+run the tests that contain this file
+
+.. prompt:: bash $
+
+ teuthology-suite --machine-type smithi --suite rados --filter all/peer.yaml
+
+The ``--filter-out`` option does the opposite (it matches tests that do `not`
+contain a given string), and can be combined with the ``--filter`` option.
+
+Both ``--filter`` and ``--filter-out`` take a comma-separated list of strings
+(which means the comma character is implicitly forbidden in filenames found in
+the `ceph/qa sub-directory`_). For instance
+
+.. prompt:: bash $
+
+ teuthology-suite --machine-type smithi --suite rados --filter all/peer.yaml,all/rest-api.yaml
+
+will run tests that contain either
+`all/peer.yaml <https://github.com/ceph/ceph/blob/master/qa/suites/rados/singleton/all/peer.yaml>`_
+or
+`all/rest-api.yaml <https://github.com/ceph/ceph/blob/master/qa/suites/rados/singleton/all/rest-api.yaml>`_
+
+Each string is looked up anywhere in the test description and has to
+be an exact match: they are not regular expressions.
+
+Reducing the number of tests
+----------------------------
+
+The ``rados`` suite generates tens or even hundreds of thousands of tests out
+of a few hundred files. This happens because teuthology constructs test
+matrices from subdirectories wherever it encounters a file named ``%``. For
+instance, all tests in the `rados/basic suite
+<https://github.com/ceph/ceph/tree/master/qa/suites/rados/basic>`_ run with
+different messenger types: ``simple``, ``async`` and ``random``, because they
+are combined (via the special file ``%``) with the `msgr directory
+<https://github.com/ceph/ceph/tree/master/qa/suites/rados/basic/msgr>`_
+
+All integration tests are required to be run before a Ceph release is
+published. When merely verifying whether a contribution can be merged without
+risking a trivial regression, it is enough to run a subset. The ``--subset``
+option can be used to reduce the number of tests that are triggered. For
+instance
+
+.. prompt:: bash $
+
+ teuthology-suite --machine-type smithi --suite rados --subset 0/4000
+
+will run as few tests as possible. The tradeoff in this case is that
+not all combinations of test variations will together,
+but no matter how small a ratio is provided in the ``--subset``,
+teuthology will still ensure that all files in the suite are in at
+least one test. Understanding the actual logic that drives this
+requires reading the teuthology source code.
+
+The ``--limit`` option only runs the first ``N`` tests in the suite:
+this is rarely useful, however, because there is no way to control which
+test will be first.
+
+.. _ceph/qa sub-directory: https://github.com/ceph/ceph/tree/master/qa
+.. _Sepia Lab: https://wiki.sepia.ceph.com/doku.php
+.. _teuthology repository: https://github.com/ceph/teuthology
+.. _teuthology framework: https://github.com/ceph/teuthology
diff --git a/doc/dev/developer_guide/tests-unit-tests.rst b/doc/dev/developer_guide/tests-unit-tests.rst
new file mode 100644
index 000000000..72d724d98
--- /dev/null
+++ b/doc/dev/developer_guide/tests-unit-tests.rst
@@ -0,0 +1,177 @@
+Testing - unit tests
+====================
+
+The Ceph GitHub repository has two types of tests: unit tests (also called
+``make check`` tests) and integration tests. Strictly speaking, the
+``make check`` tests are not "unit tests", but rather tests that can be run
+easily on a single build machine after compiling Ceph from source, whereas
+integration tests require package installation and multi-machine clusters to
+run.
+
+.. _make-check:
+
+What does "make check" mean?
+----------------------------
+
+After compiling Ceph, the code can be run through a battery of tests. For
+historical reasons, this is often referred to as ``make check`` even though
+the actual command used to run the tests is now ``ctest``. To be included in
+this group of tests, a test must:
+
+* bind ports that do not conflict with other tests
+* not require root access
+* not require more than one machine to run
+* complete within a few minutes
+
+For the sake of simplicity, this class of tests is referred to as "make
+check tests" or "unit tests". This is meant to distinguish these tests from
+the more complex "integration tests" that are run via the `teuthology
+framework`_.
+
+While it is possible to run ``ctest`` directly, it can be tricky to correctly
+set up your environment for it. Fortunately, there is a script that makes it
+easy to run the unit tests on your code. This script can be run from the
+top-level directory of the Ceph source tree by invoking:
+
+ .. prompt:: bash $
+
+ ./run-make-check.sh
+
+You will need a minimum of 8GB of RAM and 32GB of free drive space for this
+command to complete successfully on x86_64 architectures; other architectures
+may have different requirements. Depending on your hardware, it can take from
+twenty minutes to three hours to complete.
+
+
+How unit tests are declared
+---------------------------
+
+Unit tests are declared in the ``CMakeLists.txt`` file, which is found in the
+``./src`` directory. The ``add_ceph_test`` and ``add_ceph_unittest`` CMake
+functions are used to declare unit tests. ``add_ceph_test`` and
+``add_ceph_unittest`` are themselves defined in
+``./cmake/modules/AddCephTest.cmake``.
+
+Some unit tests are scripts and other unit tests are binaries that are
+compiled during the build process.
+
+* ``add_ceph_test`` function - used to declare unit test scripts
+* ``add_ceph_unittest`` function - used for unit test binaries
+
+Unit testing of CLI tools
+-------------------------
+Some of the CLI tools are tested using special files ending with the extension
+``.t`` and stored under ``./src/test/cli``. These tests are run using a tool
+called `cram`_ via a shell script called ``./src/test/run-cli-tests``.
+`cram`_ tests that are not suitable for ``make check`` can also be run by
+teuthology using the `cram task`_.
+
+.. _`cram`: https://bitheap.org/cram/
+.. _`cram task`: https://github.com/ceph/ceph/blob/master/qa/tasks/cram.py
+
+Tox-based testing of Python modules
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Some of the Python modules in Ceph use `tox <https://tox.readthedocs.io/en/latest/>`_
+to run their unit tests.
+
+Most of these Python modules can be found in the directory ``./src/pybind/``.
+
+Currently (December 2020) the following modules use **tox**:
+
+* Cephadm (``./src/cephadm/tox.ini``)
+* Ceph Manager Python API (``./src/pybind/mgr``)
+
+ * ``./src/pybind/mgr/tox.ini``
+
+ * ``./src/pybind/mgr/dashboard/tox.ini``
+
+ * ``./src/pybind/tox.ini``
+
+* Dashboard (``./src/pybind/mgr/dashboard``)
+* Python common (``./src/python-common/tox.ini``)
+* CephFS (``./src/tools/cephfs/tox.ini``)
+* ceph-volume
+
+ * ``./src/ceph-volume/tox.ini``
+
+ * ``./src/ceph-volume/plugin/zfs/tox.ini``
+
+ * ``./src/ceph-volume/ceph_volume/tests/functional/batch/tox.ini``
+
+ * ``./src/ceph-volume/ceph_volume/tests/functional/simple/tox.ini``
+
+ * ``./src/ceph-volume/ceph_volume/tests/functional/lvm/tox.ini``
+
+Configuring Tox environments and tasks
+""""""""""""""""""""""""""""""""""""""
+Most tox configurations support multiple environments and tasks.
+
+The list of environments and tasks that are supported is in the ``tox.ini``
+file, under ``envlist``. For example, here are the first three lines of
+``./src/cephadm/tox.ini``::
+
+ [tox]
+ envlist = py3, mypy
+ skipsdist=true
+
+In this example, the ``Python 3`` and ``mypy`` environments are specified.
+
+The list of environments can be retrieved with the following command:
+
+ .. prompt:: bash $
+
+ tox --list
+
+Or:
+
+ .. prompt:: bash $
+
+ tox -l
+
+Running Tox
+"""""""""""
+To run **tox**, just execute ``tox`` in the directory containing
+``tox.ini``. If you do not specify any environments (for example, ``-e
+$env1,$env2``), then ``tox`` will run all environments. Jenkins will run
+``tox`` by executing ``./src/script/run_tox.sh``.
+
+Here are some examples from Ceph Dashboard that show how to specify different
+environments and run options::
+
+ ## Run Python 2+3 tests+lint commands:
+ $ tox -e py27,py3,lint,check
+
+ ## Run Python 3 tests+lint commands:
+ $ tox -e py3,lint,check
+
+ ## To run it as Jenkins would:
+ $ ../../../script/run_tox.sh --tox-env py3,lint,check
+
+Manager core unit tests
+"""""""""""""""""""""""
+
+Currently only doctests_ inside ``mgr_util.py`` are run.
+
+To add more files to be tested inside the core of the manager, open the
+``tox.ini`` file and add the files to be tested at the end of the line that
+includes ``mgr_util.py``.
+
+.. _doctests: https://docs.python.org/3/library/doctest.html
+
+Unit test caveats
+-----------------
+
+#. Unlike the various Ceph daemons and ``ceph-fuse``, the unit tests are
+ linked against the default memory allocator (glibc) unless they are
+ explicitly linked against something else. This enables tools such as
+ **valgrind** to be used in the tests.
+
+#. Google Test unit testing library hides the client output from the shell.
+ In order to debug the client after setting the desired debug level
+ (e.g ``ceph config set client debug_rbd 20``), the debug log file can
+ be found at ``build/out/client.admin.<pid>.log``.
+ This can also be handy when examining teuthology failed unit test
+ jobs, the job's debug level can be set at the relevant yaml file.
+
+.. _make check:
+.. _teuthology framework: https://github.com/ceph/teuthology
diff --git a/doc/dev/development-workflow.rst b/doc/dev/development-workflow.rst
new file mode 100644
index 000000000..dfcab929d
--- /dev/null
+++ b/doc/dev/development-workflow.rst
@@ -0,0 +1,248 @@
+=====================
+Development workflows
+=====================
+
+This page explains the workflows a developer is expected to follow to
+implement the goals that are part of the Ceph release cycle. It does not
+go into technical details and is designed to provide a high level view
+instead. Each chapter is about a given goal such as ``Merging bug
+fixes or features`` or ``Publishing point releases and backporting``.
+
+A key aspect of all workflows is that none of them blocks another. For
+instance, a bug fix can be backported and merged to a stable branch
+while the next point release is being published. For that specific
+example to work, a branch should be created to avoid any
+interference. In practice it is not necessary for Ceph because:
+
+* there are few people involved
+* the frequency of backports is not too high
+* the reviewers, who know a release is being published, are unlikely
+ to merge anything that may cause issues
+
+This ad-hoc approach implies the workflows are changed on a regular
+basis to adapt. For instance, ``quality engineers`` were not involved
+in the workflow to publish ``dumpling`` point releases. The number of
+commits being backported to ``firefly`` made it impractical for developers
+tasked to write code or fix bugs to also run and verify the full suite
+of integration tests. Inserting ``quality engineers`` makes it
+possible for someone to participate in the workflow by analyzing test
+results.
+
+The workflows are not enforced when they impose an overhead that does
+not make sense. For instance, if the release notes for a point release
+were not written prior to checking all integration tests, they can be
+committed to the stable branch and the result sent for publication
+without going through another run of integration tests.
+
+Release Cycle
+=============
+
+::
+
+ Ceph hammer infernalis
+ Developer CDS CDS
+ Summit | |
+ | |
+ development | |
+ release | v0.88 v0.89 v0.90 ... | v9.0.0
+ --v--^----^--v---^------^--v- ---v----^----^--- 2015
+ | | | |
+ stable giant | | hammer
+ release v0.87 | | v0.94
+ | |
+ point firefly dumpling
+ release v0.80.8 v0.67.12
+
+
+Four times a year, the development roadmap is discussed online during
+the `Ceph Developer Summit <http://tracker.ceph.com/projects/ceph/wiki/Planning#Ceph-Developer-Summit>`_. A
+new stable release (hammer, infernalis, jewel ...) is published at the same
+frequency. Every other release (firefly, hammer, jewel...) is a `Long Term
+Stable (LTS) <../../releases>`_. See `Understanding the release cycle
+<../../releases#understanding-the-release-cycle>`_ for more information.
+
+Merging bug fixes or features
+=============================
+
+The development branch is ``master`` and the workflow followed by all
+developers can be summarized as follows:
+
+* The developer prepares a series of commits
+* The developer submits the series of commits via a pull request
+* A reviewer is assigned the pull request
+* When the pull request looks good to the reviewer, it is merged into
+ an integration branch by the tester
+* After a successful run of integration tests, the pull request is
+ merged by the tester
+
+The ``developer`` is the author of a series of commits. The
+``reviewer`` is responsible for providing feedback to the developer on
+a regular basis and the developer is invited to ping the reviewer if
+nothing happened after a week. After the ``reviewer`` is satisfied
+with the pull request, (s)he passes it to the ``tester``. The
+``tester`` is responsible for running teuthology integration tests on
+the pull request. If nothing happens within a month the ``reviewer`` is
+invited to ping the ``tester``.
+
+Resolving bug reports and implementing features
+===============================================
+
+All bug reports and feature requests are in the `issue tracker
+<http://tracker.ceph.com>`_ and the workflow can be summarized as
+follows:
+
+* The reporter creates the issue with priority ``Normal``
+* A developer may pick the issue right away
+* During a bi-weekly bug scrub, the team goes over all new issue and
+ assign them a priority
+* The bugs with higher priority are worked on first
+
+Each ``team`` is responsible for a project, managed by :ref:`leads <governance>`.
+
+The ``developer`` assigned to an issue is responsible for it. The
+status of an open issue can be:
+
+* ``New``: it is unclear if the issue needs work.
+* ``Verified``: the bug can be reproduced or showed up multiple times
+* ``In Progress``: the developer is working on it this week
+* ``Pending Backport``: the fix needs to be backported to the stable
+ releases listed in the backport field
+
+For each ``Pending Backport`` issue, there exists at least one issue
+in the ``Backport`` tracker to record the work done to cherry pick the
+necessary commits from the master branch to the target stable branch.
+See `the backporter manual
+<http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO>`_ for more
+information.
+
+Running and interpreting teuthology integration tests
+=====================================================
+
+The :doc:`/dev/sepia` runs `teuthology
+<https://github.com/ceph/teuthology/>`_ integration tests `on a regular basis <http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO_monitor_the_automated_tests_AKA_nightlies#Automated-tests-AKA-nightlies>`_ and the
+results are posted on `pulpito <http://pulpito.ceph.com/>`_ and the
+`ceph-qa mailing list <https://ceph.com/irc/>`_.
+
+* The job failures are `analyzed by quality engineers and developers
+ <http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO_monitor_the_automated_tests_AKA_nightlies#List-of-suites-and-watchers>`_
+* If the cause is environmental (e.g. network connectivity), an issue
+ is created in the `sepia lab project
+ <http://tracker.ceph.com/projects/lab/issues/new>`_
+* If the bug is known, a pulpito URL to the failed job is added to the issue
+* If the bug is new, an issue is created
+
+The ``quality engineer`` is either a developer or a member of the QE
+team. There is at least one integration test suite per project:
+
+* `rgw <https://github.com/ceph/ceph/tree/master/qa/suites/rgw>`_ suite
+* `CephFS <https://github.com/ceph/ceph/tree/master/qa/suites/fs>`_ suite
+* `rados <https://github.com/ceph/ceph/tree/master/qa/suites/rados>`_ suite
+* `rbd <https://github.com/ceph/ceph/tree/master/qa/suites/rbd>`_ suite
+
+and many others such as
+
+* `upgrade <https://github.com/ceph/ceph/tree/master/qa/suites/upgrade>`_ suites
+* `power-cyle <https://github.com/ceph/ceph/tree/master/qa/suites/powercycle>`_ suite
+* ...
+
+Preparing a new release
+=======================
+
+A release is prepared in a dedicated branch, different from the
+``master`` branch.
+
+* For a stable releases it is the branch matching the release code
+ name (dumpling, firefly, etc.)
+* For a development release it is the ``next`` branch
+
+The workflow expected of all developers to stabilize the release
+candidate is the same as the normal development workflow with the
+following differences:
+
+* The pull requests must target the stable branch or next instead of
+ master
+* The reviewer rejects pull requests that are not bug fixes
+* The ``Backport`` issues matching a teuthology test failure and set
+ with priority ``Urgent`` must be fixed before the release
+
+Cutting a new stable release
+============================
+
+A new stable release can be cut when:
+
+* all ``Backport`` issues with priority ``Urgent`` are fixed
+* integration and upgrade tests run successfully
+
+Publishing a new stable release implies a risk of regression or
+discovering new bugs during the upgrade, no matter how carefully it is
+tested. The decision to cut a release must take this into account: it
+may not be wise to publish a stable release that only fixes a few
+minor bugs. For instance if only one commit has been backported to a
+stable release that is not a LTS, it is better to wait until there are
+more.
+
+When a stable release is to be retired, it may be safer to
+recommend an upgrade to the next LTS release instead of
+proposing a new point release to fix a problem. For instance, the
+``dumpling`` v0.67.11 release has bugs related to backfilling which have
+been fixed in ``firefly`` v0.80.x. A backport fixing these backfilling
+bugs has been tested in the draft point release ``dumpling`` v0.67.12 but
+they are large enough to introduce a risk of regression. As ``dumpling``
+is to be retired, users suffering from this bug can
+upgrade to ``firefly`` to fix it. Unless users manifest themselves and ask
+for ``dumpling`` v0.67.12, this draft release may never be published.
+
+* The ``Ceph lead`` decides a new stable release must be published
+* The ``release master`` gets approval from all leads
+* The ``release master`` writes and commits the release notes
+* The ``release master`` informs the ``quality engineer`` that the
+ branch is ready for testing
+* The ``quality engineer`` runs additional integration tests
+* If the ``quality engineer`` discovers new bugs that require an
+ ``Urgent Backport``, the release goes back to being prepared, it
+ was not ready after all
+* The ``quality engineer`` informs the ``publisher`` that the branch
+ is ready for release
+* The ``publisher`` `creates the packages and sets the release tag
+ <../release-process>`_
+
+The person responsible for each role is:
+
+* Sage Weil is the ``Ceph lead``
+* Sage Weil is the ``release master`` for major stable releases
+ (``firefly`` 0.80, ``hammer`` 0.94 etc.)
+* Loic Dachary is the ``release master`` for stable point releases
+ (``firefly`` 0.80.10, ``hammer`` 0.94.1 etc.)
+* Yuri Weinstein is the ``quality engineer``
+* Alfredo Deza is the ``publisher``
+
+Cutting a new development release
+=================================
+
+The publication workflow of a development release is the same as
+preparing a new release and cutting it, with the following
+differences:
+
+* The ``next`` branch is reset to the tip of ``master`` after
+ publication
+* The ``quality engineer`` is not required to run additional tests,
+ the ``release master`` directly informs the ``publisher`` that the
+ release is ready to be published.
+
+Publishing point releases and backporting
+=========================================
+
+The publication workflow of the point releases is the same as
+preparing a new release and cutting it, with the following
+differences:
+
+* The ``backport`` field of each issue contains the code name of the
+ stable release
+* There is exactly one issue in the ``Backport`` tracker for each
+ stable release to which the issue is backported
+* All commits are cherry-picked with ``git cherry-pick -x`` to
+ reference the original commit
+
+See `the backporter manual
+<http://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO>`_ for more
+information.
diff --git a/doc/dev/documenting.rst b/doc/dev/documenting.rst
new file mode 100644
index 000000000..c1a11890e
--- /dev/null
+++ b/doc/dev/documenting.rst
@@ -0,0 +1,144 @@
+==================
+ Documenting Ceph
+==================
+
+User documentation
+==================
+
+The documentation on docs.ceph.com is generated from the restructuredText
+sources in ``/doc/`` in the Ceph git repository.
+
+Please make sure that your changes are written in a way that is intended
+for end users of the software, unless you are making additions in
+``/doc/dev/``, which is the section for developers.
+
+All pull requests that modify user-facing functionality must
+include corresponding updates to documentation: see
+`Submitting Patches`_ for more detail.
+
+Check your .rst syntax is working as expected by using the "View"
+button in the github user interface when looking at a diff on
+an .rst file, or build the docs locally using the ``admin/build-doc``
+script.
+
+For more information about the Ceph documentation, see
+:doc:`/start/documenting-ceph`.
+
+Code Documentation
+==================
+
+C and C++ can be documented with Doxygen_, using the subset of Doxygen
+markup supported by Breathe_.
+
+.. _Doxygen: http://www.doxygen.nl/
+.. _Breathe: https://github.com/michaeljones/breathe
+
+The general format for function documentation is::
+
+ /**
+ * Short description
+ *
+ * Detailed description when necessary
+ *
+ * preconditons, postconditions, warnings, bugs or other notes
+ *
+ * parameter reference
+ * return value (if non-void)
+ */
+
+This should be in the header where the function is declared, and
+functions should be grouped into logical categories. The `librados C
+API`_ provides a complete example. It is pulled into Sphinx by
+`librados.rst`_, which is rendered at :doc:`/rados/api/librados`.
+
+To generate the doxygen documentation in HTML format use:
+
+::
+
+ # make doxygen
+
+HTML output will be under: ``build-doc/doxygen/html``
+
+.. _`librados C API`: https://github.com/ceph/ceph/blob/master/src/include/rados/librados.h
+.. _`librados.rst`: https://github.com/ceph/ceph/raw/master/doc/rados/api/librados.rst
+
+Drawing diagrams
+================
+
+Graphviz
+--------
+
+You can use Graphviz_, as explained in the `Graphviz extension documentation`_.
+
+.. _Graphviz: http://graphviz.org/
+.. _`Graphviz extension documentation`: https://www.sphinx-doc.org/en/master/usage/extensions/graphviz.html
+
+.. graphviz::
+
+ digraph "example" {
+ foo -> bar;
+ bar -> baz;
+ bar -> th
+ }
+
+Most of the time, you'll want to put the actual DOT source in a
+separate file, like this::
+
+ .. graphviz:: myfile.dot
+
+See the `Dot User's Manual <https://www.graphviz.org/pdf/dotguide.pdf>`_ by
+Emden R. Gansner, Eleftherios Koutsofios, and Stephen North for examples of
+digraphs. This is especially useful if this is your first time encountering
+GraphViz.
+
+Ditaa
+-----
+
+You can use Ditaa_:
+
+.. _Ditaa: http://ditaa.sourceforge.net/
+
+.. ditaa::
+
+ +--------------+ /=----\
+ | hello, world |-->| hi! |
+ +--------------+ \-----/
+
+
+Blockdiag
+---------
+
+If a use arises, we can integrate Blockdiag_. It is a Graphviz-style
+declarative language for drawing things, and includes:
+
+- `block diagrams`_: boxes and arrows (automatic layout, as opposed to
+ Ditaa_)
+- `sequence diagrams`_: timelines and messages between them
+- `activity diagrams`_: subsystems and activities in them
+- `network diagrams`_: hosts, LANs, IP addresses etc (with `Cisco
+ icons`_ if wanted)
+
+.. _Blockdiag: http://blockdiag.com/en/
+.. _`Cisco icons`: https://pypi.org/project/blockdiagcontrib-cisco/
+.. _`block diagrams`: http://blockdiag.com/en/blockdiag/
+.. _`sequence diagrams`: http://blockdiag.com/en/seqdiag/index.html
+.. _`activity diagrams`: http://blockdiag.com/en/actdiag/index.html
+.. _`network diagrams`: http://blockdiag.com/en/nwdiag/
+
+
+Inkscape
+--------
+
+You can use Inkscape to generate scalable vector graphics.
+https://inkscape.org/en/ for restructedText documents.
+
+If you generate diagrams with Inkscape, you should
+commit both the Scalable Vector Graphics (SVG) file and export a
+Portable Network Graphic (PNG) file. Reference the PNG file.
+
+By committing the SVG file, others will be able to update the
+SVG diagrams using Inkscape.
+
+HTML5 will support SVG inline.
+
+.. _`Submitting Patches`: https://github.com/ceph/ceph/blob/master/SubmittingPatches.rst
diff --git a/doc/dev/encoding.rst b/doc/dev/encoding.rst
new file mode 100644
index 000000000..398c85d95
--- /dev/null
+++ b/doc/dev/encoding.rst
@@ -0,0 +1,95 @@
+
+Serialization (encode/decode)
+=============================
+
+When a structure is sent over the network or written to disk, it is
+encoded into a string of bytes. Serializable structures have
+``encode`` and ``decode`` methods that write and read from ``bufferlist``
+objects representing byte strings.
+
+Adding a field to a structure
+-----------------------------
+
+You can see examples of this all over the Ceph code, but here's an
+example:
+
+::
+
+ class AcmeClass
+ {
+ int member1;
+ std::string member2;
+
+ void encode(bufferlist &bl)
+ {
+ ENCODE_START(1, 1, bl);
+ ::encode(member1, bl);
+ ::encode(member2, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &bl)
+ {
+ DECODE_START(1, bl);
+ ::decode(member1, bl);
+ ::decode(member2, bl);
+ DECODE_FINISH(bl);
+ }
+ };
+
+The ``ENCODE_START`` macro writes a header that specifies a *version* and
+a *compat_version* (both initially 1). The message version is incremented
+whenever a change is made to the encoding. The compat_version is incremented
+only if the change will break existing decoders -- decoders are tolerant
+of trailing bytes, so changes that add fields at the end of the structure
+do not require incrementing compat_version.
+
+The ``DECODE_START`` macro takes an argument specifying the most recent
+message version that the code can handle. This is compared with the
+compat_version encoded in the message, and if the message is too new then
+an exception will be thrown. Because changes to compat_verison are rare,
+this isn't usually something to worry about when adding fields.
+
+In practice, changes to encoding usually involve simply adding the desired fields
+at the end of the ``encode`` and ``decode`` functions, and incrementing
+the versions in ``ENCODE_START`` and ``DECODE_START``. For example, here's how
+to add a third field to ``AcmeClass``:
+
+::
+
+ class AcmeClass
+ {
+ int member1;
+ std::string member2;
+ std::vector<std::string> member3;
+
+ void encode(bufferlist &bl)
+ {
+ ENCODE_START(2, 1, bl);
+ ::encode(member1, bl);
+ ::encode(member2, bl);
+ ::encode(member3, bl);
+ ENCODE_FINISH(bl);
+ }
+
+ void decode(bufferlist::iterator &bl)
+ {
+ DECODE_START(2, bl);
+ ::decode(member1, bl);
+ ::decode(member2, bl);
+ if (struct_v >= 2) {
+ ::decode(member3, bl);
+ }
+ DECODE_FINISH(bl);
+ }
+ };
+
+Note that the compat_version did not change because the encoded message
+will still be decodable by versions of the code that only understand
+version 1 -- they will just ignore the trailing bytes where we encode ``member3``.
+
+In the ``decode`` function, decoding the new field is conditional: this is
+because we might still be passed older-versioned messages that do not
+have the field. The ``struct_v`` variable is a local set by the ``DECODE_START``
+macro.
+
diff --git a/doc/dev/erasure-coded-pool.rst b/doc/dev/erasure-coded-pool.rst
new file mode 100644
index 000000000..8ad697702
--- /dev/null
+++ b/doc/dev/erasure-coded-pool.rst
@@ -0,0 +1,135 @@
+Erasure Coded pool
+==================
+
+Purpose
+-------
+
+Erasure-coded pools require less storage space compared to replicated
+pools. The erasure-coding support has higher computational requirements and
+only supports a subset of the operations allowed on an object (for instance,
+partial write is not supported).
+
+Use cases
+---------
+
+Cold storage
+~~~~~~~~~~~~
+
+An erasure-coded pool is created to store a large number of 1GB
+objects (imaging, genomics, etc.) and 10% of them are read per
+month. New objects are added every day and the objects are not
+modified after being written. On average there is one write for 10,000
+reads.
+
+A replicated pool is created and set as a cache tier for the
+erasure coded pool. An agent demotes objects (i.e. moves them from the
+replicated pool to the erasure-coded pool) if they have not been
+accessed in a week.
+
+The erasure-coded pool CRUSH rule targets hardware designed for
+cold storage with high latency and slow access time. The replicated
+pool CRUSH rule targets faster hardware to provide better response
+times.
+
+Cheap multidatacenter storage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Ten datacenters are connected with dedicated network links. Each
+datacenter contains the same amount of storage with no power-supply
+backup and no air-cooling system.
+
+An erasure-coded pool is created with a CRUSH rule that will
+ensure no data loss if at most three datacenters fail
+simultaneously. The overhead is 50% with erasure code configured to
+split data in six (k=6) and create three coding chunks (m=3). With
+replication the overhead would be 400% (four replicas).
+
+Interface
+---------
+
+Set up an erasure-coded pool::
+
+ $ ceph osd pool create ecpool erasure
+
+Set up an erasure-coded pool and the associated CRUSH rule ``ecrule``::
+
+ $ ceph osd crush rule create-erasure ecrule
+ $ ceph osd pool create ecpool erasure default ecrule
+
+Set the CRUSH failure domain to osd (instead of host, which is the default)::
+
+ $ ceph osd erasure-code-profile set myprofile \
+ crush-failure-domain=osd
+ $ ceph osd erasure-code-profile get myprofile
+ k=2
+ m=2
+ plugin=jerasure
+ technique=reed_sol_van
+ crush-failure-domain=osd
+ $ ceph osd pool create ecpool erasure myprofile
+
+Control the parameters of the erasure code plugin::
+
+ $ ceph osd erasure-code-profile set myprofile \
+ k=3 m=2
+ $ ceph osd erasure-code-profile get myprofile
+ k=3
+ m=2
+ plugin=jerasure
+ technique=reed_sol_van
+ $ ceph osd pool create ecpool erasure myprofile
+
+Choose an alternate erasure code plugin::
+
+ $ ceph osd erasure-code-profile set myprofile \
+ plugin=example technique=xor
+ $ ceph osd erasure-code-profile get myprofile
+ k=2
+ m=2
+ plugin=example
+ technique=xor
+ $ ceph osd pool create ecpool 12 12 erasure \
+ myprofile
+
+Display the default erasure code profile::
+
+ $ ceph osd erasure-code-profile ls
+ default
+ $ ceph osd erasure-code-profile get default
+ k=2
+ m=2
+ plugin=jerasure
+ technique=reed_sol_van
+
+Create a profile to set the data to be distributed on six OSDs (k+m=6) and sustain the loss of three OSDs (m=3) without losing data::
+
+ $ ceph osd erasure-code-profile set myprofile k=3 m=3
+ $ ceph osd erasure-code-profile get myprofile
+ k=3
+ m=3
+ plugin=jerasure
+ technique=reed_sol_van
+ $ ceph osd erasure-code-profile ls
+ default
+ myprofile
+
+Remove a profile that is no longer in use (otherwise it will fail with EBUSY)::
+
+ $ ceph osd erasure-code-profile ls
+ default
+ myprofile
+ $ ceph osd erasure-code-profile rm myprofile
+ $ ceph osd erasure-code-profile ls
+ default
+
+Set the rule to ssd (instead of default)::
+
+ $ ceph osd erasure-code-profile set myprofile \
+ crush-root=ssd
+ $ ceph osd erasure-code-profile get myprofile
+ k=2
+ m=2
+ plugin=jerasure
+ technique=reed_sol_van
+ crush-root=ssd
+
diff --git a/doc/dev/file-striping.rst b/doc/dev/file-striping.rst
new file mode 100644
index 000000000..405c9718d
--- /dev/null
+++ b/doc/dev/file-striping.rst
@@ -0,0 +1,161 @@
+File striping
+=============
+
+The text below describes how files from Ceph file system clients are
+stored across objects stored in RADOS.
+
+ceph_file_layout
+----------------
+
+Ceph distributes (stripes) the data for a given file across a number
+of underlying objects. The way file data is mapped to those objects
+is defined by the ceph_file_layout structure. The data distribution
+is a modified RAID 0, where data is striped across a set of objects up
+to a (per-file) fixed size, at which point another set of objects
+holds the file's data. The second set also holds no more than the
+fixed amount of data, and then another set is used, and so on.
+
+Defining some terminology will go a long way toward explaining the
+way file data is laid out across Ceph objects.
+
+- file
+ A collection of contiguous data, named from the perspective of
+ the Ceph client (i.e., a file on a Linux system using Ceph
+ storage). The data for a file is divided into fixed-size
+ "stripe units," which are stored in ceph "objects."
+- stripe unit
+ The size (in bytes) of a block of data used in the RAID 0
+ distribution of a file. All stripe units for a file have equal
+ size. The last stripe unit is typically incomplete--i.e. it
+ represents the data at the end of the file as well as unused
+ "space" beyond it up to the end of the fixed stripe unit size.
+- stripe count
+ The number of consecutive stripe units that constitute a RAID 0
+ "stripe" of file data.
+- stripe
+ A contiguous range of file data, RAID 0 striped across "stripe
+ count" objects in fixed-size "stripe unit" blocks.
+- object
+ A collection of data maintained by Ceph storage. Objects are
+ used to hold portions of Ceph client files.
+- object set
+ A set of objects that together represent a contiguous portion of
+ a file.
+
+Three fields in the ceph_file_layout structure define this mapping::
+
+ u32 fl_stripe_unit;
+ u32 fl_stripe_count;
+ u32 fl_object_size;
+
+(They are actually maintained in their on-disk format, __le32.)
+
+The role of the first two fields should be clear from the
+definitions above.
+
+The third field is the maximum size (in bytes) of an object used to
+back file data. The object size is a multiple of the stripe unit.
+
+A file's data is blocked into stripe units, and consecutive stripe
+units are stored on objects in an object set. The number of objects
+in a set is the same as the stripe count. No object storing file
+data will exceed the file's designated object size, so after some
+fixed number of complete stripes, a new object set is used to store
+subsequent file data.
+
+Note that by default, Ceph uses a simple striping strategy in which
+object_size equals stripe_unit and stripe_count is 1. This simply
+puts one stripe_unit in each object.
+
+Here's a more complex example::
+
+ file size = 1 trillion = 1000000000000 bytes
+
+ fl_stripe_unit = 64KB = 65536 bytes
+ fl_stripe_count = 5 stripe units per stripe
+ fl_object_size = 64GB = 68719476736 bytes
+
+This means::
+
+ file stripe size = 64KB * 5 = 320KB = 327680 bytes
+ each object holds 64GB / 64KB = 1048576 stripe units
+ file object set size = 64GB * 5 = 320GB = 343597383680 bytes
+ (also 1048576 stripe units * 327680 bytes per stripe unit)
+
+So the file's 1 trillion bytes can be divided into complete object
+sets, then complete stripes, then complete stripe units, and finally
+a single incomplete stripe unit::
+
+ - 1 trillion bytes / 320GB per object set = 2 complete object sets
+ (with 312805232640 bytes remaining)
+ - 312805232640 bytes / 320KB per stripe = 954605 complete stripes
+ (with 266240 bytes remaining)
+ - 266240 bytes / 64KB per stripe unit = 4 complete stripe units
+ (with 4096 bytes remaining)
+ - and the final incomplete stripe unit holds those 4096 bytes.
+
+The ASCII art below attempts to capture this::
+
+ _________ _________ _________ _________ _________
+ /object 0\ /object 1\ /object 2\ /object 3\ /object 4\
+ +=========+ +=========+ +=========+ +=========+ +=========+
+ | stripe | | stripe | | stripe | | stripe | | stripe |
+ o | unit | | unit | | unit | | unit | | unit | stripe 0
+ b | 0 | | 1 | | 2 | | 3 | | 4 |
+ j |---------| |---------| |---------| |---------| |---------|
+ e | stripe | | stripe | | stripe | | stripe | | stripe |
+ c | unit | | unit | | unit | | unit | | unit | stripe 1
+ t | 5 | | 6 | | 7 | | 8 | | 9 |
+ |---------| |---------| |---------| |---------| |---------|
+ s | . | | . | | . | | . | | . |
+ e . . . . .
+ t | . | | . | | . | | . | | . |
+ |---------| |---------| |---------| |---------| |---------|
+ 0 | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ | unit | | unit | | unit | | unit | | unit | 1048575
+ | 5242875 | | 5242876 | | 5242877 | | 5242878 | | 5242879 |
+ \=========/ \=========/ \=========/ \=========/ \=========/
+
+ _________ _________ _________ _________ _________
+ /object 5\ /object 6\ /object 7\ /object 8\ /object 9\
+ +=========+ +=========+ +=========+ +=========+ +=========+
+ | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ o | unit | | unit | | unit | | unit | | unit | 1048576
+ b | 5242880 | | 5242881 | | 5242882 | | 5242883 | | 5242884 |
+ j |---------| |---------| |---------| |---------| |---------|
+ e | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ c | unit | | unit | | unit | | unit | | unit | 1048577
+ t | 5242885 | | 5242886 | | 5242887 | | 5242888 | | 5242889 |
+ |---------| |---------| |---------| |---------| |---------|
+ s | . | | . | | . | | . | | . |
+ e . . . . .
+ t | . | | . | | . | | . | | . |
+ |---------| |---------| |---------| |---------| |---------|
+ 1 | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ | unit | | unit | | unit | | unit | | unit | 2097151
+ | 10485755| | 10485756| | 10485757| | 10485758| | 10485759|
+ \=========/ \=========/ \=========/ \=========/ \=========/
+
+ _________ _________ _________ _________ _________
+ /object 10\ /object 11\ /object 12\ /object 13\ /object 14\
+ +=========+ +=========+ +=========+ +=========+ +=========+
+ | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ o | unit | | unit | | unit | | unit | | unit | 2097152
+ b | 10485760| | 10485761| | 10485762| | 10485763| | 10485764|
+ j |---------| |---------| |---------| |---------| |---------|
+ e | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ c | unit | | unit | | unit | | unit | | unit | 2097153
+ t | 10485765| | 10485766| | 10485767| | 10485768| | 10485769|
+ |---------| |---------| |---------| |---------| |---------|
+ s | . | | . | | . | | . | | . |
+ e . . . . .
+ t | . | | . | | . | | . | | . |
+ |---------| |---------| |---------| |---------| |---------|
+ 2 | stripe | | stripe | | stripe | | stripe | | stripe | stripe
+ | unit | | unit | | unit | | unit | | unit | 3051756
+ | 15258780| | 15258781| | 15258782| | 15258783| | 15258784|
+ |---------| |---------| |---------| |---------| |---------|
+ | stripe | | stripe | | stripe | | stripe | | (partial| (partial
+ | unit | | unit | | unit | | unit | | stripe | stripe
+ | 15258785| | 15258786| | 15258787| | 15258788| | unit) | 3051757)
+ \=========/ \=========/ \=========/ \=========/ \=========/
diff --git a/doc/dev/freebsd.rst b/doc/dev/freebsd.rst
new file mode 100644
index 000000000..b1645b873
--- /dev/null
+++ b/doc/dev/freebsd.rst
@@ -0,0 +1,53 @@
+==============================
+FreeBSD Implementation details
+==============================
+
+
+Disk layout
+-----------
+
+Current implementation works on ZFS pools
+
+* created in /var/lib/ceph
+* One ZFS pool per OSD, like::
+
+ gpart create -s GPT ada1
+ gpart add -t freebsd-zfs -l osd1 ada1
+ zpool create -o mountpoint=/var/lib/ceph/osd/osd.1 osd
+
+* Maybe add some cache and log (ZIL)? Assuming that ada2 is an SSD::
+
+ gpart create -s GPT ada2
+ gpart add -t freebsd-zfs -l osd1-log -s 1G ada2
+ zpool add osd1 log gpt/osd1-log
+ gpart add -t freebsd-zfs -l osd1-cache -s 10G ada2
+ zpool add osd1 log gpt/osd1-cache
+
+* Note: *UFS2 does not allow large xattribs*
+
+
+Configuration
+-------------
+
+As per FreeBSD default parts of extra software go into ``/usr/local/``. Which
+means that for ``/etc/ceph.conf`` the default location is
+``/usr/local/etc/ceph/ceph.conf``. Smartest thing to do is to create a softlink
+from ``/etc/ceph`` to ``/usr/local/etc/ceph``::
+
+ ln -s /usr/local/etc/ceph /etc/ceph
+
+A sample file is provided in ``/usr/local/share/doc/ceph/sample.ceph.conf``
+
+
+MON creation
+------------
+
+Monitors are created by following the manual creation steps on::
+
+ https://docs.ceph.com/en/latest/install/manual-freebsd-deployment/
+
+
+OSD creation
+------------
+
+OSDs can be manually created only, see :ref:`freebsd_adding_osds`
diff --git a/doc/dev/generatedocs.rst b/doc/dev/generatedocs.rst
new file mode 100644
index 000000000..8632eb176
--- /dev/null
+++ b/doc/dev/generatedocs.rst
@@ -0,0 +1,83 @@
+Building Ceph Documentation
+===========================
+
+Ceph utilizes Python's Sphinx documentation tool. For details on
+the Sphinx documentation tool, refer to `The Sphinx Documentation Tool <https://www.sphinx-doc.org/en/master/>`_.
+
+To build the Ceph documentation set, you must:
+
+1. Clone the Ceph repository
+2. Install the required tools
+3. Build the documents
+4. Demo the documents (Optional)
+
+.. highlight:: bash
+
+Clone the Ceph Repository
+-------------------------
+
+To clone the Ceph repository, you must have ``git`` installed
+on your local host. To install ``git``, execute::
+
+ sudo apt-get install git
+
+To clone the Ceph repository, execute::
+
+ git clone git://github.com/ceph/ceph
+
+You should have a full copy of the Ceph repository.
+
+
+Install the Required Tools
+--------------------------
+
+To build the Ceph documentation, some dependencies are required.
+To know what packages are needed, you can launch this command::
+
+ cd ceph
+ admin/build-doc
+
+If dependencies are missing, the command above will fail
+with a message that suggests you a command to install all
+missing dependencies.
+
+
+Build the Documents
+-------------------
+
+Once you have installed all the dependencies, execute the build (the
+same command as above)::
+
+ cd ceph
+ admin/build-doc
+
+Once you build the documentation set, you may navigate to the source directory to view it::
+
+ cd build-doc/output
+
+There should be an ``html`` directory and a ``man`` directory containing documentation
+in HTML and manpage formats respectively.
+
+``admin/build-doc`` takes a long time to prepare the environment and build the document.
+But you can just rebuild the document on changes using::
+
+ admin/build-doc livehtml
+
+This feature uses ``sphinx-autobuild`` under the hood. You can also pass options to it. For
+instance, to open the browser after building the documentation::
+
+ admin/build-doc livehtml -- --open-browser
+
+Please see `sphinx-autobuild <https://pypi.org/project/sphinx-autobuild/>`_ for more details.
+
+Demo the Documents
+-------------------
+
+Once you build the documentation, as described above, you can demo the rendered documents
+by running ``serve-doc``::
+
+ cd ceph
+ admin/serve-doc
+
+This will serve the ``build-doc/output/html`` directory over port 8080 via
+Python's ``SimpleHTTPServer`` module.
diff --git a/doc/dev/iana.rst b/doc/dev/iana.rst
new file mode 100644
index 000000000..d2daf3395
--- /dev/null
+++ b/doc/dev/iana.rst
@@ -0,0 +1,16 @@
+IANA Numbers
+============
+
+Private Enterprise Number (PEN) Assignment
+------------------------------------------
+
+50495
+
+Organization ``Ceph``.
+
+Port number (monitor)
+---------------------
+
+3300
+
+That's 0xce4, or ce4h, or (sort of) "ceph."
diff --git a/doc/dev/internals.rst b/doc/dev/internals.rst
new file mode 100644
index 000000000..a894394c9
--- /dev/null
+++ b/doc/dev/internals.rst
@@ -0,0 +1,52 @@
+================
+ Ceph Internals
+================
+
+.. note:: If you're looking for how to use Ceph as a library from your
+ own software, please see :doc:`/api/index`.
+
+You can start a development mode Ceph cluster, after compiling the source, with::
+
+ cd build
+ OSD=3 MON=3 MGR=3 ../src/vstart.sh -n -x
+ # check that it's there
+ bin/ceph health
+
+.. rubric:: Mailing list
+
+The ``dev@ceph.io`` list is for discussion about the development of Ceph,
+its interoperability with other technology, and the operations of the
+project itself. Subscribe by sending a message to ``dev-request@ceph.io``
+with the line::
+
+ subscribe ceph-devel
+
+in the body of the message.
+
+The ceph-devel@vger.kernel.org list is for discussion
+and patch review for the Linux kernel Ceph client component.
+Subscribe by sending a message to ``majordomo@vger.kernel.org`` with the line::
+
+ subscribe ceph-devel
+
+in the body of the message.
+
+.. raw:: html
+
+ <!---
+
+.. rubric:: Contents
+
+.. toctree::
+ :glob:
+
+ *
+ osd_internals/index*
+ mds_internals/index*
+ radosgw/index*
+ ceph-volume/index*
+ crimson/index*
+
+.. raw:: html
+
+ --->
diff --git a/doc/dev/kubernetes.rst b/doc/dev/kubernetes.rst
new file mode 100644
index 000000000..75b100b24
--- /dev/null
+++ b/doc/dev/kubernetes.rst
@@ -0,0 +1,228 @@
+
+.. _kubernetes-dev:
+
+=======================================
+Hacking on Ceph in Kubernetes with Rook
+=======================================
+
+.. warning::
+
+ This is *not* official user documentation for setting up production
+ Ceph clusters with Kubernetes. It is aimed at developers who want
+ to hack on Ceph in Kubernetes.
+
+This guide is aimed at Ceph developers getting started with running
+in a Kubernetes environment. It assumes that you may be hacking on Rook,
+Ceph or both, so everything is built from source.
+
+TL;DR for hacking on MGR modules
+================================
+
+Make your changes to the Python code base and then from Ceph's
+``build`` directory, run::
+
+ ../src/script/kubejacker/kubejacker.sh '192.168.122.1:5000'
+
+where ``'192.168.122.1:5000'`` is a local docker registry and
+Rook's ``CephCluster`` CR uses ``image: 192.168.122.1:5000/ceph/ceph:latest``.
+
+1. Build a kubernetes cluster
+=============================
+
+Before installing Ceph/Rook, make sure you've got a working kubernetes
+cluster with some nodes added (i.e. ``kubectl get nodes`` shows you something).
+The rest of this guide assumes that your development workstation has network
+access to your kubernetes cluster, such that ``kubectl`` works from your
+workstation.
+
+`There are many ways <https://kubernetes.io/docs/setup/>`_
+to build a kubernetes cluster: here we include some tips/pointers on where
+to get started.
+
+`kubic-terraform-kvm <https://github.com/kubic-project/kubic-terraform-kvm>`_
+might also be an option.
+
+Or `Host your own <https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/>`_ with
+``kubeadm``.
+
+Some Tips
+---------
+
+Here are some tips for a smoother ride with
+
+``kubeadm``:
+
+- If you have previously added any yum/deb repos for kubernetes packages,
+ disable them before trying to use the packages.cloud.google.com repository.
+ If you don't, you'll get quite confusing conflicts.
+- Even if your distro already has docker, make sure you're installing it
+ a version from docker.com that is within the range mentioned in the
+ kubeadm install instructions. Especially, note that the docker in CentOS 7, 8
+ will *not* work.
+
+``minikube``:
+
+- Start up minikube by passing local docker registry address::
+ ``minikube start --driver=docker --insecure-registry='192.168.122.1:5000'``
+
+Hosted elsewhere
+----------------
+
+If you do not have any servers to hand, you might try a pure
+container provider such as Google Compute Engine. Your mileage may
+vary when it comes to what kinds of storage devices are visible
+to your kubernetes cluster.
+
+Make sure you check how much it's costing you before you spin up a big cluster!
+
+
+2. Run a docker registry
+========================
+
+Run this somewhere accessible from both your workstation and your
+kubernetes cluster (i.e. so that ``docker push/pull`` just works everywhere).
+This is likely to be the same host you're using as your kubernetes master.
+
+1. Install the ``docker-distribution`` package.
+2. If you want to configure the port, edit ``/etc/docker-distribution/registry/config.yml``
+3. Enable the registry service:
+
+::
+
+ systemctl enable docker-distribution
+ systemctl start docker-distribution
+
+You may need to mark the registry as **insecure**.
+
+3. Build Rook
+=============
+
+.. note::
+
+ Building Rook is **not required** to make changes to Ceph.
+
+Install Go if you don't already have it.
+
+Download the Rook source code:
+
+::
+
+ go get github.com/rook/rook
+
+ # Ignore this warning, as Rook is not a conventional go package
+ can't load package: package github.com/rook/rook: no Go files in /home/jspray/go/src/github.com/rook/rook
+
+You will now have a Rook source tree in ~/go/src/github.com/rook/rook -- you may
+be tempted to clone it elsewhere, but your life will be easier if you
+leave it in your GOPATH.
+
+Run ``make`` in the root of your Rook tree to build its binaries and containers:
+
+::
+
+ make
+ ...
+ === saving image build-9204c79b/ceph-amd64
+ === docker build build-9204c79b/ceph-toolbox-base-amd64
+ sha256:653bb4f8d26d6178570f146fe637278957e9371014ea9fce79d8935d108f1eaa
+ === docker build build-9204c79b/ceph-toolbox-amd64
+ sha256:445d97b71e6f8de68ca1c40793058db0b7dd1ebb5d05789694307fd567e13863
+ === caching image build-9204c79b/ceph-toolbox-base-amd64
+
+You can use ``docker image ls`` to see the resulting built images. The
+images you care about are the ones with tags ending "ceph-amd64" (used
+for the Rook operator and Ceph daemons) and "ceph-toolbox-amd64" (used
+for the "toolbox" container where the CLI is run).
+
+4. Build Ceph
+=============
+
+.. note::
+
+ Building Ceph is **not required** to make changes to MGR modules
+ written in Python.
+
+
+The Rook containers and the Ceph containers are independent now. Note that
+Rook's Ceph client libraries need to communicate with the Ceph cluster,
+therefore a compatible major version is required.
+
+You can run a Registry docker container with access to your Ceph source
+tree using a command like:
+
+::
+
+ docker run -i -v /my/ceph/src:/my/ceph/src -p 192.168.122.1:5000:5000 -t --name registry registry:2
+
+
+Once you have built Ceph, you can inject the resulting binaries into
+the Rook container image using the ``kubejacker.sh`` script (run from
+your build directory but from *outside* your build container).
+
+5. Run Kubejacker
+=================
+
+``kubejacker`` needs access to your docker registry. Execute the script
+to build a docker image containing your latest Ceph binaries:
+
+::
+
+ build$ ../src/script/kubejacker/kubejacker.sh "<host>:<port>"
+
+
+Now you've got your freshly built Rook and freshly built Ceph into
+a single container image, ready to run. Next time you change something
+in Ceph, you can re-run this to update your image and restart your
+kubernetes containers. If you change something in Rook, then re-run the Rook
+build, and the Ceph build too.
+
+5. Run a Rook cluster
+=====================
+
+Please refer to `Rook's documentation <https://rook.io/docs/rook/master/ceph-quickstart.html>`_
+for setting up a Rook operator, a Ceph cluster and the toolbox.
+
+The Rook source tree includes example .yaml files in
+``cluster/examples/kubernetes/ceph/``. Copy these into
+a working directory, and edit as necessary to configure
+the setup you want:
+
+- Ensure that ``spec.cephVersion.image`` points to your docker registry::
+
+ spec:
+ cephVersion:
+ allowUnsupported: true
+ image: 192.168.122.1:5000/ceph/ceph:latest
+
+Then, load the configuration into the kubernetes API using ``kubectl``:
+
+::
+
+ kubectl apply -f ./cluster-test.yaml
+
+Use ``kubectl -n rook-ceph get pods`` to check the operator
+pod the Ceph daemons and toolbox are is coming up.
+
+Once everything is up and running,
+you should be able to open a shell in the toolbox container and
+run ``ceph status``.
+
+If your mon services start but the rest don't, it could be that they're
+unable to form a quorum due to a Kubernetes networking issue: check that
+containers in your Kubernetes cluster can ping containers on other nodes.
+
+Cheat sheet
+===========
+
+Open a shell in your toolbox container::
+
+ kubectl -n rook-ceph exec -it $(kubectl -n rook-ceph get pod -l "app=rook-ceph-tools" -o jsonpath="{.items[0].metadata.name}") -- bash
+
+Inspect the Rook operator container's logs::
+
+ kubectl -n rook-ceph logs -l app=rook-ceph-operator
+
+Inspect the ceph-mgr container's logs::
+
+ kubectl -n rook-ceph logs -l app=rook-ceph-mgr
+
diff --git a/doc/dev/libs.rst b/doc/dev/libs.rst
new file mode 100644
index 000000000..203dd38b0
--- /dev/null
+++ b/doc/dev/libs.rst
@@ -0,0 +1,18 @@
+======================
+ Library architecture
+======================
+
+Ceph is structured into libraries which are built and then combined together to
+make executables and other libraries.
+
+- libcommon: a collection of utilities which are available to nearly every ceph
+ library and executable. In general, libcommon should not contain global
+ variables, because it is intended to be linked into libraries such as
+ libcephfs.so.
+
+- libglobal: a collection of utilities focused on the needs of Ceph daemon
+ programs. In here you will find pidfile management functions, signal
+ handlers, and so forth.
+
+.. todo:: document other libraries
+
diff --git a/doc/dev/logging.rst b/doc/dev/logging.rst
new file mode 100644
index 000000000..67d3de141
--- /dev/null
+++ b/doc/dev/logging.rst
@@ -0,0 +1,106 @@
+
+Use of the cluster log
+======================
+
+(Note: none of this applies to the local "dout" logging. This is about
+the cluster log that we send through the mon daemons)
+
+Severity
+--------
+
+Use ERR for situations where the cluster cannot do its job for some reason.
+For example: we tried to do a write, but it returned an error, or we tried
+to read something, but it's corrupt so we can't, or we scrubbed a PG but
+the data was inconsistent so we can't recover.
+
+Use WRN for incidents that the cluster can handle, but have some abnormal/negative
+aspect, such as a temporary degradation of service, or an unexpected internal
+value. For example, a metadata error that can be auto-fixed, or a slow operation.
+
+Use INFO for ordinary cluster operations that do not indicate a fault in
+Ceph. It is especially important that INFO level messages are clearly
+worded and do not cause confusion or alarm.
+
+Frequency
+---------
+
+It is important that messages of all severities are not excessively
+frequent. Consumers may be using a rotating log buffer that contains
+messages of all severities, so even DEBUG messages could interfere
+with proper display of the latest INFO messages if the DEBUG messages
+are too frequent.
+
+Remember that if you have a bad state (as opposed to event), that is
+what health checks are for -- do not spam the cluster log to indicate
+a continuing unhealthy state.
+
+Do not emit cluster log messages for events that scale with
+the number of clients or level of activity on the system, or for
+events that occur regularly in normal operation. For example, it
+would be inappropriate to emit a INFO message about every
+new client that connects (scales with #clients), or to emit and INFO
+message about every CephFS subtree migration (occurs regularly).
+
+Language and formatting
+-----------------------
+
+(Note: these guidelines matter much less for DEBUG-level messages than
+ for INFO and above. Concentrate your efforts on making INFO/WRN/ERR
+ messages as readable as possible.)
+
+Use the passive voice. For example, use "Object xyz could not be read", rather
+than "I could not read the object xyz".
+
+Print long/big identifiers, such as inode numbers, as hex, prefixed
+with an 0x so that the user can tell it is hex. We do this because
+the 0x makes it unambiguous (no equivalent for decimal), and because
+the hex form is more likely to fit on the screen.
+
+Print size quantities as a human readable MB/GB/etc, including the unit
+at the end of the number. Exception: if you are specifying an offset,
+where precision is essential to the meaning, then you can specify
+the value in bytes (but print it as hex).
+
+Make a good faith effort to fit your message on a single line. It does
+not have to be guaranteed, but it should at least usually be
+the case. That means, generally, no printing of lists unless there
+are only a few items in the list.
+
+Use nouns that are meaningful to the user, and defined in the
+documentation. Common acronyms are OK -- don't waste screen space
+typing "Rados Object Gateway" instead of RGW. Do not use internal
+class names like "MDCache" or "Objecter". It is okay to mention
+internal structures if they are the direct subject of the message,
+for example in a corruption, but use plain English.
+Example: instead of "Objecter requests" say "OSD client requests"
+Example: it is okay to mention internal structure in the context
+of "Corrupt session table" (but don't say "Corrupt SessionTable")
+
+Where possible, describe the consequence for system availability, rather
+than only describing the underlying state. For example, rather than
+saying "MDS myfs.0 is replaying", say that "myfs is degraded, waiting
+for myfs.0 to finish starting".
+
+While common acronyms are fine, don't randomly truncate words. It's not
+"dir ino", it's "directory inode".
+
+If you're logging something that "should never happen", i.e. a situation
+where it would be an assertion, but we're helpfully not crashing, then
+make that clear in the language -- this is probably not a situation
+that the user can remediate themselves.
+
+Avoid UNIX/programmer jargon. Instead of "errno", just say "error" (or
+preferably give something more descriptive than the number!)
+
+Do not mention cluster map epochs unless they are essential to
+the meaning of the message. For example, "OSDMap epoch 123 is corrupt"
+would be okay (the epoch is the point of the message), but saying "OSD
+123 is down in OSDMap epoch 456" would not be (the osdmap and epoch
+concepts are an implementation detail, the down-ness of the OSD
+is the real message). Feel free to send additional detail to
+the daemon's local log (via `dout`/`derr`).
+
+If you log a problem that may go away in the future, make sure you
+also log when it goes away. Whatever priority you logged the original
+message at, log the "going away" message at INFO.
+
diff --git a/doc/dev/logs.rst b/doc/dev/logs.rst
new file mode 100644
index 000000000..7e703e541
--- /dev/null
+++ b/doc/dev/logs.rst
@@ -0,0 +1,55 @@
+============
+ Debug logs
+============
+
+The main debugging tool for Ceph is the dout and derr logging functions.
+Collectively, these are referred to as "dout logging."
+
+Dout has several log faculties, which can be set at various log
+levels using the configuration management system. So it is possible to enable
+debugging just for the messenger, by setting debug_ms to 10, for example.
+
+The dout macro avoids even generating log messages which are not going to be
+used, by enclosing them in an "if" statement. What this means is that if you
+have the debug level set at 0, and you run this code::
+
+ dout(20) << "myfoo() = " << myfoo() << dendl;
+
+
+myfoo() will not be called here.
+
+Unfortunately, the performance of debug logging is relatively low. This is
+because there is a single, process-wide mutex which every debug output
+statement takes, and every debug output statement leads to a write() system
+call or a call to syslog(). There is also a computational overhead to using C++
+streams to consider. So you will need to be parsimonious in your logging to get
+the best performance.
+
+Sometimes, enabling logging can hide race conditions and other bugs by changing
+the timing of events. Keep this in mind when debugging.
+
+Performance counters
+====================
+
+Ceph daemons use performance counters to track key statistics like number of
+inodes pinned. Performance counters are essentially sets of integers and floats
+which can be set, incremented, and read using the PerfCounters API.
+
+A PerfCounters object is usually associated with a single subsystem. It
+contains multiple counters. This object is thread-safe because it is protected
+by an internal mutex. You can create multiple PerfCounters objects.
+
+Currently, three types of performance counters are supported: u64 counters,
+float counters, and long-run floating-point average counters. These are created
+by PerfCountersBuilder::add_u64, PerfCountersBuilder::add_fl, and
+PerfCountersBuilder::add_fl_avg, respectively. u64 and float counters simply
+provide a single value which can be updated, incremented, and read atomically.
+floating-pointer average counters provide two values: the current total, and
+the number of times the total has been changed. This is intended to provide a
+long-run average value.
+
+Performance counter information can be read in JSON format from the
+administrative socket (admin_sock). This is implemented as a UNIX domain
+socket. The Ceph performance counter plugin for collectd shows an example of how
+to access this information. Another example can be found in the unit tests for
+the administrative sockets.
diff --git a/doc/dev/macos.rst b/doc/dev/macos.rst
new file mode 100644
index 000000000..4f966be3d
--- /dev/null
+++ b/doc/dev/macos.rst
@@ -0,0 +1,50 @@
+build on MacOS
+==============
+
+Since we've switched to C++ 17, and the default clang shipped with Xcode 9 does not support all the C++ 17 language features, it's suggested to install clang using brew::
+
+ brew install llvm
+
+and install all the necessary bits::
+
+ brew install snappy ccache cmake pkg-config
+ pip install cython
+
+install FUSE if you want to build the FUSE support::
+
+ brew cask install osxfuse
+
+then, under the source directory of Ceph::
+
+ mkdir build
+ cd build
+ export PKG_CONFIG_PATH=/usr/local/Cellar/nss/3.48/lib/pkgconfig:/usr/local/Cellar/openssl/1.0.2t/lib/pkgconfig
+ cmake .. -DBOOST_J=4 \
+ -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang \
+ -DCMAKE_CXX_COMPILER=/usr/local/opt/llvm/bin/clang++ \
+ -DCMAKE_EXE_LINKER_FLAGS="-L/usr/local/opt/llvm/lib" \
+ -DENABLE_GIT_VERSION=OFF \
+ -DSNAPPY_ROOT_DIR=/usr/local/Cellar/snappy/1.1.7_1 \
+ -DWITH_BABELTRACE=OFF \
+ -DWITH_BLUESTORE=OFF \
+ -DWITH_CCACHE=OFF \
+ -DWITH_CEPHFS=OFF \
+ -DWITH_KRBD=OFF \
+ -DWITH_LIBCEPHFS=OFF \
+ -DWITH_LTTNG=OFF \
+ -DWITH_LZ4=OFF \
+ -DWITH_MANPAGE=ON \
+ -DWITH_MGR=OFF \
+ -DWITH_MGR_DASHBOARD_FRONTEND=OFF \
+ -DWITH_RADOSGW=OFF \
+ -DWITH_RDMA=OFF \
+ -DWITH_SPDK=OFF \
+ -DWITH_SYSTEMD=OFF \
+ -DWITH_TESTS=OFF \
+ -DWITH_XFS=OFF
+
+The paths to ``nss`` and ``snappy`` might vary if newer versions of the packages are installed.
+
+Also, please consider using boost v1.69 to address the bug of https://github.com/boostorg/atomic/issues/15.
+
+Currently, the most practical uses for Ceph on MacOS might be FUSE and some other librados based applications.
diff --git a/doc/dev/mds_internals/data-structures.rst b/doc/dev/mds_internals/data-structures.rst
new file mode 100644
index 000000000..c77175a16
--- /dev/null
+++ b/doc/dev/mds_internals/data-structures.rst
@@ -0,0 +1,44 @@
+MDS internal data structures
+==============================
+
+*CInode*
+ CInode contains the metadata of a file, there is one CInode for each file.
+ The CInode stores information like who owns the file, how big the file is.
+
+*CDentry*
+ CDentry is the glue that holds inodes and files together by relating inode to
+ file/directory names. A CDentry links to at most one CInode (it may not link
+ to any CInode). A CInode may be linked by multiple CDentries.
+
+*CDir*
+ CDir only exists for directory inode, it's used to link CDentries under the
+ directory. A CInode can have multiple CDir when the directory is fragmented.
+
+These data structures are linked together as::
+
+ CInode
+ CDir
+ | \
+ | \
+ | \
+ CDentry CDentry
+ CInode CInode
+ CDir CDir
+ | | \
+ | | \
+ | | \
+ CDentry CDentry CDentry
+ CInode CInode CInode
+
+As this doc is being written, size of CInode is about 1400 bytes, size of CDentry
+is about 400 bytes, size of CDir is about 700 bytes. These data structures are
+quite large. Please be careful if you want to add new fields to them.
+
+*OpenFileTable*
+ Open file table tracks open files and their ancestor directories. Recovering
+ MDS can easily get open files' paths, significantly reducing the time of
+ loading inodes for open files. Each entry in the table corresponds to an inode,
+ it records linkage information (parent inode and dentry name) of the inode. MDS
+ can constructs the inode's path by recursively lookup parent inode's linkage.
+ Open file table is stored in omap of RADOS objects, table entries correspond to
+ KV pairs in omap.
diff --git a/doc/dev/mds_internals/exports.rst b/doc/dev/mds_internals/exports.rst
new file mode 100644
index 000000000..c5b0e3915
--- /dev/null
+++ b/doc/dev/mds_internals/exports.rst
@@ -0,0 +1,76 @@
+
+===============
+Subtree exports
+===============
+
+Normal Migration
+----------------
+
+The exporter begins by doing some checks in export_dir() to verify
+that it is permissible to export the subtree at this time. In
+particular, the cluster must not be degraded, the subtree root may not
+be freezing or frozen (\ie already exporting, or nested beneath
+something that is exporting), and the path must be pinned (\ie not
+conflicted with a rename). If these conditions are met, the subtree
+freeze is initiated, and the exporter is committed to the subtree
+migration, barring an intervening failure of the importer or itself.
+
+The MExportDirDiscover serves simply to ensure that the base directory
+being exported is open on the destination node. It is pinned by the
+importer to prevent it from being trimmed. This occurs before the
+exporter completes the freeze of the subtree to ensure that the
+importer is able to replicate the necessary metadata. When the
+exporter receives the MExportDirDiscoverAck, it allows the freeze to proceed.
+
+The MExportDirPrep message then follows to populate a spanning tree that
+includes all dirs, inodes, and dentries necessary to reach any nested
+exports within the exported region. This replicates metadata as well,
+but it is pushed out by the exporter, avoiding deadlock with the
+regular discover and replication process. The importer is responsible
+for opening the bounding directories from any third parties before
+acknowledging. This ensures that the importer has correct dir_auth
+information about where authority is delegated for all points nested
+within the subtree being migrated. While processing the MExportDirPrep,
+the importer freezes the entire subtree region to prevent any new
+replication or cache expiration.
+
+The warning stage occurs only if the base subtree directory is open by
+nodes other than the importer and exporter. If so, then a
+MExportDirNotify message informs any bystanders that the authority for
+the region is temporarily ambiguous. In particular, bystanders who
+are trimming items from their cache must send MCacheExpire messages to
+both the old and new authorities. This is necessary to ensure that
+the surviving authority reliably receives all expirations even if the
+importer or exporter fails. While the subtree is frozen (on both the
+importer and exporter), expirations will not be immediately processed;
+instead, they will be queued until the region is unfrozen and it can
+be determined that the node is or is not authoritative for the region.
+
+The MExportDir message sends the actual subtree metadata to the importer.
+Upon receipt, the importer inserts the data into its cache, logs a
+copy in the EImportStart, and replies with an MExportDirAck. The exporter
+can now log an EExport, which ultimately specifies that
+the export was a success. In the presence of failures, it is the
+existence of the EExport that disambiguates authority during recovery.
+
+Once logged, the exporter will send an MExportDirNotify to any
+bystanders, informing them that the authority is no longer ambiguous
+and cache expirations should be sent only to the new authority (the
+importer). Once these are acknowledged, implicitly flushing the
+bystander to exporter message streams of any stray expiration notices,
+the exporter unfreezes the subtree, cleans up its state, and sends a
+final MExportDirFinish to the importer. Upon receipt, the importer logs
+an EImportFinish(true), unfreezes its subtree, and cleans up its
+state.
+
+
+PARTIAL FAILURE RECOVERY
+
+
+
+RECOVERY FROM JOURNAL
+
+
+
+
+
diff --git a/doc/dev/mds_internals/index.rst b/doc/dev/mds_internals/index.rst
new file mode 100644
index 000000000..c8c82ad10
--- /dev/null
+++ b/doc/dev/mds_internals/index.rst
@@ -0,0 +1,10 @@
+==============================
+MDS developer documentation
+==============================
+
+.. rubric:: Contents
+
+.. toctree::
+ :glob:
+
+ *
diff --git a/doc/dev/messenger.rst b/doc/dev/messenger.rst
new file mode 100644
index 000000000..729538913
--- /dev/null
+++ b/doc/dev/messenger.rst
@@ -0,0 +1,33 @@
+============================
+ Messenger notes
+============================
+
+Messenger is the Ceph network layer implementation. Currently Ceph supports
+one messenger type: "async".
+
+ceph_perf_msgr
+==============
+
+ceph_perf_msgr is used to do benchmark for messenger module only and can help
+to find the bottleneck or time consuming within messenger moduleIt just like
+"iperf", we need to start server-side program firstly:
+
+# ./ceph_perf_msgr_server 172.16.30.181:10001 1 0
+
+The first argument is ip:port pair which is telling the destination address the
+client need to specified. The second argument configures the server threads. The
+third argument tells the "think time"(us) when dispatching messages. After Giant,
+CEPH_OSD_OP message which is the actual client read/write io request is fast
+dispatched without queueing to Dispatcher, in order to achieve better performance.
+So CEPH_OSD_OP message will be processed inline, "think time" is used by mock
+this "inline process" process.
+
+# ./ceph_perf_msgr_client 172.16.30.181:10001 1 32 10000 10 4096
+
+The first argument is specified the server ip:port, and the second argument is
+used to specify client threads. The third argument specify the concurrency(the
+max inflight messages for each client thread), the fourth argument specify the
+io numbers will be issued to server per client thread. The fifth argument is
+used to indicate the "think time" for client thread when receiving messages,
+this is also used to mock the client fast dispatch process. The last argument
+specify the message data length to issue.
diff --git a/doc/dev/mon-bootstrap.rst b/doc/dev/mon-bootstrap.rst
new file mode 100644
index 000000000..2b67b4707
--- /dev/null
+++ b/doc/dev/mon-bootstrap.rst
@@ -0,0 +1,212 @@
+===================
+ Monitor bootstrap
+===================
+
+Terminology:
+
+* ``cluster``: a set of monitors
+* ``quorum``: an active set of monitors consisting of a majority of the cluster
+
+In order to initialize a new monitor, it must always be fed:
+
+#. a logical name
+#. secret keys
+#. a cluster fsid (uuid)
+
+In addition, a monitor needs to know two things:
+
+#. what address to bind to
+#. who its peers are (if any)
+
+There are a range of ways to do both.
+
+Logical id
+==========
+
+The logical id should be unique across the cluster. It will be
+appended to ``mon.`` to logically describe the monitor in the Ceph
+cluster. For example, if the logical id is ``foo``, the monitor's
+name will be ``mon.foo``.
+
+For most users, there is no more than one monitor per host, which
+makes the short hostname logical choice.
+
+Secret keys
+===========
+
+The ``mon.`` secret key is stored a ``keyring`` file in the ``mon data`` directory. It can be generated
+with a command like::
+
+ ceph-authtool --create-keyring /path/to/keyring --gen-key -n mon.
+
+When creating a new monitor cluster, the keyring should also contain a ``client.admin`` key that can be used
+to administer the system::
+
+ ceph-authtool /path/to/keyring --gen-key -n client.admin --cap mon 'allow *' --cap osd 'allow *' --cap mds 'allow'
+
+The resulting keyring is fed to ``ceph-mon --mkfs`` with the ``--keyring <keyring>`` command-line argument.
+
+Cluster fsid
+============
+
+The cluster fsid is a normal uuid, like that generated by the ``uuidgen`` command. It
+can be provided to the monitor in two ways:
+
+#. via the ``--fsid <uuid>`` command-line argument (or config file option)
+#. via a monmap provided to the new monitor via the ``--monmap <path>`` command-line argument.
+
+Monitor address
+===============
+
+The monitor address can be provided in several ways.
+
+#. via the ``--public-addr <ip[:port]>`` command-line option (or config file option)
+#. via the ``--public-network <cidr>`` command-line option (or config file option)
+#. via the monmap provided via ``--monmap <path>``, if it includes a monitor with our name
+#. via the bootstrap monmap (provided via ``--inject-monmap <path>`` or generated from ``--mon-host <list>``) if it includes a monitor with no name (``noname-<something>``) and an address configured on the local host.
+
+Peers
+=====
+
+The monitor peers are provided in several ways:
+
+#. via the initial monmap, provided via ``--monmap <filename>``
+#. via the bootstrap monmap generated from ``--mon-host <list>``
+#. via the bootstrap monmap generated from ``[mon.*]`` sections with the deprecated ``mon addr`` options in the config file (note that this method is *not* recommended and does not support binding to both v1 and v2 protocol addresses)
+#. dynamically via the admin socket
+
+However, these methods are not completely interchangeable because of
+the complexity of creating a new monitor cluster without danger of
+races.
+
+Cluster creation
+================
+
+There are three basic approaches to creating a cluster:
+
+#. Create a new cluster by specifying the monitor names and addresses ahead of time.
+#. Create a new cluster by specifying the monitor names ahead of time, and dynamically setting the addresses as ``ceph-mon`` daemons configure themselves.
+#. Create a new cluster by specifying the monitor addresses ahead of time.
+
+
+Names and addresses
+-------------------
+
+Generate a monmap using ``monmaptool`` with the names and addresses of the initial
+monitors. The generated monmap will also include a cluster fsid. Feed that monmap
+to each monitor daemon::
+
+ ceph-mon --mkfs -i <name> --monmap <initial_monmap> --keyring <initial_keyring>
+
+When the daemons start, they will know exactly who they and their peers are.
+
+
+Addresses only
+--------------
+
+The initial monitor addresses can be specified with the ``mon host`` configuration value,
+either via a config file or the command-line argument. This method has the advantage that
+a single global config file for the cluster can have a line like::
+
+ mon host = a.foo.com, b.foo.com, c.foo.com
+
+and will also serve to inform any ceph clients or daemons who the monitors are.
+
+The ``ceph-mon`` daemons will need to be fed the initial keyring and cluster fsid to
+initialize themselves:
+
+ ceph-mon --mkfs -i <name> --fsid <uuid> --keyring <initial_keyring>
+
+When the daemons first start up, they will share their names with each other and form a
+new cluster.
+
+Names only
+----------
+
+In dynamic "cloud" environments, the cluster creator may not (yet)
+know what the addresses of the monitors are going to be. Instead,
+they may want machines to configure and start themselves in parallel
+and, as they come up, form a new cluster on their own. The problem is
+that the monitor cluster relies on strict majorities to keep itself
+consistent, and in order to "create" a new cluster, it needs to know
+what the *initial* set of monitors will be.
+
+This can be done with the ``mon initial members`` config option, which
+should list the ids of the initial monitors that are allowed to create
+the cluster::
+
+ mon initial members = foo, bar, baz
+
+The monitors can then be initialized by providing the other pieces of
+information (they keyring, cluster fsid, and a way of determining
+their own address). For example::
+
+ ceph-mon --mkfs -i <name> --mon-initial-hosts 'foo,bar,baz' --keyring <initial_keyring> --public-addr <ip>
+
+When these daemons are started, they will know their own address, but
+not their peers. They can learn those addresses via the admin socket::
+
+ ceph daemon mon.<id> add_bootstrap_peer_hint <peer ip>
+
+Once they learn enough of their peers from the initial member set,
+they will be able to create the cluster.
+
+
+Cluster expansion
+=================
+
+Cluster expansion is slightly less demanding than creation, because
+the creation of the initial quorum is not an issue and there is no
+worry about creating separately independent clusters.
+
+New nodes can be forced to join an existing cluster in two ways:
+
+#. by providing no initial monitor peers addresses, and feeding them dynamically.
+#. by specifying the ``mon initial members`` config option to prevent the new nodes from forming a new, independent cluster, and feeding some existing monitors via any available method.
+
+Initially peerless expansion
+----------------------------
+
+Create a new monitor and give it no peer addresses other than its own. For
+example::
+
+ ceph-mon --mkfs -i <myid> --fsid <fsid> --keyring <mon secret key> --public-addr <ip>
+
+Once the daemon starts, you can give it one or more peer addresses (preferably a bare IP address with no port; the mon will set the addr types and ports for you) to join with::
+
+ ceph daemon mon.<id> add_bootstrap_peer_hint <peer ip>
+
+Alternatively, you can explicitly specify the addrvec_t with::
+
+ ceph daemon mon.<id> add_bootstrap_peer_hintv <peer addrvec>
+
+For example,::
+
+ ceph daemon mon.new add_bootstrap_peer_hintv v2:1.2.3.4:3300,v1:1.2.3.4:6789
+
+This monitor will never participate in cluster creation; it can only
+join an existing cluster.
+
+Note that the address(es) specified should match exactly the addresses
+the new monitor is binding too. If, for example, the new mon binds to
+only a v2 address but a v2 and v1 address are provided, there is some
+possibility of confusion in the mons.
+
+Expanding with initial members
+------------------------------
+
+You can feed the new monitor some peer addresses initially and avoid badness by also
+setting ``mon initial members``. For example::
+
+ ceph-mon --mkfs -i <myid> --fsid <fsid> --keyring <mon secret key> --public-addr <ip> --mon-host foo,bar,baz
+
+When the daemon is started, ``mon initial members`` must be set via the command line or config file::
+
+ ceph-mon -i <myid> --mon-initial-members foo,bar,baz
+
+to prevent any risk of split-brain.
+
+
+
+
+
diff --git a/doc/dev/mon-elections.rst b/doc/dev/mon-elections.rst
new file mode 100644
index 000000000..86cfc3803
--- /dev/null
+++ b/doc/dev/mon-elections.rst
@@ -0,0 +1,130 @@
+=================
+Monitor Elections
+=================
+
+The Original Algorithm
+======================
+Historically, monitor leader elections have been very simple: the lowest-ranked
+monitor wins!
+
+This is accomplished using a low-state "Elector" module (though it has now
+been split into an Elector that handles message-passing, and an ElectionLogic
+that makes the voting choices). It tracks the election epoch and not much
+else. Odd epochs are elections; even epochs have a leader and let the monitor
+do its ongoing work. When a timeout occurs or the monitor asks for a
+new election, we bump the epoch and send out Propose messages to all known
+monitors.
+In general, if we receive an old message we either drop it or trigger a new
+election (if we think the sender is newly-booted and needs to join quorum). If
+we receive a message from a newer epoch, we bump up our epoch to match and
+either Defer to the Proposer or else bump the epoch again and Propose
+ourselves if we expect to win over them. When we receive a Propose within
+our current epoch, we either Defer to the sender or ignore them (we ignore them
+if they are of a higher rank than us, or higher than the rank we have already
+deferred to).
+(Note that if we have the highest rank it is possible for us to defer to every
+other monitor in sequence within the same election epoch!)
+
+This resolves under normal circumstances because all monitors agree on the
+priority voting order, and epochs are only bumped when a monitor isn't
+participating or sees a possible conflict with the known proposers.
+
+The Problems
+==============
+The original algorithm didn't work at all under a variety of netsplit
+conditions. This didn't manifest often in practice but has become
+important as the community and commercial vendors move Ceph into
+spaces requiring the use of "stretch clusters".
+
+The New Algorithms
+==================
+We still default to the original ("classic") election algorithm, but
+support letting users change to new ones via the CLI. These
+algorithms are implemented as different functions and switch statements
+within the ElectionLogic class.
+
+The first algorithm is very simple: "disallow" lets you add monitors
+to a list of disallowed leaders.
+The second, "connectivity", incorporates connection score ratings
+and elects the monitor with the best score.
+
+Algorithm: disallow
+===================
+If a monitor is in the disallowed list, it always defers to another
+monitor, no matter the rank. Otherwise, it is the same as the classic
+algorithm is.
+Since changing the disallowed list requires a paxos update, monitors
+in an election together should always have the same set. This means
+the election order is constant and static across the full monitor set
+and elections resolve trivially (assuming a connected network).
+
+This algorithm really just exists as a demo and stepping-stone to
+the more advanced connectivity mode, but it may have utility in asymmetric
+networks and clusters.
+
+Algorithm: connectivity
+=======================
+This algorithm takes as input scores for each connection
+(both ways, discussed in the next section) and attempts to elect the monitor
+with the highest total score. We keep the same basic message-passing flow as the
+classic algorithm, in which elections are driven by reacting to Propose messages.
+But this has several challenges since unlike ranks, scores are not static (and
+might change during an election!). To guarantee an election epoch does not
+produce multiple leaders, we must maintain two key invariants:
+* Monitors must maintain static scores during an election epoch
+* Any deferral must be transitive -- if A defers to B and then to C,
+B had better defer to C as well!
+
+We handle these very explicitly: by branching a copy stable_peer_tracker
+of our peer_tracker scoring object whenever starting an election (or
+bumping the epoch), and by refusing to defer to a monitor if it won't
+be deferred to by our current leader choice. (All Propose messages include
+a copy of the scores the leader is working from, so peers can evaluate them.)
+
+Of course, those modifications can easily block. To guarantee forward progress,
+we make several further adjustments:
+* If we want to defer to a new peer, but have already deferred to a peer
+whose scores don't allow that, we bump the election epoch and start()
+the election over again.
+* All election messages include the scores the sender is aware of.
+
+This guarantees we will resolve the election as long as the network is
+reasonably stable (even if disconnected): As long as all score "views"
+result in the same deferral order, an election will complete normally. And by
+broadly sharing scores across the full set of monitors, monitors rapidly
+converge on the global newest state.
+
+This algorithm has one further important feature compared to the classic and
+disallowed handlers: it can ignore out-of-quorum peers. Normally, whenever
+a monitor B receives a Propose from an out-of-quorum peer C, B will itself trigger
+a new election to give C an opportunity to join. But because the
+highest-scoring monitor A may be netsplit from C, this is not desirable. So in
+the connectivity election algorithm, B only "forwards" Propose messages when B's
+scores indicate the cluster would choose a leader other than A.
+
+Connection Scoring
+==================
+We implement scoring within the ConnectionTracker class, which is
+driven by the Elector and provided to ElectionLogic as a resource. Elector
+is responsible for sending out MMonPing messages, and for reporting the
+results in to the ConnectionTracker as calls to report_[live|dead]_connection
+with the relevant peer and the time units the call counts for. (These time units
+are seconds in the monitor, but the ConnectionTracker is agnostic and our unit
+tests count simple time steps.)
+
+We configure a "half life" and each report updates the peer's current status
+(alive or dead) and its total score. The new score is current_score * (1 - units_alive / (2 * half_life)) + (units_alive / (2 * half_life)). (For a dead report, we of course
+subtract the new delta, rather than adding it).
+
+We can further encode and decode the ConnectionTracker for wire transmission,
+and receive_peer_report()s of a full ConnectionTracker (containing all
+known scores) or a ConnectionReport (representing a single peer's scores)
+to slurp up the scores from peers. These scores are of course all versioned so
+we are in no danger of accidentally going backwards in time.
+We can query an individual connection score (if the connection is down, it's 0)
+or the total score of a specific monitor, which is the connection score from all
+other monitors going in to that one.
+
+By default, we consider pings failed after 2 seconds (mon_elector_ping_timeout)
+and ping live connections every second (mon_elector_ping_divisor). The halflife
+is 12 hours (mon_con_tracker_score_halflife).
diff --git a/doc/dev/mon-on-disk-formats.rst b/doc/dev/mon-on-disk-formats.rst
new file mode 100644
index 000000000..e48d39a41
--- /dev/null
+++ b/doc/dev/mon-on-disk-formats.rst
@@ -0,0 +1,91 @@
+##############
+ON-DISK FORMAT
+##############
+
+
+************
+UPGRADE PATH
+************
+
+On-disk formats, or even data structure formats, may be changed during an
+upgrade. Services wishing to do so, may so do it via the
+`PaxosService::upgrade_format()` call path. There is no formalized, unified
+format versioning; the `PaxosService` class keeps track of its
+`format_version` through a key in the store, assumed an `unsigned int`, but
+it will be the service's responsibility to give meaning to those versions.
+
+AUTH MONITOR
+============
+
+versions
+--------
+
+versions are represented with a single `unsigned int`. By default, the value
+zero represents the absence of a formal upgraded format. The first format
+version was introduced in Dumpling; clusters upgrading to Dumpling saw their
+format version being increased from zero to one.::
+
+ 0 to 1 - introduced in v0.65, dev release for v0.67 dumpling
+ 1 to 2 - introduced in v12.0.2, dev release for luminous
+ 2 to 3 - introduced in mimic
+
+ 0 - all clusters pre-dumpling
+ 1 - all clusters dumpling+ and pre-luminous
+ 2 - all clusters luminous+ and pre-mimic
+ 3 - all clusters mimic+
+
+ version 1: introduces new-style monitor caps (i.e., profiles)
+ version 2: introduces mgr caps and bootstrap-mgr key
+ version 3: creates all bootstrap and admin keys if they don't yet exist
+
+callstack
+---------
+
+format_version set on `PaxosService::refresh()`:::
+
+ - initially called from Monitor::refresh_from_paxos
+ - initially called from Monitor::init_paxos()
+ - initially called from Monitor::preinit()
+
+AuthMonitor::upgrade_format() called by `PaxosService::_active()`:::
+
+ - called from C_Committed callback, from PaxosService::propose_pending()
+ - called from C_Active callback, from PaxosService::_active()
+ - called from PaxosService::election_finished()
+
+ - on a freshly deployed cluster, upgrade_format() will be first called
+ *after* create_initial().
+ - on an existing cluster, upgrade_format() will be called after the first
+ election.
+
+ - upgrade_format() is irrelevant on a freshly deployed cluster, as there is
+ no format to upgrade at this point.
+
+boil down
+---------
+
+* if `format_version >= current_version` then format is uptodate, return.
+* if `features doesn't contain LUMINOUS` then `current_version = 1`
+* else if `features doesn't contain MIMIC` then `current_version = 2`
+* else `current_version = 3`
+
+if `format_version == 0`:::
+
+ - upgrade to format version 1
+ - move to new-style monitor caps (i.e., profiles):
+ - set daemon profiles for existing entities
+ - set profile for existing bootstrap keys
+
+if `format_version == 1`:::
+
+ - upgrade to format version 2
+ - for existing entities:
+ - add new cap for mgr
+ - for existing 'mgr' entities, fix 'mon' caps due to bug from kraken
+ setting 'allow \*', and set 'allow profile mgr' instead.
+ - add bootstrap-mgr key.
+
+if `format_version == 2`:::
+
+ - upgrade to format version 3
+ - create all bootstrap keys if they don't currently exist
diff --git a/doc/dev/mon-osdmap-prune.rst b/doc/dev/mon-osdmap-prune.rst
new file mode 100644
index 000000000..6ff059b84
--- /dev/null
+++ b/doc/dev/mon-osdmap-prune.rst
@@ -0,0 +1,415 @@
+===========================
+FULL OSDMAP VERSION PRUNING
+===========================
+
+For each incremental osdmap epoch, the monitor will keep a full osdmap
+epoch in the store.
+
+While this is great when serving osdmap requests from clients, allowing
+us to fulfill their request without having to recompute the full osdmap
+from a myriad of incrementals, it can also become a burden once we start
+keeping an unbounded number of osdmaps.
+
+The monitors will attempt to keep a bounded number of osdmaps in the store.
+This number is defined (and configurable) via ``mon_min_osdmap_epochs``, and
+defaults to 500 epochs. Generally speaking, we will remove older osdmap
+epochs once we go over this limit.
+
+However, there are a few constraints to removing osdmaps. These are all
+defined in ``OSDMonitor::get_trim_to()``.
+
+In the event one of these conditions is not met, we may go over the bounds
+defined by ``mon_min_osdmap_epochs``. And if the cluster does not meet the
+trim criteria for some time (e.g., unclean pgs), the monitor may start
+keeping a lot of osdmaps. This can start putting pressure on the underlying
+key/value store, as well as on the available disk space.
+
+One way to mitigate this problem would be to stop keeping full osdmap
+epochs on disk. We would have to rebuild osdmaps on-demand, or grab them
+from cache if they had been recently served. We would still have to keep
+at least one osdmap, and apply all incrementals on top of either this
+oldest map epoch kept in the store or a more recent map grabbed from cache.
+While this would be feasible, it seems like a lot of cpu (and potentially
+IO) would be going into rebuilding osdmaps.
+
+Additionally, this would prevent the aforementioned problem going forward,
+but would do nothing for stores currently in a state that would truly
+benefit from not keeping osdmaps.
+
+This brings us to full osdmap pruning.
+
+Instead of not keeping full osdmap epochs, we are going to prune some of
+them when we have too many.
+
+Deciding whether we have too many will be dictated by a configurable option
+``mon_osdmap_full_prune_min`` (default: 10000). The pruning algorithm will be
+engaged once we go over this threshold.
+
+We will not remove all ``mon_osdmap_full_prune_min`` full osdmap epochs
+though. Instead, we are going to poke some holes in the sequence of full
+maps. By default, we will keep one full osdmap per 10 maps since the last
+map kept; i.e., if we keep epoch 1, we will also keep epoch 10 and remove
+full map epochs 2 to 9. The size of this interval is configurable with
+``mon_osdmap_full_prune_interval``.
+
+Essentially, we are proposing to keep ~10% of the full maps, but we will
+always honour the minimum number of osdmap epochs, as defined by
+``mon_min_osdmap_epochs``, and these won't be used for the count of the
+minimum versions to prune. For instance, if we have on-disk versions
+[1..50000], we would allow the pruning algorithm to operate only over
+osdmap epochs [1..49500); but, if have on-disk versions [1..10200], we
+won't be pruning because the algorithm would only operate on versions
+[1..9700), and this interval contains less versions than the minimum
+required by ``mon_osdmap_full_prune_min``.
+
+
+ALGORITHM
+=========
+
+Say we have 50,000 osdmap epochs in the store, and we're using the
+defaults for all configurable options.
+
+::
+
+ -----------------------------------------------------------
+ |1|2|..|10|11|..|100|..|1000|..|10000|10001|..|49999|50000|
+ -----------------------------------------------------------
+ ^ first last ^
+
+We will prune when all the following constraints are met:
+
+1. number of versions is greater than ``mon_min_osdmap_epochs``;
+
+2. the number of versions between ``first`` and ``prune_to`` is greater (or
+ equal) than ``mon_osdmap_full_prune_min``, with ``prune_to`` being equal to
+ ``last`` minus ``mon_min_osdmap_epochs``.
+
+If any of these conditions fails, we will *not* prune any maps.
+
+Furthermore, if it is known that we have been pruning, but since then we
+are no longer satisfying at least one of the above constraints, we will
+not continue to prune. In essence, we only prune full osdmaps if the
+number of epochs in the store so warrants it.
+
+As pruning will create gaps in the sequence of full maps, we need to keep
+track of the intervals of missing maps. We do this by keeping a manifest of
+pinned maps -- i.e., a list of maps that, by being pinned, are not to be
+pruned.
+
+While pinned maps are not removed from the store, maps between two consecutive
+pinned maps will; and the number of maps to be removed will be dictated by the
+configurable option ``mon_osdmap_full_prune_interval``. The algorithm makes an
+effort to keep pinned maps apart by as many maps as defined by this option,
+but in the event of corner cases it may allow smaller intervals. Additionally,
+as this is a configurable option that is read any time a prune iteration
+occurs, there is the possibility this interval will change if the user changes
+this config option.
+
+Pinning maps is performed lazily: we will be pinning maps as we are removing
+maps. This grants us more flexibility to change the prune interval while
+pruning is happening, but also simplifies considerably the algorithm, as well
+as the information we need to keep in the manifest. Below we show a simplified
+version of the algorithm:::
+
+ manifest.pin(first)
+ last_to_prune = last - mon_min_osdmap_epochs
+
+ while manifest.get_last_pinned() + prune_interval < last_to_prune AND
+ last_to_prune - first > mon_min_osdmap_epochs AND
+ last_to_prune - first > mon_osdmap_full_prune_min AND
+ num_pruned < mon_osdmap_full_prune_txsize:
+
+ last_pinned = manifest.get_last_pinned()
+ new_pinned = last_pinned + prune_interval
+ manifest.pin(new_pinned)
+ for e in (last_pinned .. new_pinned):
+ store.erase(e)
+ ++num_pruned
+
+In essence, the algorithm ensures that the first version in the store is
+*always* pinned. After all, we need a starting point when rebuilding maps, and
+we can't simply remove the earliest map we have; otherwise we would be unable
+to rebuild maps for the very first pruned interval.
+
+Once we have at least one pinned map, each iteration of the algorithm can
+simply base itself on the manifest's last pinned map (which we can obtain by
+reading the element at the tail of the manifest's pinned maps list).
+
+We'll next need to determine the interval of maps to be removed: all the maps
+from ``last_pinned`` up to ``new_pinned``, which in turn is nothing more than
+``last_pinned`` plus ``mon_osdmap_full_prune_interval``. We know that all maps
+between these two values, ``last_pinned`` and ``new_pinned`` can be removed,
+considering ``new_pinned`` has been pinned.
+
+The algorithm ceases to execute as soon as one of the two initial
+preconditions is not met, or if we do not meet two additional conditions that
+have no weight on the algorithm's correctness:
+
+1. We will stop if we are not able to create a new pruning interval properly
+ aligned with ``mon_osdmap_full_prune_interval`` that is lower than
+ ``last_pruned``. There is no particular technical reason why we enforce
+ this requirement, besides allowing us to keep the intervals with an
+ expected size, and preventing small, irregular intervals that would be
+ bound to happen eventually (e.g., pruning continues over the course of
+ several iterations, removing one or two or three maps each time).
+
+2. We will stop once we know that we have pruned more than a certain number of
+ maps. This value is defined by ``mon_osdmap_full_prune_txsize``, and
+ ensures we don't spend an unbounded number of cycles pruning maps. We don't
+ enforce this value religiously (deletes do not cost much), but we make an
+ effort to honor it.
+
+We could do the removal in one go, but we have no idea how long that would
+take. Therefore, we will perform several iterations, removing at most
+``mon_osdmap_full_prune_txsize`` osdmaps per iteration.
+
+In the end, our on-disk map sequence will look similar to::
+
+ ------------------------------------------
+ |1|10|20|30|..|49500|49501|..|49999|50000|
+ ------------------------------------------
+ ^ first last ^
+
+
+Because we are not pruning all versions in one go, we need to keep state
+about how far along on our pruning we are. With that in mind, we have
+created a data structure, ``osdmap_manifest_t``, that holds the set of pinned
+maps:::
+
+ struct osdmap_manifest_t:
+ set<version_t> pinned;
+
+Given we are only pinning maps while we are pruning, we don't need to keep
+track of additional state about the last pruned version. We know as a matter
+of fact that we have pruned all the intermediate maps between any two
+consecutive pinned maps.
+
+The question one could ask, though, is how can we be sure we pruned all the
+intermediate maps if, for instance, the monitor crashes. To ensure we are
+protected against such an event, we always write the osdmap manifest to disk
+on the same transaction that is deleting the maps. This way we have the
+guarantee that, if the monitor crashes, we will read the latest version of the
+manifest: either containing the newly pinned maps, meaning we also pruned the
+in-between maps; or we will find the previous version of the osdmap manifest,
+which will not contain the maps we were pinning at the time we crashed, given
+the transaction on which we would be writing the updated osdmap manifest was
+not applied (alongside with the maps removal).
+
+The osdmap manifest will be written to the store each time we prune, with an
+updated list of pinned maps. It is written in the transaction effectively
+pruning the maps, so we guarantee the manifest is always up to date. As a
+consequence of this criteria, the first time we will write the osdmap manifest
+is the first time we prune. If an osdmap manifest does not exist, we can be
+certain we do not hold pruned map intervals.
+
+We will rely on the manifest to ascertain whether we have pruned maps
+intervals. In theory, this will always be the on-disk osdmap manifest, but we
+make sure to read the on-disk osdmap manifest each time we update from paxos;
+this way we always ensure having an up to date in-memory osdmap manifest.
+
+Once we finish pruning maps, we will keep the manifest in the store, to
+allow us to easily find which maps have been pinned (instead of checking
+the store until we find a map). This has the added benefit of allowing us to
+quickly figure out which is the next interval we need to prune (i.e., last
+pinned plus the prune interval). This doesn't however mean we will forever
+keep the osdmap manifest: the osdmap manifest will no longer be required once
+the monitor trims osdmaps and the earliest available epoch in the store is
+greater than the last map we pruned.
+
+The same conditions from ``OSDMonitor::get_trim_to()`` that force the monitor
+to keep a lot of osdmaps, thus requiring us to prune, may eventually change
+and allow the monitor to remove some of its oldest maps.
+
+MAP TRIMMING
+------------
+
+If the monitor trims maps, we must then adjust the osdmap manifest to
+reflect our pruning status, or remove the manifest entirely if it no longer
+makes sense to keep it. For instance, take the map sequence from before, but
+let us assume we did not finish pruning all the maps.::
+
+ -------------------------------------------------------------
+ |1|10|20|30|..|490|500|501|502|..|49500|49501|..|49999|50000|
+ -------------------------------------------------------------
+ ^ first ^ pinned.last() last ^
+
+ pinned = {1, 10, 20, ..., 490, 500}
+
+Now let us assume that the monitor will trim up to epoch 501. This means
+removing all maps prior to epoch 501, and updating the ``first_committed``
+pointer to ``501``. Given removing all those maps would invalidate our
+existing pruning efforts, we can consider our pruning has finished and drop
+our osdmap manifest. Doing so also simplifies starting a new prune, if all
+the starting conditions are met once we refreshed our state from the
+store.
+
+We would then have the following map sequence: ::
+
+ ---------------------------------------
+ |501|502|..|49500|49501|..|49999|50000|
+ ---------------------------------------
+ ^ first last ^
+
+However, imagine a slightly more convoluted scenario: the monitor will trim
+up to epoch 491. In this case, epoch 491 has been previously pruned from the
+store.
+
+Given we will always need to have the oldest known map in the store, before
+we trim we will have to check whether that map is in the prune interval
+(i.e., if said map epoch belongs to ``[ pinned.first()..pinned.last() )``).
+If so, we need to check if this is a pinned map, in which case we don't have
+much to be concerned aside from removing lower epochs from the manifest's
+pinned list. On the other hand, if the map being trimmed to is not a pinned
+map, we will need to rebuild said map and pin it, and only then will we remove
+the pinned maps prior to the map's epoch.
+
+In this case, we would end up with the following sequence:::
+
+ -----------------------------------------------
+ |491|500|501|502|..|49500|49501|..|49999|50000|
+ -----------------------------------------------
+ ^ ^- pinned.last() last ^
+ `- first
+
+There is still an edge case that we should mention. Consider that we are
+going to trim up to epoch 499, which is the very last pruned epoch.
+
+Much like the scenario above, we would end up writing osdmap epoch 499 to
+the store; but what should we do about pinned maps and pruning?
+
+The simplest solution is to drop the osdmap manifest. After all, given we
+are trimming to the last pruned map, and we are rebuilding this map, we can
+guarantee that all maps greater than e 499 are sequential (because we have
+not pruned any of them). In essence, dropping the osdmap manifest in this
+case is essentially the same as if we were trimming over the last pruned
+epoch: we can prune again later if we meet the required conditions.
+
+And, with this, we have fully dwelled into full osdmap pruning. Later in this
+document one can find detailed `REQUIREMENTS, CONDITIONS & INVARIANTS` for the
+whole algorithm, from pruning to trimming. Additionally, the next section
+details several additional checks to guarantee the sanity of our configuration
+options. Enjoy.
+
+
+CONFIGURATION OPTIONS SANITY CHECKS
+-----------------------------------
+
+We perform additional checks before pruning to ensure all configuration
+options involved are sane:
+
+1. If ``mon_osdmap_full_prune_interval`` is zero we will not prune; we
+ require an actual positive number, greater than one, to be able to prune
+ maps. If the interval is one, we would not actually be pruning any maps, as
+ the interval between pinned maps would essentially be a single epoch. This
+ means we would have zero maps in-between pinned maps, hence no maps would
+ ever be pruned.
+
+2. If ``mon_osdmap_full_prune_min`` is zero we will not prune; we require a
+ positive, greater than zero, value so we know the threshold over which we
+ should prune. We don't want to guess.
+
+3. If ``mon_osdmap_full_prune_interval`` is greater than
+ ``mon_osdmap_full_prune_min`` we will not prune, as it is impossible to
+ ascertain a proper prune interval.
+
+4. If ``mon_osdmap_full_prune_txsize`` is lower than
+ ``mon_osdmap_full_prune_interval`` we will not prune; we require a
+ ``txsize`` with a value at least equal than ``interval``, and (depending on
+ the value of the latter) ideally higher.
+
+
+REQUIREMENTS, CONDITIONS & INVARIANTS
+-------------------------------------
+
+REQUIREMENTS
+~~~~~~~~~~~~
+
+* All monitors in the quorum need to support pruning.
+
+* Once pruning has been enabled, monitors not supporting pruning will not be
+ allowed in the quorum, nor will be allowed to synchronize.
+
+* Removing the osdmap manifest results in disabling the pruning feature quorum
+ requirement. This means that monitors not supporting pruning will be allowed
+ to synchronize and join the quorum, granted they support any other features
+ required.
+
+
+CONDITIONS & INVARIANTS
+~~~~~~~~~~~~~~~~~~~~~~~
+
+* Pruning has never happened, or we have trimmed past its previous
+ intervals:::
+
+ invariant: first_committed > 1
+
+ condition: pinned.empty() AND !store.exists(manifest)
+
+
+* Pruning has happened at least once:::
+
+ invariant: first_committed > 0
+ invariant: !pinned.empty())
+ invariant: pinned.first() == first_committed
+ invariant: pinned.last() < last_committed
+
+ precond: pinned.last() < prune_to AND
+ pinned.last() + prune_interval < prune_to
+
+ postcond: pinned.size() > old_pinned.size() AND
+ (for each v in [pinned.first()..pinned.last()]:
+ if pinned.count(v) > 0: store.exists_full(v)
+ else: !store.exists_full(v)
+ )
+
+
+* Pruning has finished:::
+
+ invariant: first_committed > 0
+ invariant: !pinned.empty()
+ invariant: pinned.first() == first_committed
+ invariant: pinned.last() < last_committed
+
+ condition: pinned.last() == prune_to OR
+ pinned.last() + prune_interval < prune_to
+
+
+* Pruning intervals can be trimmed:::
+
+ precond: OSDMonitor::get_trim_to() > 0
+
+ condition: !pinned.empty()
+
+ invariant: pinned.first() == first_committed
+ invariant: pinned.last() < last_committed
+ invariant: pinned.first() <= OSDMonitor::get_trim_to()
+ invariant: pinned.last() >= OSDMonitor::get_trim_to()
+
+* Trim pruned intervals:::
+
+ invariant: !pinned.empty()
+ invariant: pinned.first() == first_committed
+ invariant: pinned.last() < last_committed
+ invariant: pinned.first() <= OSDMonitor::get_trim_to()
+ invariant: pinned.last() >= OSDMonitor::get_trim_to()
+
+ postcond: pinned.empty() OR
+ (pinned.first() == OSDMonitor::get_trim_to() AND
+ pinned.last() > pinned.first() AND
+ (for each v in [0..pinned.first()]:
+ !store.exists(v) AND
+ !store.exists_full(v)
+ ) AND
+ (for each m in [pinned.first()..pinned.last()]:
+ if pinned.count(m) > 0: store.exists_full(m)
+ else: !store.exists_full(m) AND store.exists(m)
+ )
+ )
+ postcond: !pinned.empty() OR
+ (!store.exists(manifest) AND
+ (for each v in [pinned.first()..pinned.last()]:
+ !store.exists(v) AND
+ !store.exists_full(v)
+ )
+ )
+
diff --git a/doc/dev/msgr2.rst b/doc/dev/msgr2.rst
new file mode 100644
index 000000000..585dc34d2
--- /dev/null
+++ b/doc/dev/msgr2.rst
@@ -0,0 +1,840 @@
+.. _msgr2-protocol:
+
+msgr2 protocol (msgr2.0 and msgr2.1)
+====================================
+
+This is a revision of the legacy Ceph on-wire protocol that was
+implemented by the SimpleMessenger. It addresses performance and
+security issues.
+
+Goals
+-----
+
+This protocol revision has several goals relative to the original protocol:
+
+* *Flexible handshaking*. The original protocol did not have a
+ sufficiently flexible protocol negotiation that allows for features
+ that were not required.
+* *Encryption*. We will incorporate encryption over the wire.
+* *Performance*. We would like to provide for protocol features
+ (e.g., padding) that keep computation and memory copies out of the
+ fast path where possible.
+* *Signing*. We will allow for traffic to be signed (but not
+ necessarily encrypted). This is not implemented.
+
+Definitions
+-----------
+
+* *client* (C): the party initiating a (TCP) connection
+* *server* (S): the party accepting a (TCP) connection
+* *connection*: an instance of a (TCP) connection between two processes.
+* *entity*: a ceph entity instantiation, e.g. 'osd.0'. each entity
+ has one or more unique entity_addr_t's by virtue of the 'nonce'
+ field, which is typically a pid or random value.
+* *session*: a stateful session between two entities in which message
+ exchange is ordered and lossless. A session might span multiple
+ connections if there is an interruption (TCP connection disconnect).
+* *frame*: a discrete message sent between the peers. Each frame
+ consists of a tag (type code), payload, and (if signing
+ or encryption is enabled) some other fields. See below for the
+ structure.
+* *tag*: a type code associated with a frame. The tag
+ determines the structure of the payload.
+
+Phases
+------
+
+A connection has four distinct phases:
+
+#. banner
+#. authentication frame exchange
+#. message flow handshake frame exchange
+#. message frame exchange
+
+Banner
+------
+
+Both the client and server, upon connecting, send a banner::
+
+ "ceph v2\n"
+ __le16 banner payload length
+ banner payload
+
+A banner payload has the form::
+
+ __le64 peer_supported_features
+ __le64 peer_required_features
+
+This is a new, distinct feature bit namespace (CEPH_MSGR2_*).
+Currently, only CEPH_MSGR2_FEATURE_REVISION_1 is defined. It is
+supported but not required, so that msgr2.0 and msgr2.1 peers
+can talk to each other.
+
+If the remote party advertises required features we don't support, we
+can disconnect.
+
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | send banner |
+ |----+ +----|
+ | | | |
+ | +-------+--->|
+ | send banner| |
+ |<-----------+ |
+ | |
+
+Frame format
+------------
+
+After the banners are exchanged, all further communication happens
+in frames. The exact format of the frame depends on the connection
+mode (msgr2.0-crc, msgr2.0-secure, msgr2.1-crc or msgr2.1-secure).
+All connections start in crc mode (either msgr2.0-crc or msgr2.1-crc,
+depending on peer_supported_features from the banner).
+
+Each frame has a 32-byte preamble::
+
+ __u8 tag
+ __u8 number of segments
+ {
+ __le32 segment length
+ __le16 segment alignment
+ } * 4
+ reserved (2 bytes)
+ __le32 preamble crc
+
+An empty frame has one empty segment. A non-empty frame can have
+between one and four segments, all segments except the last may be
+empty.
+
+If there are less than four segments, unused (trailing) segment
+length and segment alignment fields are zeroed.
+
+The reserved bytes are zeroed.
+
+The preamble checksum is CRC32-C. It covers everything up to
+itself (28 bytes) and is calculated and verified irrespective of
+the connection mode (i.e. even if the frame is encrypted).
+
+### msgr2.0-crc mode
+
+A msgr2.0-crc frame has the form::
+
+ preamble (32 bytes)
+ {
+ segment payload
+ } * number of segments
+ epilogue (17 bytes)
+
+where epilogue is::
+
+ __u8 late_flags
+ {
+ __le32 segment crc
+ } * 4
+
+late_flags is used for frame abortion. After transmitting the
+preamble and the first segment, the sender can fill the remaining
+segments with zeros and set a flag to indicate that the receiver must
+drop the frame. This allows the sender to avoid extra buffering
+when a frame that is being put on the wire is revoked (i.e. yanked
+out of the messenger): payload buffers can be unpinned and handed
+back to the user immediately, without making a copy or blocking
+until the whole frame is transmitted. Currently this is used only
+by the kernel client, see ceph_msg_revoke().
+
+The segment checksum is CRC32-C. For "used" empty segments, it is
+set to (__le32)-1. For unused (trailing) segments, it is zeroed.
+
+The crcs are calculated just to protect against bit errors.
+No authenticity guarantees are provided, unlike in msgr1 which
+attempted to provide some authenticity guarantee by optionally
+signing segment lengths and crcs with the session key.
+
+Issues:
+
+1. As part of introducing a structure for a generic frame with
+ variable number of segments suitable for both control and
+ message frames, msgr2.0 moved the crc of the first segment of
+ the message frame (ceph_msg_header2) into the epilogue.
+
+ As a result, ceph_msg_header2 can no longer be safely
+ interpreted before the whole frame is read off the wire.
+ This is a regression from msgr1, because in order to scatter
+ the payload directly into user-provided buffers and thus avoid
+ extra buffering and copying when receiving message frames,
+ ceph_msg_header2 must be available in advance -- it stores
+ the transaction id which the user buffers are keyed on.
+ The implementation has to choose between forgoing this
+ optimization or acting on an unverified segment.
+
+2. late_flags is not covered by any crc. Since it stores the
+ abort flag, a single bit flip can result in a completed frame
+ being dropped (causing the sender to hang waiting for a reply)
+ or, worse, in an aborted frame with garbage segment payloads
+ being dispatched.
+
+ This was the case with msgr1 and got carried over to msgr2.0.
+
+### msgr2.1-crc mode
+
+Differences from msgr2.0-crc:
+
+1. The crc of the first segment is stored at the end of the
+ first segment, not in the epilogue. The epilogue stores up to
+ three crcs, not up to four.
+
+ If the first segment is empty, (__le32)-1 crc is not generated.
+
+2. The epilogue is generated only if the frame has more than one
+ segment (i.e. at least one of second to fourth segments is not
+ empty). Rationale: If the frame has only one segment, it cannot
+ be aborted and there are no crcs to store in the epilogue.
+
+3. Unchecksummed late_flags is replaced with late_status which
+ builds in bit error detection by using a 4-bit nibble per flag
+ and two code words that are Hamming Distance = 4 apart (and not
+ all zeros or ones). This comes at the expense of having only
+ one reserved flag, of course.
+
+Some example frames:
+
+* A 0+0+0+0 frame (empty, no epilogue)::
+
+ preamble (32 bytes)
+
+* A 20+0+0+0 frame (no epilogue)::
+
+ preamble (32 bytes)
+ segment1 payload (20 bytes)
+ __le32 segment1 crc
+
+* A 0+70+0+0 frame::
+
+ preamble (32 bytes)
+ segment2 payload (70 bytes)
+ epilogue (13 bytes)
+
+* A 20+70+0+350 frame::
+
+ preamble (32 bytes)
+ segment1 payload (20 bytes)
+ __le32 segment1 crc
+ segment2 payload (70 bytes)
+ segment4 payload (350 bytes)
+ epilogue (13 bytes)
+
+where epilogue is::
+
+ __u8 late_status
+ {
+ __le32 segment crc
+ } * 3
+
+Hello
+-----
+
+* TAG_HELLO: client->server and server->client::
+
+ __u8 entity_type
+ entity_addr_t peer_socket_address
+
+ - We immediately share our entity type and the address of the peer (which can be useful
+ for detecting our effective IP address, especially in the presence of NAT).
+
+
+Authentication
+--------------
+
+* TAG_AUTH_REQUEST: client->server::
+
+ __le32 method; // CEPH_AUTH_{NONE, CEPHX, ...}
+ __le32 num_preferred_modes;
+ list<__le32> mode // CEPH_CON_MODE_*
+ method specific payload
+
+* TAG_AUTH_BAD_METHOD server -> client: reject client-selected auth method::
+
+ __le32 method
+ __le32 negative error result code
+ __le32 num_methods
+ list<__le32> allowed_methods // CEPH_AUTH_{NONE, CEPHX, ...}
+ __le32 num_modes
+ list<__le32> allowed_modes // CEPH_CON_MODE_*
+
+ - Returns the attempted auth method, and error code (-EOPNOTSUPP if
+ the method is unsupported), and the list of allowed authentication
+ methods.
+
+* TAG_AUTH_REPLY_MORE: server->client::
+
+ __le32 len;
+ method specific payload
+
+* TAG_AUTH_REQUEST_MORE: client->server::
+
+ __le32 len;
+ method specific payload
+
+* TAG_AUTH_DONE: (server->client)::
+
+ __le64 global_id
+ __le32 connection mode // CEPH_CON_MODE_*
+ method specific payload
+
+ - The server is the one to decide authentication has completed and what
+ the final connection mode will be.
+
+
+Example of authentication phase interaction when the client uses an
+allowed authentication method:
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | auth request |
+ |---------------->|
+ |<----------------|
+ | auth more|
+ | |
+ |auth more |
+ |---------------->|
+ |<----------------|
+ | auth done|
+
+
+Example of authentication phase interaction when the client uses a forbidden
+authentication method as the first attempt:
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | auth request |
+ |---------------->|
+ |<----------------|
+ | bad method |
+ | |
+ | auth request |
+ |---------------->|
+ |<----------------|
+ | auth more|
+ | |
+ | auth more |
+ |---------------->|
+ |<----------------|
+ | auth done|
+
+
+Post-auth frame format
+----------------------
+
+Depending on the negotiated connection mode from TAG_AUTH_DONE, the
+connection either stays in crc mode or switches to the corresponding
+secure mode (msgr2.0-secure or msgr2.1-secure).
+
+### msgr2.0-secure mode
+
+A msgr2.0-secure frame has the form::
+
+ {
+ preamble (32 bytes)
+ {
+ segment payload
+ zero padding (out to 16 bytes)
+ } * number of segments
+ epilogue (16 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+where epilogue is::
+
+ __u8 late_flags
+ zero padding (15 bytes)
+
+late_flags has the same meaning as in msgr2.0-crc mode.
+
+Each segment and the epilogue are zero padded out to 16 bytes.
+Technically, GCM doesn't require any padding because Counter mode
+(the C in GCM) essentially turns a block cipher into a stream cipher.
+But, if the overall input length is not a multiple of 16 bytes, some
+implicit zero padding would occur internally because GHASH function
+used by GCM for generating auth tags only works on 16-byte blocks.
+
+Issues:
+
+1. The sender encrypts the whole frame using a single nonce
+ and generating a single auth tag. Because segment lengths are
+ stored in the preamble, the receiver has no choice but to decrypt
+ and interpret the preamble without verifying the auth tag -- it
+ can't even tell how much to read off the wire to get the auth tag
+ otherwise! This creates a decryption oracle, which, in conjunction
+ with Counter mode malleability, could lead to recovery of sensitive
+ information.
+
+ This issue extends to the first segment of the message frame as
+ well. As in msgr2.0-crc mode, ceph_msg_header2 cannot be safely
+ interpreted before the whole frame is read off the wire.
+
+2. Deterministic nonce construction with a 4-byte counter field
+ followed by an 8-byte fixed field is used. The initial values are
+ taken from the connection secret -- a random byte string generated
+ during the authentication phase. Because the counter field is
+ only four bytes long, it can wrap and then repeat in under a day,
+ leading to GCM nonce reuse and therefore a potential complete
+ loss of both authenticity and confidentiality for the connection.
+ This was addressed by disconnecting before the counter repeats
+ (CVE-2020-1759).
+
+### msgr2.1-secure mode
+
+Differences from msgr2.0-secure:
+
+1. The preamble, the first segment and the rest of the frame are
+ encrypted separately, using separate nonces and generating
+ separate auth tags. This gets rid of unverified plaintext use
+ and keeps msgr2.1-secure mode close to msgr2.1-crc mode, allowing
+ the implementation to receive message frames in a similar fashion
+ (little to no buffering, same scatter/gather logic, etc).
+
+ In order to reduce the number of en/decryption operations per
+ frame, the preamble is grown by a fixed size inline buffer (48
+ bytes) that the first segment is inlined into, either fully or
+ partially. The preamble auth tag covers both the preamble and the
+ inline buffer, so if the first segment is small enough to be fully
+ inlined, it becomes available after a single decryption operation.
+
+2. As in msgr2.1-crc mode, the epilogue is generated only if the
+ frame has more than one segment. The rationale is even stronger,
+ as it would require an extra en/decryption operation.
+
+3. For consistency with msgr2.1-crc mode, late_flags is replaced
+ with late_status (the built-in bit error detection isn't really
+ needed in secure mode).
+
+4. In accordance with `NIST Recommendation for GCM`_, deterministic
+ nonce construction with a 4-byte fixed field followed by an 8-byte
+ counter field is used. An 8-byte counter field should never repeat
+ but the nonce reuse protection put in place for msgr2.0-secure mode
+ is still there.
+
+ The initial values are the same as in msgr2.0-secure mode.
+
+ .. _`NIST Recommendation for GCM`: https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+
+As in msgr2.0-secure mode, each segment is zero padded out to
+16 bytes. If the first segment is fully inlined, its padding goes
+to the inline buffer. Otherwise, the padding is on the remainder.
+The corollary to this is that the inline buffer is consumed in
+16-byte chunks.
+
+The unused portion of the inline buffer is zeroed.
+
+Some example frames:
+
+* A 0+0+0+0 frame (empty, nothing to inline, no epilogue)::
+
+ {
+ preamble (32 bytes)
+ zero padding (48 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+* A 20+0+0+0 frame (first segment fully inlined, no epilogue)::
+
+ {
+ preamble (32 bytes)
+ segment1 payload (20 bytes)
+ zero padding (28 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+* A 0+70+0+0 frame (nothing to inline)::
+
+ {
+ preamble (32 bytes)
+ zero padding (48 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+ {
+ segment2 payload (70 bytes)
+ zero padding (10 bytes)
+ epilogue (16 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+* A 20+70+0+350 frame (first segment fully inlined)::
+
+ {
+ preamble (32 bytes)
+ segment1 payload (20 bytes)
+ zero padding (28 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+ {
+ segment2 payload (70 bytes)
+ zero padding (10 bytes)
+ segment4 payload (350 bytes)
+ zero padding (2 bytes)
+ epilogue (16 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+* A 105+0+0+0 frame (first segment partially inlined, no epilogue)::
+
+ {
+ preamble (32 bytes)
+ segment1 payload (48 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+ {
+ segment1 payload remainder (57 bytes)
+ zero padding (7 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+* A 105+70+0+350 frame (first segment partially inlined)::
+
+ {
+ preamble (32 bytes)
+ segment1 payload (48 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+ {
+ segment1 payload remainder (57 bytes)
+ zero padding (7 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+ {
+ segment2 payload (70 bytes)
+ zero padding (10 bytes)
+ segment4 payload (350 bytes)
+ zero padding (2 bytes)
+ epilogue (16 bytes)
+ } ^ AES-128-GCM cipher
+ auth tag (16 bytes)
+
+where epilogue is::
+
+ __u8 late_status
+ zero padding (15 bytes)
+
+late_status has the same meaning as in msgr2.1-crc mode.
+
+Message flow handshake
+----------------------
+
+In this phase the peers identify each other and (if desired) reconnect to
+an established session.
+
+* TAG_CLIENT_IDENT (client->server): identify ourselves::
+
+ __le32 num_addrs
+ entity_addrvec_t*num_addrs entity addrs
+ entity_addr_t target entity addr
+ __le64 gid (numeric part of osd.0, client.123456, ...)
+ __le64 global_seq
+ __le64 features supported (CEPH_FEATURE_* bitmask)
+ __le64 features required (CEPH_FEATURE_* bitmask)
+ __le64 flags (CEPH_MSG_CONNECT_* bitmask)
+ __le64 cookie
+
+ - client will send first, server will reply with same. if this is a
+ new session, the client and server can proceed to the message exchange.
+ - the target addr is who the client is trying to connect *to*, so
+ that the server side can close the connection if the client is
+ talking to the wrong daemon.
+ - type.gid (entity_name_t) is set here, by combinging the type shared in the hello
+ frame with the gid here. this means we don't need it
+ in the header of every message. it also means that we can't send
+ messages "from" other entity_name_t's. the current
+ implementations set this at the top of _send_message etc so this
+ shouldn't break any existing functionality. implementation will
+ likely want to mask this against what the authenticated credential
+ allows.
+ - cookie is the client coookie used to identify a session, and can be used
+ to reconnect to an existing session.
+ - we've dropped the 'protocol_version' field from msgr1
+
+* TAG_IDENT_MISSING_FEATURES (server->client): complain about a TAG_IDENT
+ with too few features::
+
+ __le64 features we require that the peer didn't advertise
+
+* TAG_SERVER_IDENT (server->client): accept client ident and identify server::
+
+ __le32 num_addrs
+ entity_addrvec_t*num_addrs entity addrs
+ __le64 gid (numeric part of osd.0, client.123456, ...)
+ __le64 global_seq
+ __le64 features supported (CEPH_FEATURE_* bitmask)
+ __le64 features required (CEPH_FEATURE_* bitmask)
+ __le64 flags (CEPH_MSG_CONNECT_* bitmask)
+ __le64 cookie
+
+ - The server cookie can be used by the client if it is later disconnected
+ and wants to reconnect and resume the session.
+
+* TAG_RECONNECT (client->server): reconnect to an established session::
+
+ __le32 num_addrs
+ entity_addr_t * num_addrs
+ __le64 client_cookie
+ __le64 server_cookie
+ __le64 global_seq
+ __le64 connect_seq
+ __le64 msg_seq (the last msg seq received)
+
+* TAG_RECONNECT_OK (server->client): acknowledge a reconnect attempt::
+
+ __le64 msg_seq (last msg seq received)
+
+ - once the client receives this, the client can proceed to message exchange.
+ - once the server sends this, the server can proceed to message exchange.
+
+* TAG_RECONNECT_RETRY_SESSION (server only): fail reconnect due to stale connect_seq
+
+* TAG_RECONNECT_RETRY_GLOBAL (server only): fail reconnect due to stale global_seq
+
+* TAG_RECONNECT_WAIT (server only): fail reconnect due to connect race.
+
+ - Indicates that the server is already connecting to the client, and
+ that direction should win the race. The client should wait for that
+ connection to complete.
+
+* TAG_RESET_SESSION (server only): ask client to reset session::
+
+ __u8 full
+
+ - full flag indicates whether peer should do a full reset, i.e., drop
+ message queue.
+
+
+Example of failure scenarios:
+
+* First client's client_ident message is lost, and then client reconnects.
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | |
+ c_cookie(a) | client_ident(a) |
+ |-------------X |
+ | |
+ | client_ident(a) |
+ |-------------------->|
+ |<--------------------|
+ | server_ident(b) | s_cookie(b)
+ | |
+ | session established |
+ | |
+
+
+* Server's server_ident message is lost, and then client reconnects.
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | |
+ c_cookie(a) | client_ident(a) |
+ |-------------------->|
+ | X------------|
+ | server_ident(b) | s_cookie(b)
+ | |
+ | |
+ | client_ident(a) |
+ |-------------------->|
+ |<--------------------|
+ | server_ident(c) | s_cookie(c)
+ | |
+ | session established |
+ | |
+
+
+* Server's server_ident message is lost, and then server reconnects.
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | |
+ c_cookie(a) | client_ident(a) |
+ |-------------------->|
+ | X------------|
+ | server_ident(b) | s_cookie(b)
+ | |
+ | |
+ | reconnect(a, b) |
+ |<--------------------|
+ |-------------------->|
+ | reset_session(F) |
+ | |
+ | client_ident(a) | c_cookie(a)
+ |<--------------------|
+ |-------------------->|
+ s_cookie(c) | server_ident(c) |
+ | |
+
+
+* Connection failure after session is established, and then client reconnects.
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | |
+ c_cookie(a) | session established | s_cookie(b)
+ |<------------------->|
+ | X------------|
+ | |
+ | reconnect(a, b) |
+ |-------------------->|
+ |<--------------------|
+ | reconnect_ok |
+ | |
+
+
+* Connection failure after session is established because server reset,
+ and then client reconnects.
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | |
+ c_cookie(a) | session established | s_cookie(b)
+ |<------------------->|
+ | X------------| reset
+ | |
+ | reconnect(a, b) |
+ |-------------------->|
+ |<--------------------|
+ | reset_session(RC*) |
+ | |
+ c_cookie(c) | client_ident(c) |
+ |-------------------->|
+ |<--------------------|
+ | server_ident(d) | s_cookie(d)
+ | |
+
+RC* means that the reset session full flag depends on the policy.resetcheck
+of the connection.
+
+
+* Connection failure after session is established because client reset,
+ and then client reconnects.
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | |
+ c_cookie(a) | session established | s_cookie(b)
+ |<------------------->|
+ reset | X------------|
+ | |
+ c_cookie(c) | client_ident(c) |
+ |-------------------->|
+ |<--------------------| reset if policy.resetcheck
+ | server_ident(d) | s_cookie(d)
+ | |
+
+
+Message exchange
+----------------
+
+Once a session is established, we can exchange messages.
+
+* TAG_MSG: a message::
+
+ ceph_msg_header2
+ front
+ middle
+ data_pre_padding
+ data
+
+ - The ceph_msg_header2 is modified from ceph_msg_header:
+ * include an ack_seq. This avoids the need for a TAG_ACK
+ message most of the time.
+ * remove the src field, which we now get from the message flow
+ handshake (TAG_IDENT).
+ * specifies the data_pre_padding length, which can be used to
+ adjust the alignment of the data payload. (NOTE: is this is
+ useful?)
+
+* TAG_ACK: acknowledge receipt of message(s)::
+
+ __le64 seq
+
+ - This is only used for stateful sessions.
+
+* TAG_KEEPALIVE2: check for connection liveness::
+
+ ceph_timespec stamp
+
+ - Time stamp is local to sender.
+
+* TAG_KEEPALIVE2_ACK: reply to a keepalive2::
+
+ ceph_timestamp stamp
+
+ - Time stamp is from the TAG_KEEPALIVE2 we are responding to.
+
+* TAG_CLOSE: terminate a connection
+
+ Indicates that a connection should be terminated. This is equivalent
+ to a hangup or reset (i.e., should trigger ms_handle_reset). It
+ isn't strictly necessary or useful as we could just disconnect the
+ TCP connection.
+
+
+Example of protocol interaction (WIP)
+_____________________________________
+
+
+.. ditaa::
+
+ +---------+ +--------+
+ | Client | | Server |
+ +---------+ +--------+
+ | send banner |
+ |----+ +------|
+ | | | |
+ | +-------+----->|
+ | send banner| |
+ |<-----------+ |
+ | |
+ | send new stream |
+ |------------------>|
+ | auth request |
+ |------------------>|
+ |<------------------|
+ | bad method |
+ | |
+ | auth request |
+ |------------------>|
+ |<------------------|
+ | auth more |
+ | |
+ | auth more |
+ |------------------>|
+ |<------------------|
+ | auth done |
+ | |
+
+
diff --git a/doc/dev/network-encoding.rst b/doc/dev/network-encoding.rst
new file mode 100644
index 000000000..d59b0ee9e
--- /dev/null
+++ b/doc/dev/network-encoding.rst
@@ -0,0 +1,214 @@
+==================
+ Network Encoding
+==================
+
+This describes the encoding used to serialize data. It doesn't cover specific
+objects/messages but focuses on the base types.
+
+The types are not self documenting in any way. They can not be decoded unless
+you know what they are.
+
+Conventions
+===========
+
+Integers
+--------
+
+The integer types used will be named ``{signed}{size}{endian}``. For example
+``u16le`` is an unsigned 16 bit integer encoded in little endian byte order
+while ``s64be`` is a signed 64 bit integer in big endian. Additionally ``u8``
+and ``s8`` will represent signed and unsigned bytes respectively. Signed
+integers use two's complement encoding.
+
+Complex Types
+-------------
+
+This document will use a c-like syntax for describing structures. The
+structure represents the data that will go over the wire. There will be no
+padding between the elements and the elements will be sent in the order they
+appear. For example::
+
+ struct foo {
+ u8 tag;
+ u32le data;
+ }
+
+When encoding the values ``0x05`` and ``0x12345678`` respectively will appear on
+the wire as ``05 78 56 34 12``.
+
+Variable Arrays
+---------------
+
+Unlike c, length arrays can be used anywhere in structures and will be inline in
+the protocol. Furthermore the length may be described using an earlier item in
+the structure.
+
+::
+
+ struct blob {
+ u32le size;
+ u8 data[size];
+ u32le checksum;
+ }
+
+This structure is encoded as a 32 bit size, followed by ``size`` data bytes,
+then a 32 bit checksum.
+
+Primitive Aliases
+-----------------
+
+These types are just aliases for primitive types.
+
+::
+
+ // From /src/include/types.h
+
+ typedef u32le epoch_t;
+ typedef u32le ceph_seq_t;
+ typedef u64le ceph_tid_t;
+ typedef u64le version_t;
+
+
+Structures
+==========
+
+These are the way structures are encoded. Note that these structures don't
+actually exist in the source but are the way that different types are encoded.
+
+Optional
+--------
+
+Optionals are represented as a presence byte, followed by the item if it exists.
+
+::
+
+ struct ceph_optional<T> {
+ u8 present;
+ T element[present? 1 : 0]; // Only if present is non-zero.
+ }
+
+Optionals are used to encode ``boost::optional``.
+
+Pair
+----
+
+Pairs are simply the first item followed by the second.
+
+::
+
+ struct ceph_pair<A,B> {
+ A a;
+ B b;
+ }
+
+Pairs are used to encode ``std::pair``.
+
+Triple
+------
+
+Triples are simply the tree elements one after another.
+
+::
+
+ struct ceph_triple<A,B,C> {
+ A a;
+ B b;
+ C c;
+ }
+
+Triples are used to encode ``ceph::triple``.
+
+
+List
+----
+
+Lists are represented as an element count followed by that many elements.
+
+::
+
+ struct ceph_list<T> {
+ u32le length;
+ T elements[length];
+ }
+
+.. note::
+ The size of the elements in the list are not necessarily uniform.
+
+Lists are used to encode ``std::list``, ``std::vector``, ``std::deque``,
+``std::set`` and ``ceph::unordered_set``.
+
+Blob
+----
+
+A Blob is simply a list of bytes.
+
+::
+
+ struct ceph_string {
+ ceph_list<u8>;
+ }
+
+ // AKA
+
+ struct ceph_string {
+ u32le size;
+ u8 data[size];
+ }
+
+Blobs are used to encode ``std::string``, ``const char *`` and ``bufferlist``.
+
+.. note::
+ The content of a Blob is arbitrary binary data.
+
+Map
+---
+
+Maps are a list of pairs.
+
+::
+
+ struct ceph_map<K,V> {
+ ceph_list<ceph_pair<K,V>>;
+ }
+
+ // AKA
+
+ struct ceph_map<K,V> {
+ u32le length;
+ ceph_pair<K,V> entries[length];
+ }
+
+Maps are used to encode ``std::map``, ``std::multimap`` and
+``ceph::unordered_map``.
+
+Complex Types
+=============
+
+These aren't hard to find in the source but the common ones are listed here for
+convenience.
+
+utime_t
+-------
+
+::
+
+ // From /src/include/utime.h
+ struct utime_t {
+ u32le tv_sec; // Seconds since epoch.
+ u32le tv_nsec; // Nanoseconds since the last second.
+ }
+
+ceph_entity_name
+----------------
+
+::
+
+ // From /src/include/msgr.h
+ struct ceph_entity_name {
+ u8 type; // CEPH_ENTITY_TYPE_*
+ u64le num;
+ }
+
+ // CEPH_ENTITY_TYPE_* defined in /src/include/msgr.h
+
+.. vi: textwidth=80 noexpandtab
diff --git a/doc/dev/network-protocol.rst b/doc/dev/network-protocol.rst
new file mode 100644
index 000000000..f6fb1738d
--- /dev/null
+++ b/doc/dev/network-protocol.rst
@@ -0,0 +1,197 @@
+==================
+ Network Protocol
+==================
+
+This file describes the network protocol used by Ceph. In order to understand
+the way the structures are defined it is recommended to read the introduction
+of :doc:`/dev/network-encoding` first.
+
+Hello
+=====
+
+The protocol starts with a handshake that confirms that both nodes are talking
+ceph and shares some basic information.
+
+Banner
+------
+
+The first action is the server sending banner to the client. The banner is
+defined in ``CEPH_BANNER`` from ``src/include/msgr.h``. This is followed by
+the server's then client's address each encoded as a ``entity_addr_t``.
+
+Once the client verifies that the servers banner matches its own it replies with
+its banner and its address.
+
+Connect
+-------
+
+Once the banners have been verified and the addresses exchanged the connection
+negotiation begins. First the client sends a ``ceph_msg_connect`` structure
+with its information.
+
+::
+
+ // From src/include/msgr.h
+ struct ceph_msg_connect {
+ u64le features; // Supported features (CEPH_FEATURE_*)
+ u32le host_type; // CEPH_ENTITY_TYPE_*
+ u32le global_seq; // Number of connections initiated by this host.
+ u32le connect_seq; // Number of connections initiated in this session.
+ u32le protocol_version;
+ u32le authorizer_protocol;
+ u32le authorizer_len;
+ u8 flags; // CEPH_MSG_CONNECT_*
+ u8 authorizer[authorizer_len];
+ }
+
+Connect Reply
+-------------
+
+Once the connect has been sent the connection has effectively been opened,
+however the first message the server sends must be a connect reply message.
+
+::
+
+ struct ceph_msg_connect_reply {
+ u8 tag; // Tag indicating response code.
+ u64le features;
+ u32le global_seq;
+ u32le connect_seq;
+ u32le protocol_version;
+ u32le authorizer_len;
+ u8 flags;
+ u8 authorizer[authorizer_len];
+ }
+
+MSGR Protocol
+=============
+
+This is a low level protocol over which messages are delivered. The messages
+at this level consist of a tag byte, identifying the type of message, followed
+by the message data.
+
+::
+
+ // Virtual structure.
+ struct {
+ u8 tag; // CEPH_MSGR_TAG_*
+ u8 data[]; // Length depends on tag and data.
+ }
+
+The length of ``data`` is determined by the tag byte and depending on the
+message type via information in the ``data`` array itself.
+
+.. note::
+ There is no way to determine the length of the message if you do not
+ understand the type of message.
+
+The message tags are defined in ``src/include/msgr.h`` and the current ones
+are listed below along with the data they include. Note that the defined
+structures don't exist in the source and are merely for representing the
+protocol.
+
+CEPH_MSGR_TAG_CLOSE (0x06)
+--------------------------
+
+::
+
+ struct ceph_msgr_close {
+ u8 tag = 0x06;
+ u8 data[0]; // No data.
+ }
+
+The close message indicates that the connection is being closed.
+
+CEPH_MSGR_TAG_MSG (0x07)
+------------------------
+
+::
+
+ struct ceph_msgr_msg {
+ u8 tag = 0x07;
+ ceph_msg_header header;
+ u8 front [header.front_len ];
+ u8 middle[header.middle_len];
+ u8 data [header.data_len ];
+ ceph_msg_footer footer;
+ }
+
+ // From src/include/msgr.h
+ struct ceph_msg_header {
+ u64le seq; // Sequence number.
+ u64le tid; // Transaction ID.
+ u16le type; // Message type (CEPH_MSG_* or MSG_*).
+ u16le priority; // Priority (higher is more important).
+ u16le version; // Version of message encoding.
+
+ u32le front_len; // The size of the front section.
+ u32le middle_len; // The size of the middle section.
+ u32le data_len; // The size of the data section.
+ u16le data_off; // The way data should be aligned by the receiver.
+
+ ceph_entity_name src; // Information about the sender.
+
+ u16le compat_version; // Oldest compatible encoding version.
+ u16le reserved; // Unused.
+ u32le crc; // CRC of header.
+ }
+
+ // From src/include/msgr.h
+ struct ceph_msg_footer {
+ u32le front_crc; // Checksums of the various sections.
+ u32le middle_crc; //
+ u32le data_crc; //
+ u64le sig; // Crypographic signature.
+ u8 flags;
+ }
+
+Messages are the business logic of Ceph. They are what is used to send data and
+requests between nodes. The message header contains the length of the message
+so unknown messages can be handled gracefully.
+
+There are two names for the message type constants ``CEPH_MSG_*`` and ``MSG_*``.
+The only difference between the two is that the first are considered "public"
+while the second is for internal use only. There is no protocol-level
+difference.
+
+CEPH_MSGR_TAG_ACK (0x08)
+------------------------
+
+::
+
+ struct ceph_msgr_ack {
+ u8 tag = 0x08;
+ u64le seq; // The sequence number of the message being acknowledged.
+ }
+
+CEPH_MSGR_TAG_KEEPALIVE (0x09)
+------------------------------
+
+::
+
+ struct ceph_msgr_keepalive {
+ u8 tag = 0x09;
+ u8 data[0]; // No data.
+ }
+
+CEPH_MSGR_TAG_KEEPALIVE2 (0x0E)
+-------------------------------
+
+::
+
+ struct ceph_msgr_keepalive2 {
+ u8 tag = 0x0E;
+ utime_t timestamp;
+ }
+
+CEPH_MSGR_TAG_KEEPALIVE2_ACK (0x0F)
+-----------------------------------
+
+::
+
+ struct ceph_msgr_keepalive2_ack {
+ u8 tag = 0x0F;
+ utime_t timestamp;
+ }
+
+.. vi: textwidth=80 noexpandtab
diff --git a/doc/dev/object-store.rst b/doc/dev/object-store.rst
new file mode 100644
index 000000000..2d9a7d8fa
--- /dev/null
+++ b/doc/dev/object-store.rst
@@ -0,0 +1,67 @@
+====================================
+ Object Store Architecture Overview
+====================================
+
+.. graphviz::
+
+ digraph object_store {
+ size="7,7";
+ node [color=lightblue2, style=filled, fontname="Serif"];
+
+ "testrados" -> "librados"
+ "testradospp" -> "librados"
+
+ "rbd" -> "librados"
+
+ "radostool" -> "librados"
+
+ "radosgw-admin" -> "radosgw"
+
+ "radosgw" -> "librados"
+
+ "radosacl" -> "librados"
+
+ "librados" -> "objecter"
+
+ "ObjectCacher" -> "Filer"
+
+ "dumpjournal" -> "Journaler"
+
+ "Journaler" -> "Filer"
+
+ "SyntheticClient" -> "Filer"
+ "SyntheticClient" -> "objecter"
+
+ "Filer" -> "objecter"
+
+ "objecter" -> "OSDMap"
+
+ "ceph-osd" -> "PG"
+ "ceph-osd" -> "ObjectStore"
+
+ "crushtool" -> "CrushWrapper"
+
+ "OSDMap" -> "CrushWrapper"
+
+ "OSDMapTool" -> "OSDMap"
+
+ "PG" -> "PrimaryLogPG"
+ "PG" -> "ObjectStore"
+ "PG" -> "OSDMap"
+
+ "PrimaryLogPG" -> "ObjectStore"
+ "PrimaryLogPG" -> "OSDMap"
+
+ "ObjectStore" -> "FileStore"
+ "ObjectStore" -> "BlueStore"
+
+ "BlueStore" -> "rocksdb"
+
+ "FileStore" -> "xfs"
+ "FileStore" -> "btrfs"
+ "FileStore" -> "ext4"
+ }
+
+
+.. todo:: write more here
+
diff --git a/doc/dev/osd-class-path.rst b/doc/dev/osd-class-path.rst
new file mode 100644
index 000000000..d0e54acac
--- /dev/null
+++ b/doc/dev/osd-class-path.rst
@@ -0,0 +1,16 @@
+=======================
+ OSD class path issues
+=======================
+
+::
+
+ 2011-12-05 17:41:00.994075 7ffe8b5c3760 librbd: failed to assign a block name for image
+ create error: error 5: Input/output error
+
+This usually happens because your OSDs can't find ``cls_rbd.so``. They
+search for it in ``osd_class_dir``, which may not be set correctly by
+default (http://tracker.ceph.com/issues/1722).
+
+Most likely it's looking in ``/usr/lib/rados-classes`` instead of
+``/usr/lib64/rados-classes`` - change ``osd_class_dir`` in your
+``ceph.conf`` and restart the OSDs to fix it.
diff --git a/doc/dev/osd_internals/async_recovery.rst b/doc/dev/osd_internals/async_recovery.rst
new file mode 100644
index 000000000..ab0a036f1
--- /dev/null
+++ b/doc/dev/osd_internals/async_recovery.rst
@@ -0,0 +1,53 @@
+=====================
+Asynchronous Recovery
+=====================
+
+Ceph Placement Groups (PGs) maintain a log of write transactions to
+facilitate speedy recovery of data. During recovery, each of these PG logs
+is used to determine which content in each OSD is missing or outdated.
+This obviates the need to scan all RADOS objects.
+See :ref:`Log Based PG <log-based-pg>` for more details on this process.
+
+Prior to the Nautilus release this recovery process was synchronous: it
+blocked writes to a RADOS object until it was recovered. In contrast,
+backfill could allow writes to proceed (assuming enough up-to-date replicas
+were available) by temporarily assigning a different acting set, and
+backfilling an OSD outside of the acting set. In some circumstances
+this ends up being significantly better for availability, e.g. if the
+PG log contains 3000 writes to disjoint objects. When the PG log contains
+thousands of entries, it could actually be faster (though not as safe) to
+trade backfill for recovery by deleting and redeploying the containing
+OSD than to iterate through the PG log. Recovering several megabytes
+of RADOS object data (or even worse, several megabytes of omap keys,
+notably RGW bucket indexes) can drastically increase latency for a small
+update, and combined with requests spread across many degraded objects
+it is a recipe for slow requests.
+
+To avoid this we can perform recovery in the background on an OSD
+out-of-band of the live acting set, similar to backfill, but still using
+the PG log to determine what needs to be done. This is known as *asynchronous
+recovery*.
+
+The threashold for performing asynchronous recovery instead of synchronous
+recovery is not a clear-cut. There are a few criteria which
+need to be met for asynchronous recovery:
+
+* Try to keep ``min_size`` replicas available
+* Use the approximate magnitude of the difference in length of
+ logs combined with historical missing objects to estimate the cost of
+ recovery
+* Use the parameter ``osd_async_recovery_min_cost`` to determine
+ when asynchronous recovery is appropriate
+
+With the existing peering process, when we choose the acting set we
+have not fetched the PG log from each peer; we have only the bounds of
+it and other metadata from their ``pg_info_t``. It would be more expensive
+to fetch and examine every log at this point, so we only consider an
+approximate check for log length for now. In Nautilus, we improved
+the accounting of missing objects, so post-Nautilus this information
+is also used to determine the cost of recovery.
+
+While async recovery is occurring, writes to members of the acting set
+may proceed, but we need to send their log entries to the async
+recovery targets (just like we do for backfill OSDs) so that they
+can completely catch up.
diff --git a/doc/dev/osd_internals/backfill_reservation.rst b/doc/dev/osd_internals/backfill_reservation.rst
new file mode 100644
index 000000000..3c380dcf6
--- /dev/null
+++ b/doc/dev/osd_internals/backfill_reservation.rst
@@ -0,0 +1,93 @@
+====================
+Backfill Reservation
+====================
+
+When a new OSD joins a cluster all PGs with it in their acting sets must
+eventually backfill. If all of these backfills happen simultaneously
+they will present excessive load on the OSD: the "thundering herd"
+effect.
+
+The ``osd_max_backfills`` tunable limits the number of outgoing or
+incoming backfills that are active on a given OSD. Note that this limit is
+applied separately to incoming and to outgoing backfill operations.
+Thus there can be as many as ``osd_max_backfills * 2`` backfill operations
+in flight on each OSD. This subtlety is often missed, and Ceph
+operators can be puzzled as to why more ops are observed than expected.
+
+Each ``OSDService`` now has two AsyncReserver instances: one for backfills going
+from the OSD (``local_reserver``) and one for backfills going to the OSD
+(``remote_reserver``). An ``AsyncReserver`` (``common/AsyncReserver.h``)
+manages a queue by priority of waiting items and a set of current reservation
+holders. When a slot frees up, the ``AsyncReserver`` queues the ``Context*``
+associated with the next item on the highest priority queue in the finisher
+provided to the constructor.
+
+For a primary to initiate a backfill it must first obtain a reservation from
+its own ``local_reserver``. Then it must obtain a reservation from the backfill
+target's ``remote_reserver`` via a ``MBackfillReserve`` message. This process is
+managed by sub-states of ``Active`` and ``ReplicaActive`` (see the sub-states
+of ``Active`` in PG.h). The reservations are dropped either on the ``Backfilled``
+event, which is sent on the primary before calling ``recovery_complete``
+and on the replica on receipt of the ``BackfillComplete`` progress message),
+or upon leaving ``Active`` or ``ReplicaActive``.
+
+It's important to always grab the local reservation before the remote
+reservation in order to prevent a circular dependency.
+
+We minimize the risk of data loss by prioritizing the order in
+which PGs are recovered. Admins can override the default order by using
+``force-recovery`` or ``force-backfill``. A ``force-recovery`` with op
+priority ``255`` will start before a ``force-backfill`` op at priority ``254``.
+
+If recovery is needed because a PG is below ``min_size`` a base priority of
+``220`` is used. This is incremented by the number of OSDs short of the pool's
+``min_size`` as well as a value relative to the pool's ``recovery_priority``.
+The resultant priority is capped at ``253`` so that it does not confound forced
+ops as described above. Under ordinary circumstances a recovery op is
+prioritized at ``180`` plus a value relative to the pool's ``recovery_priority``.
+The resultant priority is capped at ``219``.
+
+If backfill is needed because the number of acting OSDs is less than
+the pool's ``min_size``, a priority of ``220`` is used. The number of OSDs
+short of the pool's ``min_size`` is added as well as a value relative to
+the pool's ``recovery_priority``. The total priority is limited to ``253``.
+
+If backfill is needed because a PG is undersized,
+a priority of ``140`` is used. The number of OSDs below the size of the pool is
+added as well as a value relative to the pool's ``recovery_priority``. The
+resultant priority is capped at ``179``. If a backfill op is
+needed because a PG is degraded, a priority of ``140`` is used. A value
+relative to the pool's ``recovery_priority`` is added. The resultant priority
+is capped at ``179`` . Under ordinary circumstances a
+backfill op priority of ``100`` is used. A value relative to the pool's
+``recovery_priority`` is added. The total priority is capped at ``139``.
+
+.. list-table:: Backfill and Recovery op priorities
+ :widths: 20 20 20
+ :header-rows: 1
+
+ * - Description
+ - Base priority
+ - Maximum priority
+ * - Backfill
+ - 100
+ - 139
+ * - Degraded Backfill
+ - 140
+ - 179
+ * - Recovery
+ - 180
+ - 219
+ * - Inactive Recovery
+ - 220
+ - 253
+ * - Inactive Backfill
+ - 220
+ - 253
+ * - force-backfill
+ - 254
+ -
+ * - force-recovery
+ - 255
+ -
+
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
new file mode 100644
index 000000000..40064961b
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -0,0 +1,87 @@
+==============================
+Erasure Coded Placement Groups
+==============================
+
+Glossary
+--------
+
+*chunk*
+ When the encoding function is called, it returns chunks of the same
+ size as each other. There are two kinds of chunks: (1) *data
+ chunks*, which can be concatenated to reconstruct the original
+ object, and (2) *coding chunks*, which can be used to rebuild a
+ lost chunk.
+
+*chunk rank*
+ The index of a chunk, as determined by the encoding function. The
+ rank of the first chunk is 0, the rank of the second chunk is 1,
+ and so on.
+
+*K*
+ The number of data chunks into which an object is divided. For
+ example, if *K* = 2, then a 10KB object is divided into two objects
+ of 5KB each.
+
+*M*
+ The number of coding chunks computed by the encoding function. *M*
+ is equal to the number of OSDs that can be missing from the cluster
+ without the cluster suffering data loss. For example, if there are
+ two coding chunks, then two OSDs can be missing without data loss.
+
+*N*
+ The number of data chunks plus the number of coding chunks: that
+ is, *K* + *M*.
+
+*rate*
+ The proportion of the total chunks containing useful information:
+ that is, *K* divided by *N*. For example, suppose that *K* = 9 and
+ *M* = 3. This would mean that *N* = 12 (because *K* + *M* = 9 + 3).
+ Therefore, the *rate* (*K* / *N*) would be 9 / 12 = 0.75. In other
+ words, 75% of the chunks would contain useful information.
+
+*shard* (also called *strip*)
+ An ordered sequence of chunks of the same rank from the same object. For a
+ given placement group, each OSD contains shards of the same rank. In the
+ special case in which an object is encoded with only one call to the
+ encoding function, the term *chunk* may be used instead of *shard* because
+ the shard is made of a single chunk. The chunks in a shard are ordered
+ according to the rank of the stripe (see *stripe* below) they belong to.
+
+
+*stripe*
+ If an object is so large that encoding it requires more than one
+ call to the encoding function, each of these calls creates a set of
+ chunks called a *stripe*.
+
+The definitions are illustrated as follows (PG stands for placement group):
+::
+
+ OSD 40 OSD 33
+ +-------------------------+ +-------------------------+
+ | shard 0 - PG 10 | | shard 1 - PG 10 |
+ |+------ object O -------+| |+------ object O -------+|
+ ||+---------------------+|| ||+---------------------+||
+ stripe||| chunk 0 ||| ||| chunk 1 ||| ...
+ 0 ||| stripe 0 ||| ||| stripe 0 |||
+ ||+---------------------+|| ||+---------------------+||
+ ||+---------------------+|| ||+---------------------+||
+ stripe||| chunk 0 ||| ||| chunk 1 ||| ...
+ 1 ||| stripe 1 ||| ||| stripe 1 |||
+ ||+---------------------+|| ||+---------------------+||
+ ||+---------------------+|| ||+---------------------+||
+ stripe||| chunk 0 ||| ||| chunk 1 ||| ...
+ 2 ||| stripe 2 ||| ||| stripe 2 |||
+ ||+---------------------+|| ||+---------------------+||
+ |+-----------------------+| |+-----------------------+|
+ | ... | | ... |
+ +-------------------------+ +-------------------------+
+
+Table of contents
+-----------------
+
+.. toctree::
+ :maxdepth: 1
+
+ Developer notes <erasure_coding/developer_notes>
+ Jerasure plugin <erasure_coding/jerasure>
+ High level design document <erasure_coding/ecbackend>
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
new file mode 100644
index 000000000..586b4b71b
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -0,0 +1,223 @@
+============================
+Erasure Code developer notes
+============================
+
+Introduction
+------------
+
+Each chapter of this document explains an aspect of the implementation
+of the erasure code within Ceph. It is mostly based on examples being
+explained to demonstrate how things work.
+
+Reading and writing encoded chunks from and to OSDs
+---------------------------------------------------
+
+An erasure coded pool stores each object as K+M chunks. It is divided
+into K data chunks and M coding chunks. The pool is configured to have
+a size of K+M so that each chunk is stored in an OSD in the acting
+set. The rank of the chunk is stored as an attribute of the object.
+
+Let's say an erasure coded pool is created to use five OSDs ( K+M =
+5 ) and sustain the loss of two of them ( M = 2 ).
+
+When the object *NYAN* containing *ABCDEFGHI* is written to it, the
+erasure encoding function splits the content in three data chunks,
+simply by dividing the content in three : the first contains *ABC*,
+the second *DEF* and the last *GHI*. The content will be padded if the
+content length is not a multiple of K. The function also creates two
+coding chunks : the fourth with *YXY* and the fifth with *GQC*. Each
+chunk is stored in an OSD in the acting set. The chunks are stored in
+objects that have the same name ( *NYAN* ) but reside on different
+OSDs. The order in which the chunks were created must be preserved and
+is stored as an attribute of the object ( shard_t ), in addition to its
+name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4*
+contains *YXY* and is stored on *OSD3*.
+
+::
+
+ +-------------------+
+ name | NYAN |
+ +-------------------+
+ content | ABCDEFGHI |
+ +--------+----------+
+ |
+ |
+ v
+ +------+------+
+ +---------------+ encode(3,2) +-----------+
+ | +--+--+---+---+ |
+ | | | | |
+ | +-------+ | +-----+ |
+ | | | | |
+ +--v---+ +--v---+ +--v---+ +--v---+ +--v---+
+ name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+ +------+ +------+ +------+ +------+ +------+
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+ +------+ +------+ +------+ +------+ +------+
+ content | ABC | | DEF | | GHI | | YXY | | QGC |
+ +--+---+ +--+---+ +--+---+ +--+---+ +--+---+
+ | | | | |
+ | | | | |
+ | | +--+---+ | |
+ | | | OSD1 | | |
+ | | +------+ | |
+ | | +------+ | |
+ | +------>| OSD2 | | |
+ | +------+ | |
+ | +------+ | |
+ | | OSD3 |<----+ |
+ | +------+ |
+ | +------+ |
+ | | OSD4 |<--------------+
+ | +------+
+ | +------+
+ +----------------->| OSD5 |
+ +------+
+
+
+
+
+When the object *NYAN* is read from the erasure coded pool, the
+decoding function reads three chunks : chunk *1* containing *ABC*,
+chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild
+the original content of the object *ABCDEFGHI*. The decoding function
+is informed that the chunks *2* and *5* are missing ( they are called
+*erasures* ). The chunk *5* could not be read because the *OSD4* is
+*out*.
+
+The decoding function could be called as soon as three chunks are
+read : *OSD2* was the slowest and its chunk does not need to be taken into
+account. This optimization is not implemented in Firefly.
+
+::
+
+ +-------------------+
+ name | NYAN |
+ +-------------------+
+ content | ABCDEFGHI |
+ +--------+----------+
+ ^
+ |
+ |
+ +------+------+
+ | decode(3,2) |
+ | erasures 2,5|
+ +-------------->| |
+ | +-------------+
+ | ^ ^
+ | | +-----+
+ | | |
+ +--+---+ +------+ +--+---+ +--+---+
+ name | NYAN | | NYAN | | NYAN | | NYAN |
+ +------+ +------+ +------+ +------+
+ shard | 1 | | 2 | | 3 | | 4 |
+ +------+ +------+ +------+ +------+
+ content | ABC | | DEF | | GHI | | YXY |
+ +--+---+ +--+---+ +--+---+ +--+---+
+ ^ . ^ ^
+ | TOO . | |
+ | SLOW . +--+---+ |
+ | ^ | OSD1 | |
+ | | +------+ |
+ | | +------+ |
+ | +-------| OSD2 | |
+ | +------+ |
+ | +------+ |
+ | | OSD3 |-----+
+ | +------+
+ | +------+
+ | | OSD4 | OUT
+ | +------+
+ | +------+
+ +------------------| OSD5 |
+ +------+
+
+
+Erasure code library
+--------------------
+
+Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
+with parameters K+M, object O is encoded by dividing it into chunks O1,
+O2, ... OM and computing coding chunks P1, P2, ... PK. Any K chunks
+out of the available K+M chunks can be used to obtain the original
+object. If data chunk O2 or coding chunk P2 are lost, they can be
+repaired using any K chunks out of the K+M chunks. If more than M
+chunks are lost, it is not possible to recover the object.
+
+Reading the original content of object O can be a simple
+concatenation of O1, O2, ... OM, because the plugins are using
+`systematic codes
+<https://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the chunks
+must be given to the erasure code library *decode* method to retrieve
+the content of the object.
+
+Performance depend on the parameters to the encoding functions and
+is also influenced by the packet sizes used when calling the encoding
+functions ( for Cauchy or Liberation for instance ): smaller packets
+means more calls and more overhead.
+
+Although Reed-Solomon is provided as a default, Ceph uses it via an
+`abstract API <https://github.com/ceph/ceph/blob/v0.78/src/erasure-code/ErasureCodeInterface.h>`_ designed to
+allow each pool to choose the plugin that implements it using
+key=value pairs stored in an `erasure code profile`_.
+
+.. _erasure code profile: ../../../erasure-coded-pool
+
+::
+
+ $ ceph osd erasure-code-profile set myprofile \
+ crush-failure-domain=osd
+ $ ceph osd erasure-code-profile get myprofile
+ directory=/usr/lib/ceph/erasure-code
+ k=2
+ m=1
+ plugin=jerasure
+ technique=reed_sol_van
+ crush-failure-domain=osd
+ $ ceph osd pool create ecpool erasure myprofile
+
+The *plugin* is dynamically loaded from *directory* and expected to
+implement the *int __erasure_code_init(char *plugin_name, char *directory)* function
+which is responsible for registering an object derived from *ErasureCodePlugin*
+in the registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/v0.78/src/test/erasure-code/ErasureCodePluginExample.cc>`_ plugin reads:
+
+::
+
+ ErasureCodePluginRegistry &instance =
+ ErasureCodePluginRegistry::instance();
+ instance.add(plugin_name, new ErasureCodePluginExample());
+
+The *ErasureCodePlugin* derived object must provide a factory method
+from which the concrete implementation of the *ErasureCodeInterface*
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/v0.78/src/test/erasure-code/ErasureCodePluginExample.cc>`_ reads:
+
+::
+
+ virtual int factory(const map<std::string,std::string> &parameters,
+ ErasureCodeInterfaceRef *erasure_code) {
+ *erasure_code = ErasureCodeInterfaceRef(new ErasureCodeExample(parameters));
+ return 0;
+ }
+
+The *parameters* argument is the list of *key=value* pairs that were
+set in the erasure code profile, before the pool was created.
+
+::
+
+ ceph osd erasure-code-profile set myprofile \
+ directory=<dir> \ # mandatory
+ plugin=jerasure \ # mandatory
+ m=10 \ # optional and plugin dependent
+ k=3 \ # optional and plugin dependent
+ technique=reed_sol_van \ # optional and plugin dependent
+
+Notes
+-----
+
+If the objects are large, it may be impractical to encode and decode
+them in memory. However, when using *RBD* a 1TB device is divided in
+many individual 4MB objects and *RGW* does the same.
+
+Encoding and decoding is implemented in the OSD. Although it could be
+implemented client side for read write, the OSD must be able to encode
+and decode on its own when scrubbing.
diff --git a/doc/dev/osd_internals/erasure_coding/ecbackend.rst b/doc/dev/osd_internals/erasure_coding/ecbackend.rst
new file mode 100644
index 000000000..624ec217e
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/ecbackend.rst
@@ -0,0 +1,207 @@
+=================================
+ECBackend Implementation Strategy
+=================================
+
+Misc initial design notes
+=========================
+
+The initial (and still true for ec pools without the hacky ec
+overwrites debug flag enabled) design for ec pools restricted
+EC pools to operations which can be easily rolled back:
+
+- CEPH_OSD_OP_APPEND: We can roll back an append locally by
+ including the previous object size as part of the PG log event.
+- CEPH_OSD_OP_DELETE: The possibility of rolling back a delete
+ requires that we retain the deleted object until all replicas have
+ persisted the deletion event. ErasureCoded backend will therefore
+ need to store objects with the version at which they were created
+ included in the key provided to the filestore. Old versions of an
+ object can be pruned when all replicas have committed up to the log
+ event deleting the object.
+- CEPH_OSD_OP_(SET|RM)ATTR: If we include the prior value of the attr
+ to be set or removed, we can roll back these operations locally.
+
+Log entries contain a structure explaining how to locally undo the
+operation represented by the operation
+(see osd_types.h:TransactionInfo::LocalRollBack).
+
+PGTemp and Crush
+----------------
+
+Primaries are able to request a temp acting set mapping in order to
+allow an up-to-date OSD to serve requests while a new primary is
+backfilled (and for other reasons). An erasure coded pg needs to be
+able to designate a primary for these reasons without putting it in
+the first position of the acting set. It also needs to be able to
+leave holes in the requested acting set.
+
+Core Changes:
+
+- OSDMap::pg_to_*_osds needs to separately return a primary. For most
+ cases, this can continue to be acting[0].
+- MOSDPGTemp (and related OSD structures) needs to be able to specify
+ a primary as well as an acting set.
+- Much of the existing code base assumes that acting[0] is the primary
+ and that all elements of acting are valid. This needs to be cleaned
+ up since the acting set may contain holes.
+
+Distinguished acting set positions
+----------------------------------
+
+With the replicated strategy, all replicas of a PG are
+interchangeable. With erasure coding, different positions in the
+acting set have different pieces of the erasure coding scheme and are
+not interchangeable. Worse, crush might cause chunk 2 to be written
+to an OSD which happens already to contain an (old) copy of chunk 4.
+This means that the OSD and PG messages need to work in terms of a
+type like pair<shard_t, pg_t> in order to distinguish different pg
+chunks on a single OSD.
+
+Because the mapping of object name to object in the filestore must
+be 1-to-1, we must ensure that the objects in chunk 2 and the objects
+in chunk 4 have different names. To that end, the objectstore must
+include the chunk id in the object key.
+
+Core changes:
+
+- The objectstore `ghobject_t needs to also include a chunk id
+ <https://github.com/ceph/ceph/blob/firefly/src/common/hobject.h#L241>`_ making it more like
+ tuple<hobject_t, gen_t, shard_t>.
+- coll_t needs to include a shard_t.
+- The OSD pg_map and similar pg mappings need to work in terms of a
+ spg_t (essentially
+ pair<pg_t, shard_t>). Similarly, pg->pg messages need to include
+ a shard_t
+- For client->PG messages, the OSD will need a way to know which PG
+ chunk should get the message since the OSD may contain both a
+ primary and non-primary chunk for the same pg
+
+Object Classes
+--------------
+
+Reads from object classes will return ENOTSUP on ec pools by invoking
+a special SYNC read.
+
+Scrub
+-----
+
+The main catch, however, for ec pools is that sending a crc32 of the
+stored chunk on a replica isn't particularly helpful since the chunks
+on different replicas presumably store different data. Because we
+don't support overwrites except via DELETE, however, we have the
+option of maintaining a crc32 on each chunk through each append.
+Thus, each replica instead simply computes a crc32 of its own stored
+chunk and compares it with the locally stored checksum. The replica
+then reports to the primary whether the checksums match.
+
+With overwrites, all scrubs are disabled for now until we work out
+what to do (see doc/dev/osd_internals/erasure_coding/proposals.rst).
+
+Crush
+-----
+
+If crush is unable to generate a replacement for a down member of an
+acting set, the acting set should have a hole at that position rather
+than shifting the other elements of the acting set out of position.
+
+=========
+ECBackend
+=========
+
+MAIN OPERATION OVERVIEW
+=======================
+
+A RADOS put operation can span
+multiple stripes of a single object. There must be code that
+tessellates the application level write into a set of per-stripe write
+operations -- some whole-stripes and up to two partial
+stripes. Without loss of generality, for the remainder of this
+document we will focus exclusively on writing a single stripe (whole
+or partial). We will use the symbol "W" to represent the number of
+blocks within a stripe that are being written, i.e., W <= K.
+
+There are three data flows for handling a write into an EC stripe. The
+choice of which of the three data flows to choose is based on the size
+of the write operation and the arithmetic properties of the selected
+parity-generation algorithm.
+
+(1) whole stripe is written/overwritten
+(2) a read-modify-write operation is performed.
+
+WHOLE STRIPE WRITE
+------------------
+
+This is the simple case, and is already performed in the existing code
+(for appends, that is). The primary receives all of the data for the
+stripe in the RADOS request, computes the appropriate parity blocks
+and send the data and parity blocks to their destination shards which
+write them. This is essentially the current EC code.
+
+READ-MODIFY-WRITE
+-----------------
+
+The primary determines which of the K-W blocks are to be unmodified,
+and reads them from the shards. Once all of the data is received it is
+combined with the received new data and new parity blocks are
+computed. The modified blocks are sent to their respective shards and
+written. The RADOS operation is acknowledged.
+
+OSD Object Write and Consistency
+--------------------------------
+
+Regardless of the algorithm chosen above, writing of the data is a two
+phase process: commit and rollforward. The primary sends the log
+entries with the operation described (see
+osd_types.h:TransactionInfo::(LocalRollForward|LocalRollBack).
+In all cases, the "commit" is performed in place, possibly leaving some
+information required for a rollback in a write-aside object. The
+rollforward phase occurs once all acting set replicas have committed
+the commit (sorry, overloaded term) and removes the rollback information.
+
+In the case of overwrites of exsting stripes, the rollback information
+has the form of a sparse object containing the old values of the
+overwritten extents populated using clone_range. This is essentially
+a place-holder implementation, in real life, bluestore will have an
+efficient primitive for this.
+
+The rollforward part can be delayed since we report the operation as
+committed once all replicas have committed. Currently, whenever we
+send a write, we also indicate that all previously committed
+operations should be rolled forward (see
+ECBackend::try_reads_to_commit). If there aren't any in the pipeline
+when we arrive at the waiting_rollforward queue, we start a dummy
+write to move things along (see the Pipeline section later on and
+ECBackend::try_finish_rmw).
+
+ExtentCache
+-----------
+
+It's pretty important to be able to pipeline writes on the same
+object. For this reason, there is a cache of extents written by
+cacheable operations. Each extent remains pinned until the operations
+referring to it are committed. The pipeline prevents rmw operations
+from running until uncacheable transactions (clones, etc) are flushed
+from the pipeline.
+
+See ExtentCache.h for a detailed explanation of how the cache
+states correspond to the higher level invariants about the conditions
+under which cuncurrent operations can refer to the same object.
+
+Pipeline
+--------
+
+Reading src/osd/ExtentCache.h should have given a good idea of how
+operations might overlap. There are several states involved in
+processing a write operation and an important invariant which
+isn't enforced by PrimaryLogPG at a higher level which need to be
+managed by ECBackend. The important invariant is that we can't
+have uncacheable and rmw operations running at the same time
+on the same object. For simplicity, we simply enforce that any
+operation which contains an rmw operation must wait until
+all in-progress uncacheable operations complete.
+
+There are improvements to be made here in the future.
+
+For more details, see ECBackend::waiting_* and
+ECBackend::try_<from>_to_<to>.
+
diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst
new file mode 100644
index 000000000..27669a0b2
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst
@@ -0,0 +1,33 @@
+===============
+jerasure plugin
+===============
+
+Introduction
+------------
+
+The parameters interpreted by the jerasure plugin are:
+
+::
+
+ ceph osd erasure-code-profile set myprofile \
+ directory=<dir> \ # plugin directory absolute path
+ plugin=jerasure \ # plugin name (only jerasure)
+ k=<k> \ # data chunks (default 2)
+ m=<m> \ # coding chunks (default 2)
+ technique=<technique> \ # coding technique
+
+The coding techniques can be chosen among *reed_sol_van*,
+*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
+*blaum_roth* and *liber8tion*.
+
+The *src/erasure-code/jerasure* directory contains the
+implementation. It is a wrapper around the code found at
+`https://github.com/ceph/jerasure <https://github.com/ceph/jerasure>`_
+and `https://github.com/ceph/gf-complete
+<https://github.com/ceph/gf-complete>`_ , pinned to the latest stable
+version in *.gitmodules*. These repositories are copies of the
+upstream repositories `http://jerasure.org/jerasure/jerasure
+<http://jerasure.org/jerasure/jerasure>`_ and
+`http://jerasure.org/jerasure/gf-complete
+<http://jerasure.org/jerasure/gf-complete>`_ . The difference
+between the two, if any, should match pull requests against upstream.
diff --git a/doc/dev/osd_internals/erasure_coding/proposals.rst b/doc/dev/osd_internals/erasure_coding/proposals.rst
new file mode 100644
index 000000000..d048ce8a1
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/proposals.rst
@@ -0,0 +1,385 @@
+:orphan:
+
+=================================
+Proposed Next Steps for ECBackend
+=================================
+
+PARITY-DELTA-WRITE
+------------------
+
+RMW operations current require 4 network hops (2 round trips). In
+principle, for some codes, we can reduce this to 3 by sending the
+update to the replicas holding the data blocks and having them
+compute a delta to forward onto the parity blocks.
+
+The primary reads the current values of the "W" blocks and then uses
+the new values of the "W" blocks to compute parity-deltas for each of
+the parity blocks. The W blocks and the parity delta-blocks are sent
+to their respective shards.
+
+The choice of whether to use a read-modify-write or a
+parity-delta-write is complex policy issue that is TBD in the details
+and is likely to be heavily dependent on the computational costs
+associated with a parity-delta vs. a regular parity-generation
+operation. However, it is believed that the parity-delta scheme is
+likely to be the preferred choice, when available.
+
+The internal interface to the erasure coding library plug-ins needs to
+be extended to support the ability to query if parity-delta
+computation is possible for a selected algorithm as well as an
+interface to the actual parity-delta computation algorithm when
+available.
+
+Stripe Cache
+------------
+
+It may be a good idea to extend the current ExtentCache usage to
+cache some data past when the pinning operation releases it.
+One application pattern that is important to optimize is the small
+block sequential write operation (think of the journal of a journaling
+file system or a database transaction log). Regardless of the chosen
+redundancy algorithm, it is advantageous for the primary to
+retain/buffer recently read/written portions of a stripe in order to
+reduce network traffic. The dynamic contents of this cache may be used
+in the determination of whether a read-modify-write or a
+parity-delta-write is performed. The sizing of this cache is TBD, but
+we should plan on allowing at least a few full stripes per active
+client. Limiting the cache occupancy on a per-client basis will reduce
+the noisy neighbor problem.
+
+Recovery and Rollback Details
+=============================
+
+Implementing a Rollback-able Prepare Operation
+----------------------------------------------
+
+The prepare operation is implemented at each OSD through a simulation
+of a versioning or copy-on-write capability for modifying a portion of
+an object.
+
+When a prepare operation is performed, the new data is written into a
+temporary object. The PG log for the
+operation will contain a reference to the temporary object so that it
+can be located for recovery purposes as well as a record of all of the
+shards which are involved in the operation.
+
+In order to avoid fragmentation (and hence, future read performance),
+creation of the temporary object needs special attention. The name of
+the temporary object affects its location within the KV store. Right
+now its unclear whether it's desirable for the name to locate near the
+base object or whether a separate subset of keyspace should be used
+for temporary objects. Sam believes that colocation with the base
+object is preferred (he suggests using the generation counter of the
+ghobject for temporaries). Whereas Allen believes that using a
+separate subset of keyspace is desirable since these keys are
+ephemeral and we don't want to actually colocate them with the base
+object keys. Perhaps some modeling here can help resolve this
+issue. The data of the temporary object wants to be located as close
+to the data of the base object as possible. This may be best performed
+by adding a new ObjectStore creation primitive that takes the base
+object as an additional parameter that is a hint to the allocator.
+
+Sam: I think that the short lived thing may be a red herring. We'll
+be updating the donor and primary objects atomically, so it seems like
+we'd want them adjacent in the key space, regardless of the donor's
+lifecycle.
+
+The apply operation moves the data from the temporary object into the
+correct position within the base object and deletes the associated
+temporary object. This operation is done using a specialized
+ObjectStore primitive. In the current ObjectStore interface, this can
+be done using the clonerange function followed by a delete, but can be
+done more efficiently with a specialized move primitive.
+Implementation of the specialized primitive on FileStore can be done
+by copying the data. Some file systems have extensions that might also
+be able to implement this operation (like a defrag API that swaps
+chunks between files). It is expected that NewStore will be able to
+support this efficiently and natively (It has been noted that this
+sequence requires that temporary object allocations, which tend to be
+small, be efficiently converted into blocks for main objects and that
+blocks that were formerly inside of main objects must be reusable with
+minimal overhead)
+
+The prepare and apply operations can be separated arbitrarily in
+time. If a read operation accesses an object that has been altered by
+a prepare operation (but without a corresponding apply operation) it
+must return the data after the prepare operation. This is done by
+creating an in-memory database of objects which have had a prepare
+operation without a corresponding apply operation. All read operations
+must consult this in-memory data structure in order to get the correct
+data. It should explicitly recognized that it is likely that there
+will be multiple prepare operations against a single base object and
+the code must handle this case correctly. This code is implemented as
+a layer between ObjectStore and all existing readers. Annoyingly,
+we'll want to trash this state when the interval changes, so the first
+thing that needs to happen after activation is that the primary and
+replicas apply up to last_update so that the empty cache will be
+correct.
+
+During peering, it is now obvious that an unapplied prepare operation
+can easily be rolled back simply by deleting the associated temporary
+object and removing that entry from the in-memory data structure.
+
+Partial Application Peering/Recovery modifications
+--------------------------------------------------
+
+Some writes will be small enough to not require updating all of the
+shards holding data blocks. For write amplification minization
+reasons, it would be best to avoid writing to those shards at all,
+and delay even sending the log entries until the next write which
+actually hits that shard.
+
+The delaying (buffering) of the transmission of the prepare and apply
+operations for witnessing OSDs creates new situations that peering
+must handle. In particular the logic for determining the authoritative
+last_update value (and hence the selection of the OSD which has the
+authoritative log) must be modified to account for the valid but
+missing (i.e., delayed/buffered) pglog entries to which the
+authoritative OSD was only a witness to.
+
+Because a partial write might complete without persisting a log entry
+on every replica, we have to do a bit more work to determine an
+authoritative last_update. The constraint (as with a replicated PG)
+is that last_update >= the most recent log entry for which a commit
+was sent to the client (call this actual_last_update). Secondarily,
+we want last_update to be as small as possible since any log entry
+past actual_last_update (we do not apply a log entry until we have
+sent the commit to the client) must be able to be rolled back. Thus,
+the smaller a last_update we choose, the less recovery will need to
+happen (we can always roll back, but rolling a replica forward may
+require an object rebuild). Thus, we will set last_update to 1 before
+the oldest log entry we can prove cannot have been committed. In
+current master, this is simply the last_update of the shortest log
+from that interval (because that log did not persist any entry past
+that point -- a precondition for sending a commit to the client). For
+this design, we must consider the possibility that any log is missing
+at its head log entries in which it did not participate. Thus, we
+must determine the most recent interval in which we went active
+(essentially, this is what find_best_info currently does). We then
+pull the log from each live osd from that interval back to the minimum
+last_update among them. Then, we extend all logs from the
+authoritative interval until each hits an entry in which it should
+have participated, but did not record. The shortest of these extended
+logs must therefore contain any log entry for which we sent a commit
+to the client -- and the last entry gives us our last_update.
+
+Deep scrub support
+------------------
+
+The simple answer here is probably our best bet. EC pools can't use
+the omap namespace at all right now. The simplest solution would be
+to take a prefix of the omap space and pack N M byte L bit checksums
+into each key/value. The prefixing seems like a sensible precaution
+against eventually wanting to store something else in the omap space.
+It seems like any write will need to read at least the blocks
+containing the modified range. However, with a code able to compute
+parity deltas, we may not need to read a whole stripe. Even without
+that, we don't want to have to write to blocks not participating in
+the write. Thus, each shard should store checksums only for itself.
+It seems like you'd be able to store checksums for all shards on the
+parity blocks, but there may not be distinguished parity blocks which
+are modified on all writes (LRC or shec provide two examples). L
+should probably have a fixed number of options (16, 32, 64?) and be
+configurable per-pool at pool creation. N, M should be likewise be
+configurable at pool creation with sensible defaults.
+
+We need to handle online upgrade. I think the right answer is that
+the first overwrite to an object with an append only checksum
+removes the append only checksum and writes in whatever stripe
+checksums actually got written. The next deep scrub then writes
+out the full checksum omap entries.
+
+RADOS Client Acknowledgement Generation Optimization
+====================================================
+
+Now that the recovery scheme is understood, we can discuss the
+generation of the RADOS operation acknowledgement (ACK) by the
+primary ("sufficient" from above). It is NOT required that the primary
+wait for all shards to complete their respective prepare
+operations. Using our example where the RADOS operations writes only
+"W" chunks of the stripe, the primary will generate and send W+M
+prepare operations (possibly including a send-to-self). The primary
+need only wait for enough shards to be written to ensure recovery of
+the data, Thus after writing W + M chunks you can afford the lost of M
+chunks. Hence the primary can generate the RADOS ACK after W+M-M => W
+of those prepare operations are completed.
+
+Inconsistent object_info_t versions
+===================================
+
+A natural consequence of only writing the blocks which actually
+changed is that we don't want to update the object_info_t of the
+objects which didn't. I actually think it would pose a problem to do
+so: pg ghobject namespaces are generally large, and unless the osd is
+seeing a bunch of overwrites on a small set of objects, I'd expect
+each write to be far enough apart in the backing ghobject_t->data
+mapping to each constitute a random metadata update. Thus, we have to
+accept that not every shard will have the current version in its
+object_info_t. We can't even bound how old the version on a
+particular shard will happen to be. In particular, the primary does
+not necessarily have the current version. One could argue that the
+parity shards would always have the current version, but not every
+code necessarily has designated parity shards which see every write
+(certainly LRC, iirc shec, and even with a more pedestrian code, it
+might be desirable to rotate the shards based on object hash). Even
+if you chose to designate a shard as witnessing all writes, the pg
+might be degraded with that particular shard missing. This is a bit
+tricky, currently reads and writes implicitly return the most recent
+version of the object written. On reads, we'd have to read K shards
+to answer that question. We can get around that by adding a "don't
+tell me the current version" flag. Writes are more problematic: we
+need an object_info from the most recent write in order to form the
+new object_info and log_entry.
+
+A truly terrifying option would be to eliminate version and
+prior_version entirely from the object_info_t. There are a few
+specific purposes it serves:
+
+#. On OSD startup, we prime the missing set by scanning backwards
+ from last_update to last_complete comparing the stored object's
+ object_info_t to the version of most recent log entry.
+#. During backfill, we compare versions between primary and target
+ to avoid some pushes. We use it elsewhere as well
+#. While pushing and pulling objects, we verify the version.
+#. We return it on reads and writes and allow the librados user to
+ assert it atomically on writesto allow the user to deal with write
+ races (used extensively by rbd).
+
+Case (3) isn't actually essential, just convenient. Oh well. (4)
+is more annoying. Writes are easy since we know the version. Reads
+are tricky because we may not need to read from all of the replicas.
+Simplest solution is to add a flag to rados operations to just not
+return the user version on read. We can also just not support the
+user version assert on ec for now (I think? Only user is rgw bucket
+indices iirc, and those will always be on replicated because they use
+omap).
+
+We can avoid (1) by maintaining the missing set explicitly. It's
+already possible for there to be a missing object without a
+corresponding log entry (Consider the case where the most recent write
+is to an object which has not been updated in weeks. If that write
+becomes divergent, the written object needs to be marked missing based
+on the prior_version which is not in the log.) THe PGLog already has
+a way of handling those edge cases (see divergent_priors). We'd
+simply expand that to contain the entire missing set and maintain it
+atomically with the log and the objects. This isn't really an
+unreasonable option, the additional keys would be fewer than the
+existing log keys + divergent_priors and aren't updated in the fast
+write path anyway.
+
+The second case is a bit trickier. It's really an optimization for
+the case where a pg became not in the acting set long enough for the
+logs to no longer overlap but not long enough for the PG to have
+healed and removed the old copy. Unfortunately, this describes the
+case where a node was taken down for maintenance with noout set. It's
+probably not acceptable to re-backfill the whole OSD in such a case,
+so we need to be able to quickly determine whether a particular shard
+is up to date given a valid acting set of other shards.
+
+Let ordinary writes which do not change the object size not touch the
+object_info at all. That means that the object_info version won't
+match the pg log entry version. Include in the pg_log_entry_t the
+current object_info version as well as which shards participated (as
+mentioned above). In addition to the object_info_t attr, record on
+each shard s a vector recording for each other shard s' the most
+recent write which spanned both s and s'. Operationally, we maintain
+an attr on each shard containing that vector. A write touching S
+updates the version stamp entry for each shard in S on each shard in
+S's attribute (and leaves the rest alone). If we have a valid acting
+set during backfill, we must have a witness of every write which
+completed -- so taking the max of each entry over all of the acting
+set shards must give us the current version for each shard. During
+recovery, we set the attribute on the recovery target to that max
+vector (Question: with LRC, we may not need to touch much of the
+acting set to recover a particular shard -- can we just use the max of
+the shards we used to recovery, or do we need to grab the version
+vector from the rest of the acting set as well? I'm not sure, not a
+big deal anyway, I think).
+
+The above lets us perform blind writes without knowing the current
+object version (log entry version, that is) while still allowing us to
+avoid backfilling up to date objects. The only catch is that our
+backfill scans will can all replicas, not just the primary and the
+backfill targets.
+
+It would be worth adding into scrub the ability to check the
+consistency of the gathered version vectors -- probably by just
+taking 3 random valid subsets and verifying that they generate
+the same authoritative version vector.
+
+Implementation Strategy
+=======================
+
+It goes without saying that it would be unwise to attempt to do all of
+this in one massive PR. It's also not a good idea to merge code which
+isn't being tested. To that end, it's worth thinking a bit about
+which bits can be tested on their own (perhaps with a bit of temporary
+scaffolding).
+
+We can implement the overwrite friendly checksumming scheme easily
+enough with the current implementation. We'll want to enable it on a
+per-pool basis (probably using a flag which we'll later repurpose for
+actual overwrite support). We can enable it in some of the ec
+thrashing tests in the suite. We can also add a simple test
+validating the behavior of turning it on for an existing ec pool
+(later, we'll want to be able to convert append-only ec pools to
+overwrite ec pools, so that test will simply be expanded as we go).
+The flag should be gated by the experimental feature flag since we
+won't want to support this as a valid configuration -- testing only.
+We need to upgrade append only ones in place during deep scrub.
+
+Similarly, we can implement the unstable extent cache with the current
+implementation, it even lets us cut out the readable ack the replicas
+send to the primary after the commit which lets it release the lock.
+Same deal, implement, gate with experimental flag, add to some of the
+automated tests. I don't really see a reason not to use the same flag
+as above.
+
+We can certainly implement the move-range primitive with unit tests
+before there are any users. Adding coverage to the existing
+objectstore tests would suffice here.
+
+Explicit missing set can be implemented now, same deal as above --
+might as well even use the same feature bit.
+
+The TPC protocol outlined above can actually be implemented an append
+only EC pool. Same deal as above, can even use the same feature bit.
+
+The RADOS flag to suppress the read op user version return can be
+implemented immediately. Mostly just needs unit tests.
+
+The version vector problem is an interesting one. For append only EC
+pools, it would be pointless since all writes increase the size and
+therefore update the object_info. We could do it for replicated pools
+though. It's a bit silly since all "shards" see all writes, but it
+would still let us implement and partially test the augmented backfill
+code as well as the extra pg log entry fields -- this depends on the
+explicit pg log entry branch having already merged. It's not entirely
+clear to me that this one is worth doing separately. It's enough code
+that I'd really prefer to get it done independently, but it's also a
+fair amount of scaffolding that will be later discarded.
+
+PGLog entries need to be able to record the participants and log
+comparison needs to be modified to extend logs with entries they
+wouldn't have witnessed. This logic should be abstracted behind
+PGLog so it can be unittested -- that would let us test it somewhat
+before the actual ec overwrites code merges.
+
+Whatever needs to happen to the ec plugin interface can probably be
+done independently of the rest of this (pending resolution of
+questions below).
+
+The actual nuts and bolts of performing the ec overwrite it seems to
+me can't be productively tested (and therefore implemented) until the
+above are complete, so best to get all of the supporting code in
+first.
+
+Open Questions
+==============
+
+Is there a code we should be using that would let us compute a parity
+delta without rereading and reencoding the full stripe? If so, is it
+the kind of thing we need to design for now, or can it be reasonably
+put off?
+
+What needs to happen to the EC plugin interface?
diff --git a/doc/dev/osd_internals/index.rst b/doc/dev/osd_internals/index.rst
new file mode 100644
index 000000000..7e82914aa
--- /dev/null
+++ b/doc/dev/osd_internals/index.rst
@@ -0,0 +1,10 @@
+==============================
+OSD developer documentation
+==============================
+
+.. rubric:: Contents
+
+.. toctree::
+ :glob:
+
+ *
diff --git a/doc/dev/osd_internals/last_epoch_started.rst b/doc/dev/osd_internals/last_epoch_started.rst
new file mode 100644
index 000000000..c31cc66b5
--- /dev/null
+++ b/doc/dev/osd_internals/last_epoch_started.rst
@@ -0,0 +1,60 @@
+======================
+last_epoch_started
+======================
+
+``info.last_epoch_started`` records an activation epoch ``e`` for interval ``i``
+such that all writes committed in ``i`` or earlier are reflected in the
+local info/log and no writes after ``i`` are reflected in the local
+info/log. Since no committed write is ever divergent, even if we
+get an authoritative log/info with an older ``info.last_epoch_started``,
+we can leave our ``info.last_epoch_started`` alone since no writes could
+have committed in any intervening interval (See PG::proc_master_log).
+
+``info.history.last_epoch_started`` records a lower bound on the most
+recent interval in which the PG as a whole went active and accepted
+writes. On a particular OSD it is also an upper bound on the
+activation epoch of intervals in which writes in the local PG log
+occurred: we update it before accepting writes. Because all
+committed writes are committed by all acting set OSDs, any
+non-divergent writes ensure that ``history.last_epoch_started`` was
+recorded by all acting set members in the interval. Once peering has
+queried one OSD from each interval back to some seen
+``history.last_epoch_started``, it follows that no interval after the max
+``history.last_epoch_started`` can have reported writes as committed
+(since we record it before recording client writes in an interval).
+Thus, the minimum ``last_update`` across all infos with
+``info.last_epoch_started >= MAX(history.last_epoch_started)`` must be an
+upper bound on writes reported as committed to the client.
+
+We update ``info.last_epoch_started`` with the initial activation message,
+but we only update ``history.last_epoch_started`` after the new
+``info.last_epoch_started`` is persisted (possibly along with the first
+write). This ensures that we do not require an OSD with the most
+recent ``info.last_epoch_started`` until all acting set OSDs have recorded
+it.
+
+In ``find_best_info``, we do include ``info.last_epoch_started`` values when
+calculating ``max_last_epoch_started_found`` because we want to avoid
+designating a log entry divergent which in a prior interval would have
+been non-divergent since it might have been used to serve a read. In
+``activate()``, we use the peer's ``last_epoch_started`` value as a bound on
+how far back divergent log entries can be found.
+
+However, in a case like
+
+.. code::
+
+ calc_acting osd.0 1.4e( v 473'302 (292'200,473'302] local-les=473 n=4 ec=5 les/c 473/473 556/556/556
+ calc_acting osd.1 1.4e( v 473'302 (293'202,473'302] lb 0//0//-1 local-les=477 n=0 ec=5 les/c 473/473 556/556/556
+ calc_acting osd.4 1.4e( v 473'302 (120'121,473'302] local-les=473 n=4 ec=5 les/c 473/473 556/556/556
+ calc_acting osd.5 1.4e( empty local-les=0 n=0 ec=5 les/c 473/473 556/556/556
+
+since osd.1 is the only one which recorded info.les=477, while osd.4,osd.0
+(which were the acting set in that interval) did not (osd.4 restarted and osd.0
+did not get the message in time), the PG is marked incomplete when
+either osd.4 or osd.0 would have been valid choices. To avoid this, we do not
+consider ``info.les`` for incomplete peers when calculating
+``min_last_epoch_started_found``. It would not have been in the acting
+set, so we must have another OSD from that interval anyway (if
+``maybe_went_rw``). If that OSD does not remember that ``info.les``, then we
+cannot have served reads.
diff --git a/doc/dev/osd_internals/log_based_pg.rst b/doc/dev/osd_internals/log_based_pg.rst
new file mode 100644
index 000000000..5d1e560c0
--- /dev/null
+++ b/doc/dev/osd_internals/log_based_pg.rst
@@ -0,0 +1,208 @@
+.. _log-based-pg:
+
+============
+Log Based PG
+============
+
+Background
+==========
+
+Why PrimaryLogPG?
+-----------------
+
+Currently, consistency for all ceph pool types is ensured by primary
+log-based replication. This goes for both erasure-coded (EC) and
+replicated pools.
+
+Primary log-based replication
+-----------------------------
+
+Reads must return data written by any write which completed (where the
+client could possibly have received a commit message). There are lots
+of ways to handle this, but Ceph's architecture makes it easy for
+everyone at any map epoch to know who the primary is. Thus, the easy
+answer is to route all writes for a particular PG through a single
+ordering primary and then out to the replicas. Though we only
+actually need to serialize writes on a single RADOS object (and even then,
+the partial ordering only really needs to provide an ordering between
+writes on overlapping regions), we might as well serialize writes on
+the whole PG since it lets us represent the current state of the PG
+using two numbers: the epoch of the map on the primary in which the
+most recent write started (this is a bit stranger than it might seem
+since map distribution itself is asynchronous -- see Peering and the
+concept of interval changes) and an increasing per-PG version number
+-- this is referred to in the code with type ``eversion_t`` and stored as
+``pg_info_t::last_update``. Furthermore, we maintain a log of "recent"
+operations extending back at least far enough to include any
+*unstable* writes (writes which have been started but not committed)
+and objects which aren't uptodate locally (see recovery and
+backfill). In practice, the log will extend much further
+(``osd_min_pg_log_entries`` when clean and ``osd_max_pg_log_entries`` when not
+clean) because it's handy for quickly performing recovery.
+
+Using this log, as long as we talk to a non-empty subset of the OSDs
+which must have accepted any completed writes from the most recent
+interval in which we accepted writes, we can determine a conservative
+log which must contain any write which has been reported to a client
+as committed. There is some freedom here, we can choose any log entry
+between the oldest head remembered by an element of that set (any
+newer cannot have completed without that log containing it) and the
+newest head remembered (clearly, all writes in the log were started,
+so it's fine for us to remember them) as the new head. This is the
+main point of divergence between replicated pools and EC pools in
+``PG/PrimaryLogPG``: replicated pools try to choose the newest valid
+option to avoid the client needing to replay those operations and
+instead recover the other copies. EC pools instead try to choose
+the *oldest* option available to them.
+
+The reason for this gets to the heart of the rest of the differences
+in implementation: one copy will not generally be enough to
+reconstruct an EC object. Indeed, there are encodings where some log
+combinations would leave unrecoverable objects (as with a ``k=4,m=2`` encoding
+where 3 of the replicas remember a write, but the other 3 do not -- we
+don't have 3 copies of either version). For this reason, log entries
+representing *unstable* writes (writes not yet committed to the
+client) must be rollbackable using only local information on EC pools.
+Log entries in general may therefore be rollbackable (and in that case,
+via a delayed application or via a set of instructions for rolling
+back an inplace update) or not. Replicated pool log entries are
+never able to be rolled back.
+
+For more details, see ``PGLog.h/cc``, ``osd_types.h:pg_log_t``,
+``osd_types.h:pg_log_entry_t``, and peering in general.
+
+ReplicatedBackend/ECBackend unification strategy
+================================================
+
+PGBackend
+---------
+
+The fundamental difference between replication and erasure coding
+is that replication can do destructive updates while erasure coding
+cannot. It would be really annoying if we needed to have two entire
+implementations of ``PrimaryLogPG`` since there
+are really only a few fundamental differences:
+
+#. How reads work -- async only, requires remote reads for EC
+#. How writes work -- either restricted to append, or must write aside and do a
+ tpc
+#. Whether we choose the oldest or newest possible head entry during peering
+#. A bit of extra information in the log entry to enable rollback
+
+and so many similarities
+
+#. All of the stats and metadata for objects
+#. The high level locking rules for mixing client IO with recovery and scrub
+#. The high level locking rules for mixing reads and writes without exposing
+ uncommitted state (which might be rolled back or forgotten later)
+#. The process, metadata, and protocol needed to determine the set of osds
+ which participated in the most recent interval in which we accepted writes
+#. etc.
+
+Instead, we choose a few abstractions (and a few kludges) to paper over the differences:
+
+#. ``PGBackend``
+#. ``PGTransaction``
+#. ``PG::choose_acting`` chooses between ``calc_replicated_acting`` and ``calc_ec_acting``
+#. Various bits of the write pipeline disallow some operations based on pool
+ type -- like omap operations, class operation reads, and writes which are
+ not aligned appends (officially, so far) for EC
+#. Misc other kludges here and there
+
+``PGBackend`` and ``PGTransaction`` enable abstraction of differences 1 and 2 above
+and the addition of 4 as needed to the log entries.
+
+The replicated implementation is in ``ReplicatedBackend.h/cc`` and doesn't
+require much additional explanation. More detail on the ``ECBackend`` can be
+found in ``doc/dev/osd_internals/erasure_coding/ecbackend.rst``.
+
+PGBackend Interface Explanation
+===============================
+
+Note: this is from a design document that predated the Firefly release
+and is probably out of date w.r.t. some of the method names.
+
+Readable vs Degraded
+--------------------
+
+For a replicated pool, an object is readable IFF it is present on
+the primary (at the right version). For an EC pool, we need at least
+`m` shards present to perform a read, and we need it on the primary. For
+this reason, ``PGBackend`` needs to include some interfaces for determining
+when recovery is required to serve a read vs a write. This also
+changes the rules for when peering has enough logs to prove that it
+
+Core Changes:
+
+- | ``PGBackend`` needs to be able to return ``IsPG(Recoverable|Readable)Predicate``
+ | objects to allow the user to make these determinations.
+
+Client Reads
+------------
+
+Reads from a replicated pool can always be satisfied
+synchronously by the primary OSD. Within an erasure coded pool,
+the primary will need to request data from some number of replicas in
+order to satisfy a read. ``PGBackend`` will therefore need to provide
+separate ``objects_read_sync`` and ``objects_read_async`` interfaces where
+the former won't be implemented by the ``ECBackend``.
+
+``PGBackend`` interfaces:
+
+- ``objects_read_sync``
+- ``objects_read_async``
+
+Scrubs
+------
+
+We currently have two scrub modes with different default frequencies:
+
+#. [shallow] scrub: compares the set of objects and metadata, but not
+ the contents
+#. deep scrub: compares the set of objects, metadata, and a CRC32 of
+ the object contents (including omap)
+
+The primary requests a scrubmap from each replica for a particular
+range of objects. The replica fills out this scrubmap for the range
+of objects including, if the scrub is deep, a CRC32 of the contents of
+each object. The primary gathers these scrubmaps from each replica
+and performs a comparison identifying inconsistent objects.
+
+Most of this can work essentially unchanged with erasure coded PG with
+the caveat that the ``PGBackend`` implementation must be in charge of
+actually doing the scan.
+
+
+``PGBackend`` interfaces:
+
+- ``be_*``
+
+Recovery
+--------
+
+The logic for recovering an object depends on the backend. With
+the current replicated strategy, we first pull the object replica
+to the primary and then concurrently push it out to the replicas.
+With the erasure coded strategy, we probably want to read the
+minimum number of replica chunks required to reconstruct the object
+and push out the replacement chunks concurrently.
+
+Another difference is that objects in erasure coded PG may be
+unrecoverable without being unfound. The ``unfound`` state
+should probably be renamed to ``unrecoverable``. Also, the
+``PGBackend`` implementation will have to be able to direct the search
+for PG replicas with unrecoverable object chunks and to be able
+to determine whether a particular object is recoverable.
+
+
+Core changes:
+
+- ``s/unfound/unrecoverable``
+
+PGBackend interfaces:
+
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/firefly/src/osd/PGBackend.h#L60>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/firefly/src/osd/PGBackend.h#L66>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/firefly/src/osd/PGBackend.h#L78>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/firefly/src/osd/PGBackend.h#L83>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/firefly/src/osd/PGBackend.h#L90>`_
diff --git a/doc/dev/osd_internals/manifest.rst b/doc/dev/osd_internals/manifest.rst
new file mode 100644
index 000000000..2e8f9ae44
--- /dev/null
+++ b/doc/dev/osd_internals/manifest.rst
@@ -0,0 +1,623 @@
+========
+Manifest
+========
+
+
+Introduction
+============
+
+As described in ``../deduplication.rst``, adding transparent redirect
+machinery to RADOS would enable a more capable tiering solution
+than RADOS currently has with "cache/tiering".
+
+See ``../deduplication.rst``
+
+At a high level, each object has a piece of metadata embedded in
+the ``object_info_t`` which can map subsets of the object data payload
+to (refcounted) objects in other pools.
+
+This document exists to detail:
+
+1. Manifest data structures
+2. Rados operations for manipulating manifests.
+3. Status and Plans
+
+
+Intended Usage Model
+====================
+
+RBD
+---
+
+For RBD, the primary goal is for either an OSD-internal agent or a
+cluster-external agent to be able to transparently shift portions
+of the consituent 4MB extents between a dedup pool and a hot base
+pool.
+
+As such, RBD operations (including class operations and snapshots)
+must have the same observable results regardless of the current
+status of the object.
+
+Moreover, tiering/dedup operations must interleave with RBD operations
+without changing the result.
+
+Thus, here is a sketch of how I'd expect a tiering agent to perform
+basic operations:
+
+* Demote cold RBD chunk to slow pool:
+
+ 1. Read object, noting current user_version.
+ 2. In memory, run CDC implementation to fingerprint object.
+ 3. Write out each resulting extent to an object in the cold pool
+ using the CAS class.
+ 4. Submit operation to base pool:
+
+ * ``ASSERT_VER`` with the user version from the read to fail if the
+ object has been mutated since the read.
+ * ``SET_CHUNK`` for each of the extents to the corresponding object
+ in the base pool.
+ * ``EVICT_CHUNK`` for each extent to free up space in the base pool.
+ Results in each chunk being marked ``MISSING``.
+
+ RBD users should then either see the state prior to the demotion or
+ subsequent to it.
+
+ Note that between 3 and 4, we potentially leak references, so a
+ periodic scrub would be needed to validate refcounts.
+
+* Promote cold RBD chunk to fast pool.
+
+ 1. Submit ``TIER_PROMOTE``
+
+For clones, all of the above would be identical except that the
+initial read would need a ``LIST_SNAPS`` to determine which clones exist
+and the ``PROMOTE`` or ``SET_CHUNK``/``EVICT`` operations would need to include
+the ``cloneid``.
+
+RadosGW
+-------
+
+For reads, RADOS Gateway (RGW) could operate as RBD does above relying on the
+manifest machinery in the OSD to hide the distinction between the object
+being dedup'd or present in the base pool
+
+For writes, RGW could operate as RBD does above, but could
+optionally have the freedom to fingerprint prior to doing the write.
+In that case, it could immediately write out the target objects to the
+CAS pool and then atomically write an object with the corresponding
+chunks set.
+
+Status and Future Work
+======================
+
+At the moment, initial versions of a manifest data structure along
+with IO path support and rados control operations exist. This section
+is meant to outline next steps.
+
+At a high level, our future work plan is:
+
+- Cleanups: Address immediate inconsistencies and shortcomings outlined
+ in the next section.
+- Testing: Rados relies heavily on teuthology failure testing to validate
+ features like cache/tiering. We'll need corresponding tests for
+ manifest operations.
+- Snapshots: We want to be able to deduplicate portions of clones
+ below the level of the rados snapshot system. As such, the
+ rados operations below need to be extended to work correctly on
+ clones (e.g.: we should be able to call ``SET_CHUNK`` on a clone, clear the
+ corresponding extent in the base pool, and correctly maintain OSD metadata).
+- Cache/tiering: Ultimately, we'd like to be able to deprecate the existing
+ cache/tiering implementation, but to do that we need to ensure that we
+ can address the same use cases.
+
+
+Cleanups
+--------
+
+The existing implementation has some things that need to be cleaned up:
+
+* ``SET_REDIRECT``: Should create the object if it doesn't exist, otherwise
+ one couldn't create an object atomically as a redirect.
+* ``SET_CHUNK``:
+
+ * Appears to trigger a new clone as user_modify gets set in
+ ``do_osd_ops``. This probably isn't desirable, see Snapshots section
+ below for some options on how generally to mix these operations
+ with snapshots. At a minimum, ``SET_CHUNK`` probably shouldn't set
+ user_modify.
+ * Appears to assume that the corresponding section of the object
+ does not exist (sets ``FLAG_MISSING``) but does not check whether the
+ corresponding extent exists already in the object. Should always
+ leave the extent clean.
+ * Appears to clear the manifest unconditionally if not chunked,
+ that's probably wrong. We should return an error if it's a
+ ``REDIRECT`` ::
+
+ case CEPH_OSD_OP_SET_CHUNK:
+ if (oi.manifest.is_redirect()) {
+ result = -EINVAL;
+ goto fail;
+ }
+
+
+* ``TIER_PROMOTE``:
+
+ * ``SET_REDIRECT`` clears the contents of the object. ``PROMOTE`` appears
+ to copy them back in, but does not unset the redirect or clear the
+ reference. This violates the invariant that a redirect object
+ should be empty in the base pool. In particular, as long as the
+ redirect is set, it appears that all operations will be proxied
+ even after the promote defeating the purpose. We do want ``PROMOTE``
+ to be able to atomically replace a redirect with the actual
+ object, so the solution is to clear the redirect at the end of the
+ promote.
+ * For a chunked manifest, we appear to flush prior to promoting.
+ Promotion will often be used to prepare an object for low latency
+ reads and writes, accordingly, the only effect should be to read
+ any ``MISSING`` extents into the base pool. No flushing should be done.
+
+* High Level:
+
+ * It appears that ``FLAG_DIRTY`` should never be used for an extent pointing
+ at a dedup extent. Writing the mutated extent back to the dedup pool
+ requires writing a new object since the previous one cannot be mutated,
+ just as it would if it hadn't been dedup'd yet. Thus, we should always
+ drop the reference and remove the manifest pointer.
+
+ * There isn't currently a way to "evict" an object region. With the above
+ change to ``SET_CHUNK`` to always retain the existing object region, we
+ need an ``EVICT_CHUNK`` operation to then remove the extent.
+
+
+Testing
+-------
+
+We rely really heavily on randomized failure testing. As such, we need
+to extend that testing to include dedup/manifest support as well. Here's
+a short list of the touchpoints:
+
+* Thrasher tests like ``qa/suites/rados/thrash/workloads/cache-snaps.yaml``
+
+ That test, of course, tests the existing cache/tiering machinery. Add
+ additional files to that directory that instead setup a dedup pool. Add
+ support to ``ceph_test_rados`` (``src/test/osd/TestRados*``).
+
+* RBD tests
+
+ Add a test that runs an RBD workload concurrently with blind
+ promote/evict operations.
+
+* RGW
+
+ Add a test that runs a rgw workload concurrently with blind
+ promote/evict operations.
+
+
+Snapshots
+---------
+
+Fundamentally we need to be able to manipulate the manifest
+status of clones because we want to be able to dynamically promote,
+flush (if the state was dirty when the clone was created), and evict
+extents from clones.
+
+As such, the plan is to allow the ``object_manifest_t`` for each clone
+to be independent. Here's an incomplete list of the high level
+tasks:
+
+* Modify the op processing pipeline to permit ``SET_CHUNK``, ``EVICT_CHUNK``
+ to operation directly on clones.
+* Ensure that recovery checks the object_manifest prior to trying to
+ use the overlaps in clone_range. ``ReplicatedBackend::calc_*_subsets``
+ are the two methods that would likely need to be modified.
+
+See ``snaps.rst`` for a rundown of the ``librados`` snapshot system and OSD
+support details. I'd like to call out one particular data structure
+we may want to exploit.
+
+The dedup-tool needs to be updated to use ``LIST_SNAPS`` to discover
+clones as part of leak detection.
+
+An important question is how we deal with the fact that many clones
+will frequently have references to the same backing chunks at the same
+offset. In particular, ``make_writeable`` will generally create a clone
+that shares the same ``object_manifest_t`` references with the exception
+of any extents modified in that transaction. The metadata that
+commits as part of that transaction must therefore map onto the same
+refcount as before because otherwise we'd have to first increment
+refcounts on backing objects (or risk a reference to a dead object)
+Thus, we introduce a simple convention: consecutive clones which
+share a reference at the same offset share the same refcount. This
+means that a write that invokes ``make_writeable`` may decrease refcounts,
+but not increase them. This has some conquences for removing clones.
+Consider the following sequence ::
+
+ write foo [0, 1024)
+ flush foo ->
+ head: [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=1, refcount(bbb)=1
+ snapshot 10
+ write foo [0, 512) ->
+ head: [512, 1024) bbb
+ 10 : [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=1, refcount(bbb)=1
+ flush foo ->
+ head: [0, 512) ccc, [512, 1024) bbb
+ 10 : [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=1, refcount(bbb)=1, refcount(ccc)=1
+ snapshot 20
+ write foo [0, 512) (same contents as the original write)
+ head: [512, 1024) bbb
+ 20 : [0, 512) ccc, [512, 1024) bbb
+ 10 : [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=?, refcount(bbb)=1
+ flush foo
+ head: [0, 512) aaa, [512, 1024) bbb
+ 20 : [0, 512) ccc, [512, 1024) bbb
+ 10 : [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=?, refcount(bbb)=1, refcount(ccc)=1
+
+What should be the refcount for ``aaa`` be at the end? By our
+above rule, it should be ``2`` since the two ```aaa``` refs are not
+contiguous. However, consider removing clone ``20`` ::
+
+ initial:
+ head: [0, 512) aaa, [512, 1024) bbb
+ 20 : [0, 512) ccc, [512, 1024) bbb
+ 10 : [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=2, refcount(bbb)=1, refcount(ccc)=1
+ trim 20
+ head: [0, 512) aaa, [512, 1024) bbb
+ 10 : [0, 512) aaa, [512, 1024) bbb
+ refcount(aaa)=?, refcount(bbb)=1, refcount(ccc)=0
+
+At this point, our rule dictates that ``refcount(aaa)`` is `1`.
+This means that removing ``20`` needs to check for refs held by
+the clones on either side which will then match.
+
+See ``osd_types.h:object_manifest_t::calc_refs_to_drop_on_removal``
+for the logic implementing this rule.
+
+This seems complicated, but it gets us two valuable properties:
+
+1) The refcount change from make_writeable will not block on
+ incrementing a ref
+2) We don't need to load the ``object_manifest_t`` for every clone
+ to determine how to handle removing one -- just the ones
+ immediately preceding and succeeding it.
+
+All clone operations will need to consider adjacent ``chunk_maps``
+when adding or removing references.
+
+Cache/Tiering
+-------------
+
+There already exists a cache/tiering mechanism based on whiteouts.
+One goal here should ultimately be for this manifest machinery to
+provide a complete replacement.
+
+See ``cache-pool.rst``
+
+The manifest machinery already shares some code paths with the
+existing cache/tiering code, mainly ``stat_flush``.
+
+In no particular order, here's in incomplete list of things that need
+to be wired up to provide feature parity:
+
+* Online object access information: The osd already has pool configs
+ for maintaining bloom filters which provide estimates of access
+ recency for objects. We probably need to modify this to permit
+ hitset maintenance for a normal pool -- there are already
+ ``CEPH_OSD_OP_PG_HITSET*`` interfaces for querying them.
+* Tiering agent: The osd already has a background tiering agent which
+ would need to be modified to instead flush and evict using
+ manifests.
+
+* Use exiting existing features regarding the cache flush policy such as
+ histset, age, ratio.
+ - hitset
+ - age, ratio, bytes
+
+* Add tiering-mode to ``manifest-tiering``
+ - Writeback
+ - Read-only
+
+
+Data Structures
+===============
+
+Each RADOS object contains an ``object_manifest_t`` embedded within the
+``object_info_t`` (see ``osd_types.h``):
+
+::
+
+ struct object_manifest_t {
+ enum {
+ TYPE_NONE = 0,
+ TYPE_REDIRECT = 1,
+ TYPE_CHUNKED = 2,
+ };
+ uint8_t type; // redirect, chunked, ...
+ hobject_t redirect_target;
+ std::map<uint64_t, chunk_info_t> chunk_map;
+ }
+
+The ``type`` enum reflects three possible states an object can be in:
+
+1. ``TYPE_NONE``: normal RADOS object
+2. ``TYPE_REDIRECT``: object payload is backed by a single object
+ specified by ``redirect_target``
+3. ``TYPE_CHUNKED: object payload is distributed among objects with
+ size and offset specified by the ``chunk_map``. ``chunk_map`` maps
+ the offset of the chunk to a ``chunk_info_t`` as shown below, also
+ specifying the ``length``, target `OID`, and ``flags``.
+
+::
+
+ struct chunk_info_t {
+ typedef enum {
+ FLAG_DIRTY = 1,
+ FLAG_MISSING = 2,
+ FLAG_HAS_REFERENCE = 4,
+ FLAG_HAS_FINGERPRINT = 8,
+ } cflag_t;
+ uint32_t offset;
+ uint32_t length;
+ hobject_t oid;
+ cflag_t flags; // FLAG_*
+
+
+``FLAG_DIRTY`` at this time can happen if an extent with a fingerprint
+is written. This should be changed to drop the fingerprint instead.
+
+
+Request Handling
+================
+
+Similarly to cache/tiering, the initial touchpoint is
+``maybe_handle_manifest_detail``.
+
+For manifest operations listed below, we return ``NOOP`` and continue onto
+dedicated handling within ``do_osd_ops``.
+
+For redirect objects which haven't been promoted (apparently ``oi.size >
+0`` indicates that it's present?) we proxy reads and writes.
+
+For reads on ``TYPE_CHUNKED``, if ``can_proxy_chunked_read`` (basically, all
+of the ops are reads of extents in the ``object_manifest_t chunk_map``),
+we proxy requests to those objects.
+
+
+RADOS Interface
+================
+
+To set up deduplication one must provision two pools. One will act as the
+base pool and the other will act as the chunk pool. The base pool need to be
+configured with the ``fingerprint_algorithm`` option as follows.
+
+::
+
+ ceph osd pool set $BASE_POOL fingerprint_algorithm sha1|sha256|sha512
+ --yes-i-really-mean-it
+
+Create objects ::
+
+ rados -p base_pool put foo ./foo
+ rados -p chunk_pool put foo-chunk ./foo-chunk
+
+Make a manifest object ::
+
+ rados -p base_pool set-chunk foo $START_OFFSET $END_OFFSET --target-pool chunk_pool foo-chunk $START_OFFSET --with-reference
+
+Operations:
+
+* ``set-redirect``
+
+ Set a redirection between a ``base_object`` in the ``base_pool`` and a ``target_object``
+ in the ``target_pool``.
+ A redirected object will forward all operations from the client to the
+ ``target_object``. ::
+
+ void set_redirect(const std::string& tgt_obj, const IoCtx& tgt_ioctx,
+ uint64_t tgt_version, int flag = 0);
+
+ rados -p base_pool set-redirect <base_object> --target-pool <target_pool>
+ <target_object>
+
+ Returns ``ENOENT`` if the object does not exist (TODO: why?)
+ Returns ``EINVAL`` if the object already is a redirect.
+
+ Takes a reference to target as part of operation, can possibly leak a ref
+ if the acting set resets and the client dies between taking the ref and
+ recording the redirect.
+
+ Truncates object, clears omap, and clears xattrs as a side effect.
+
+ At the top of ``do_osd_ops``, does not set user_modify.
+
+ This operation is not a user mutation and does not trigger a clone to be created.
+
+ There are two purposes of ``set_redirect``:
+
+ 1. Redirect all operation to the target object (like proxy)
+ 2. Cache when ``tier_promote`` is called (redirect will be cleared at this time).
+
+* ``set-chunk``
+
+ Set the ``chunk-offset`` in a ``source_object`` to make a link between it and a
+ ``target_object``. ::
+
+ void set_chunk(uint64_t src_offset, uint64_t src_length, const IoCtx& tgt_ioctx,
+ std::string tgt_oid, uint64_t tgt_offset, int flag = 0);
+
+ rados -p base_pool set-chunk <source_object> <offset> <length> --target-pool
+ <caspool> <target_object> <target-offset>
+
+ Returns ``ENOENT`` if the object does not exist (TODO: why?)
+ Returns ``EINVAL`` if the object already is a redirect.
+ Returns ``EINVAL`` if on ill-formed parameter buffer.
+ Returns ``ENOTSUPP`` if existing mapped chunks overlap with new chunk mapping.
+
+ Takes references to targets as part of operation, can possibly leak refs
+ if the acting set resets and the client dies between taking the ref and
+ recording the redirect.
+
+ Truncates object, clears omap, and clears xattrs as a side effect.
+
+ This operation is not a user mutation and does not trigger a clone to be created.
+
+ TODO: ``SET_CHUNK`` appears to clear the manifest unconditionally if it's not chunked. ::
+
+ if (!oi.manifest.is_chunked()) {
+ oi.manifest.clear();
+ }
+
+* ``evict-chunk``
+
+ Clears an extent from an object leaving only the manifest link between
+ it and the ``target_object``. ::
+
+ void evict_chunk(
+ uint64_t offset, uint64_t length, int flag = 0);
+
+ rados -p base_pool evict-chunk <offset> <length> <object>
+
+ Returns ``EINVAL`` if the extent is not present in the manifest.
+
+ Note: this does not exist yet.
+
+
+* ``tier-promote``
+
+ Promotes the object ensuring that subsequent reads and writes will be local ::
+
+ void tier_promote();
+
+ rados -p base_pool tier-promote <obj-name>
+
+ Returns ``ENOENT`` if the object does not exist
+
+ For a redirect manifest, copies data to head.
+
+ TODO: Promote on a redirect object needs to clear the redirect.
+
+ For a chunked manifest, reads all MISSING extents into the base pool,
+ subsequent reads and writes will be served from the base pool.
+
+ Implementation Note: For a chunked manifest, calls ``start_copy`` on itself. The
+ resulting ``copy_get`` operation will issue reads which will then be redirected by
+ the normal manifest read machinery.
+
+ Does not set the ``user_modify`` flag.
+
+ Future work will involve adding support for specifying a ``clone_id``.
+
+* ``unset-manifest``
+
+ Unset the manifest info in the object that has manifest. ::
+
+ void unset_manifest();
+
+ rados -p base_pool unset-manifest <obj-name>
+
+ Clears manifest chunks or redirect. Lazily releases references, may
+ leak.
+
+ ``do_osd_ops`` seems not to include it in the ``user_modify=false`` ``ignorelist``,
+ and so will trigger a snapshot. Note, this will be true even for a
+ redirect though ``SET_REDIRECT`` does not flip ``user_modify``. This should
+ be fixed -- ``unset-manifest`` should not be a ``user_modify``.
+
+* ``tier-flush``
+
+ Flush the object which has chunks to the chunk pool. ::
+
+ void tier_flush();
+
+ rados -p base_pool tier-flush <obj-name>
+
+ Included in the ``user_modify=false`` ``ignorelist``, does not trigger a clone.
+
+ Does not evict the extents.
+
+
+ceph-dedup-tool
+===============
+
+``ceph-dedup-tool`` has two features: finding an optimal chunk offset for dedup chunking
+and fixing the reference count (see ``./refcount.rst``).
+
+* Find an optimal chunk offset
+
+ a. Fixed chunk
+
+ To find out a fixed chunk length, you need to run the following command many
+ times while changing the ``chunk_size``. ::
+
+ ceph-dedup-tool --op estimate --pool $POOL --chunk-size chunk_size
+ --chunk-algorithm fixed --fingerprint-algorithm sha1|sha256|sha512
+
+ b. Rabin chunk(Rabin-Karp algorithm)
+
+ Rabin-Karp is a string-searching algorithm based
+ on a rolling hash. But a rolling hash is not enough to do deduplication because
+ we don't know the chunk boundary. So, we need content-based slicing using
+ a rolling hash for content-defined chunking.
+ The current implementation uses the simplest approach: look for chunk boundaries
+ by inspecting the rolling hash for pattern (like the
+ lower N bits are all zeroes).
+
+ Users who want to use deduplication need to find an ideal chunk offset.
+ To find out ideal chunk offset, users should discover
+ the optimal configuration for their data workload via ``ceph-dedup-tool``.
+ This information will then be used for object chunking through
+ the ``set-chunk`` API. ::
+
+ ceph-dedup-tool --op estimate --pool $POOL --min-chunk min_size
+ --chunk-algorithm rabin --fingerprint-algorithm rabin
+
+ ``ceph-dedup-tool`` has many options to utilize ``rabin chunk``.
+ These are options for ``rabin chunk``. ::
+
+ --mod-prime <uint64_t>
+ --rabin-prime <uint64_t>
+ --pow <uint64_t>
+ --chunk-mask-bit <uint32_t>
+ --window-size <uint32_t>
+ --min-chunk <uint32_t>
+ --max-chunk <uint64_t>
+
+ Users need to refer following equation to use above options for ``rabin chunk``. ::
+
+ rabin_hash =
+ (rabin_hash * rabin_prime + new_byte - old_byte * pow) % (mod_prime)
+
+ c. Fixed chunk vs content-defined chunk
+
+ Content-defined chunking may or not be optimal solution.
+ For example,
+
+ Data chunk ``A`` : ``abcdefgabcdefgabcdefg``
+
+ Let's think about Data chunk ``A``'s deduplication. The ideal chunk offset is
+ from ``1`` to ``7`` (``abcdefg``). So, if we use fixed chunk, ``7`` is optimal chunk length.
+ But, in the case of content-based slicing, the optimal chunk length
+ could not be found (dedup ratio will not be 100%).
+ Because we need to find optimal parameter such
+ as boundary bit, window size and prime value. This is as easy as fixed chunk.
+ But, content defined chunking is very effective in the following case.
+
+ Data chunk ``B`` : ``abcdefgabcdefgabcdefg``
+
+ Data chunk ``C`` : ``Tabcdefgabcdefgabcdefg``
+
+
+* Fix reference count
+
+ The key idea behind of reference counting for dedup is false-positive, which means
+ ``(manifest object (no ref),, chunk object(has ref))`` happen instead of
+ ``(manifest object (has ref), chunk 1(no ref))``.
+ To fix such inconsistencies, ``ceph-dedup-tool`` supports ``chunk_scrub``. ::
+
+ ceph-dedup-tool --op chunk_scrub --chunk_pool $CHUNK_POOL
+
diff --git a/doc/dev/osd_internals/map_message_handling.rst b/doc/dev/osd_internals/map_message_handling.rst
new file mode 100644
index 000000000..f8104f3fd
--- /dev/null
+++ b/doc/dev/osd_internals/map_message_handling.rst
@@ -0,0 +1,131 @@
+===========================
+Map and PG Message handling
+===========================
+
+Overview
+--------
+The OSD handles routing incoming messages to PGs, creating the PG if necessary
+in some cases.
+
+PG messages generally come in two varieties:
+
+ 1. Peering Messages
+ 2. Ops/SubOps
+
+There are several ways in which a message might be dropped or delayed. It is
+important that the message delaying does not result in a violation of certain
+message ordering requirements on the way to the relevant PG handling logic:
+
+ 1. Ops referring to the same object must not be reordered.
+ 2. Peering messages must not be reordered.
+ 3. Subops must not be reordered.
+
+MOSDMap
+-------
+MOSDMap messages may come from either monitors or other OSDs. Upon receipt, the
+OSD must perform several tasks:
+
+ 1. Persist the new maps to the filestore.
+ Several PG operations rely on having access to maps dating back to the last
+ time the PG was clean.
+ 2. Update and persist the superblock.
+ 3. Update OSD state related to the current map.
+ 4. Expose new maps to PG processes via *OSDService*.
+ 5. Remove PGs due to pool removal.
+ 6. Queue dummy events to trigger PG map catchup.
+
+Each PG asynchronously catches up to the currently published map during
+process_peering_events before processing the event. As a result, different
+PGs may have different views as to the "current" map.
+
+One consequence of this design is that messages containing submessages from
+multiple PGs (MOSDPGInfo, MOSDPGQuery, MOSDPGNotify) must tag each submessage
+with the PG's epoch as well as tagging the message as a whole with the OSD's
+current published epoch.
+
+MOSDPGOp/MOSDPGSubOp
+--------------------
+See OSD::dispatch_op, OSD::handle_op, OSD::handle_sub_op
+
+MOSDPGOps are used by clients to initiate rados operations. MOSDSubOps are used
+between OSDs to coordinate most non peering activities including replicating
+MOSDPGOp operations.
+
+OSD::require_same_or_newer map checks that the current OSDMap is at least
+as new as the map epoch indicated on the message. If not, the message is
+queued in OSD::waiting_for_osdmap via OSD::wait_for_new_map. Note, this
+cannot violate the above conditions since any two messages will be queued
+in order of receipt and if a message is received with epoch e0, a later message
+from the same source must be at epoch at least e0. Note that two PGs from
+the same OSD count for these purposes as different sources for single PG
+messages. That is, messages from different PGs may be reordered.
+
+
+MOSDPGOps follow the following process:
+
+ 1. OSD::handle_op: validates permissions and crush mapping.
+ discard the request if they are not connected and the client cannot get the reply ( See OSD::op_is_discardable )
+ See OSDService::handle_misdirected_op
+ See PG::op_has_sufficient_caps
+ See OSD::require_same_or_newer_map
+ 2. OSD::enqueue_op
+
+MOSDSubOps follow the following process:
+
+ 1. OSD::handle_sub_op checks that sender is an OSD
+ 2. OSD::enqueue_op
+
+OSD::enqueue_op calls PG::queue_op which checks waiting_for_map before calling OpWQ::queue which adds the op to the queue of the PG responsible for handling it.
+
+OSD::dequeue_op is then eventually called, with a lock on the PG. At
+this time, the op is passed to PG::do_request, which checks that:
+
+ 1. the PG map is new enough (PG::must_delay_op)
+ 2. the client requesting the op has enough permissions (PG::op_has_sufficient_caps)
+ 3. the op is not to be discarded (PG::can_discard_{request,op,subop,scan,backfill})
+ 4. the PG is active (PG::flushed boolean)
+ 5. the op is a CEPH_MSG_OSD_OP and the PG is in PG_STATE_ACTIVE state and not in PG_STATE_REPLAY
+
+If these conditions are not met, the op is either discarded or queued for later processing. If all conditions are met, the op is processed according to its type:
+
+ 1. CEPH_MSG_OSD_OP is handled by PG::do_op
+ 2. MSG_OSD_SUBOP is handled by PG::do_sub_op
+ 3. MSG_OSD_SUBOPREPLY is handled by PG::do_sub_op_reply
+ 4. MSG_OSD_PG_SCAN is handled by PG::do_scan
+ 5. MSG_OSD_PG_BACKFILL is handled by PG::do_backfill
+
+CEPH_MSG_OSD_OP processing
+--------------------------
+
+PrimaryLogPG::do_op handles CEPH_MSG_OSD_OP op and will queue it
+
+ 1. in wait_for_all_missing if it is a CEPH_OSD_OP_PGLS for a designated snapid and some object updates are still missing
+ 2. in waiting_for_active if the op may write but the scrubber is working
+ 3. in waiting_for_missing_object if the op requires an object or a snapdir or a specific snap that is still missing
+ 4. in waiting_for_degraded_object if the op may write an object or a snapdir that is degraded, or if another object blocks it ("blocked_by")
+ 5. in waiting_for_backfill_pos if the op requires an object that will be available after the backfill is complete
+ 6. in waiting_for_ack if an ack from another OSD is expected
+ 7. in waiting_for_ondisk if the op is waiting for a write to complete
+
+Peering Messages
+----------------
+See OSD::handle_pg_(notify|info|log|query)
+
+Peering messages are tagged with two epochs:
+
+ 1. epoch_sent: map epoch at which the message was sent
+ 2. query_epoch: map epoch at which the message triggering the message was sent
+
+These are the same in cases where there was no triggering message. We discard
+a peering message if the message's query_epoch if the PG in question has entered
+a new epoch (See PG::old_peering_evt, PG::queue_peering_event). Notifies,
+infos, notifies, and logs are all handled as PG::PeeringMachine events and
+are wrapped by PG::queue_* by PG::CephPeeringEvts, which include the created
+state machine event along with epoch_sent and query_epoch in order to
+generically check PG::old_peering_message upon insertion and removal from the
+queue.
+
+Note, notifies, logs, and infos can trigger the creation of a PG. See
+OSD::get_or_create_pg.
+
+
diff --git a/doc/dev/osd_internals/osd_overview.rst b/doc/dev/osd_internals/osd_overview.rst
new file mode 100644
index 000000000..192ddf8ca
--- /dev/null
+++ b/doc/dev/osd_internals/osd_overview.rst
@@ -0,0 +1,106 @@
+===
+OSD
+===
+
+Concepts
+--------
+
+*Messenger*
+ See src/msg/Messenger.h
+
+ Handles sending and receipt of messages on behalf of the OSD. The OSD uses
+ two messengers:
+
+ 1. cluster_messenger - handles traffic to other OSDs, monitors
+ 2. client_messenger - handles client traffic
+
+ This division allows the OSD to be configured with different interfaces for
+ client and cluster traffic.
+
+*Dispatcher*
+ See src/msg/Dispatcher.h
+
+ OSD implements the Dispatcher interface. Of particular note is ms_dispatch,
+ which serves as the entry point for messages received via either the client
+ or cluster messenger. Because there are two messengers, ms_dispatch may be
+ called from at least two threads. The osd_lock is always held during
+ ms_dispatch.
+
+*WorkQueue*
+ See src/common/WorkQueue.h
+
+ The WorkQueue class abstracts the process of queueing independent tasks
+ for asynchronous execution. Each OSD process contains workqueues for
+ distinct tasks:
+
+ 1. OpWQ: handles ops (from clients) and subops (from other OSDs).
+ Runs in the op_tp threadpool.
+ 2. PeeringWQ: handles peering tasks and pg map advancement
+ Runs in the op_tp threadpool.
+ See Peering
+ 3. CommandWQ: handles commands (pg query, etc)
+ Runs in the command_tp threadpool.
+ 4. RecoveryWQ: handles recovery tasks.
+ Runs in the recovery_tp threadpool.
+ 5. SnapTrimWQ: handles snap trimming
+ Runs in the disk_tp threadpool.
+ See SnapTrimmer
+ 6. ScrubWQ: handles primary scrub path
+ Runs in the disk_tp threadpool.
+ See Scrub
+ 7. ScrubFinalizeWQ: handles primary scrub finalize
+ Runs in the disk_tp threadpool.
+ See Scrub
+ 8. RepScrubWQ: handles replica scrub path
+ Runs in the disk_tp threadpool
+ See Scrub
+ 9. RemoveWQ: Asynchronously removes old pg directories
+ Runs in the disk_tp threadpool
+ See PGRemoval
+
+*ThreadPool*
+ See src/common/WorkQueue.h
+ See also above.
+
+ There are 4 OSD threadpools:
+
+ 1. op_tp: handles ops and subops
+ 2. recovery_tp: handles recovery tasks
+ 3. disk_tp: handles disk intensive tasks
+ 4. command_tp: handles commands
+
+*OSDMap*
+ See src/osd/OSDMap.h
+
+ The crush algorithm takes two inputs: a picture of the cluster
+ with status information about which nodes are up/down and in/out,
+ and the pgid to place. The former is encapsulated by the OSDMap.
+ Maps are numbered by *epoch* (epoch_t). These maps are passed around
+ within the OSD as std::tr1::shared_ptr<const OSDMap>.
+
+ See MapHandling
+
+*PG*
+ See src/osd/PG.* src/osd/PrimaryLogPG.*
+
+ Objects in rados are hashed into *PGs* and *PGs* are placed via crush onto
+ OSDs. The PG structure is responsible for handling requests pertaining to
+ a particular *PG* as well as for maintaining relevant metadata and controlling
+ recovery.
+
+*OSDService*
+ See src/osd/OSD.cc OSDService
+
+ The OSDService acts as a broker between PG threads and OSD state which allows
+ PGs to perform actions using OSD services such as workqueues and messengers.
+ This is still a work in progress. Future cleanups will focus on moving such
+ state entirely from the OSD into the OSDService.
+
+Overview
+--------
+ See src/ceph_osd.cc
+
+ The OSD process represents one leaf device in the crush hierarchy. There
+ might be one OSD process per physical machine, or more than one if, for
+ example, the user configures one OSD instance per disk.
+
diff --git a/doc/dev/osd_internals/osd_throttles.rst b/doc/dev/osd_internals/osd_throttles.rst
new file mode 100644
index 000000000..0703b797e
--- /dev/null
+++ b/doc/dev/osd_internals/osd_throttles.rst
@@ -0,0 +1,93 @@
+=============
+OSD Throttles
+=============
+
+There are three significant throttles in the FileStore OSD back end:
+wbthrottle, op_queue_throttle, and a throttle based on journal usage.
+
+WBThrottle
+----------
+The WBThrottle is defined in src/os/filestore/WBThrottle.[h,cc] and
+included in FileStore as FileStore::wbthrottle. The intention is to
+bound the amount of outstanding IO we need to do to flush the journal.
+At the same time, we don't want to necessarily do it inline in case we
+might be able to combine several IOs on the same object close together
+in time. Thus, in FileStore::_write, we queue the fd for asynchronous
+flushing and block in FileStore::_do_op if we have exceeded any hard
+limits until the background flusher catches up.
+
+The relevant config options are filestore_wbthrottle*. There are
+different defaults for XFS and Btrfs. Each set has hard and soft
+limits on bytes (total dirty bytes), ios (total dirty ios), and
+inodes (total dirty fds). The WBThrottle will begin flushing
+when any of these hits the soft limit and will block in throttle()
+while any has exceeded the hard limit.
+
+Tighter soft limits will cause writeback to happen more quickly,
+but may cause the OSD to miss oportunities for write coalescing.
+Tighter hard limits may cause a reduction in latency variance by
+reducing time spent flushing the journal, but may reduce writeback
+parallelism.
+
+op_queue_throttle
+-----------------
+The op queue throttle is intended to bound the amount of queued but
+uncompleted work in the filestore by delaying threads calling
+queue_transactions more and more based on how many ops and bytes are
+currently queued. The throttle is taken in queue_transactions and
+released when the op is applied to the file system. This period
+includes time spent in the journal queue, time spent writing to the
+journal, time spent in the actual op queue, time spent waiting for the
+wbthrottle to open up (thus, the wbthrottle can push back indirectly
+on the queue_transactions caller), and time spent actually applying
+the op to the file system. A BackoffThrottle is used to gradually
+delay the queueing thread after each throttle becomes more than
+filestore_queue_low_threshhold full (a ratio of
+filestore_queue_max_(bytes|ops)). The throttles will block once the
+max value is reached (filestore_queue_max_(bytes|ops)).
+
+The significant config options are:
+filestore_queue_low_threshhold
+filestore_queue_high_threshhold
+filestore_expected_throughput_ops
+filestore_expected_throughput_bytes
+filestore_queue_high_delay_multiple
+filestore_queue_max_delay_multiple
+
+While each throttle is at less than low_threshold of the max,
+no delay happens. Between low and high, the throttle will
+inject a per-op delay (per op or byte) ramping from 0 at low to
+high_delay_multiple/expected_throughput at high. From high to
+1, the delay will ramp from high_delay_multiple/expected_throughput
+to max_delay_multiple/expected_throughput.
+
+filestore_queue_high_delay_multiple and
+filestore_queue_max_delay_multiple probably do not need to be
+changed.
+
+Setting these properly should help to smooth out op latencies by
+mostly avoiding the hard limit.
+
+See FileStore::throttle_ops and FileSTore::thottle_bytes.
+
+journal usage throttle
+----------------------
+See src/os/filestore/JournalThrottle.h/cc
+
+The intention of the journal usage throttle is to gradually slow
+down queue_transactions callers as the journal fills up in order
+to smooth out hiccup during filestore syncs. JournalThrottle
+wraps a BackoffThrottle and tracks journaled but not flushed
+journal entries so that the throttle can be released when the
+journal is flushed. The configs work very similarly to the
+op_queue_throttle.
+
+The significant config options are:
+journal_throttle_low_threshhold
+journal_throttle_high_threshhold
+filestore_expected_throughput_ops
+filestore_expected_throughput_bytes
+journal_throttle_high_multiple
+journal_throttle_max_multiple
+
+.. literalinclude:: osd_throttles.txt
diff --git a/doc/dev/osd_internals/osd_throttles.txt b/doc/dev/osd_internals/osd_throttles.txt
new file mode 100644
index 000000000..0332377ef
--- /dev/null
+++ b/doc/dev/osd_internals/osd_throttles.txt
@@ -0,0 +1,21 @@
+ Messenger throttle (number and size)
+ |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ FileStore op_queue throttle (number and size, includes a soft throttle based on filestore_expected_throughput_(ops|bytes))
+ |--------------------------------------------------------|
+ WBThrottle
+ |---------------------------------------------------------------------------------------------------------|
+ Journal (size, includes a soft throttle based on filestore_expected_throughput_bytes)
+ |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+ |----------------------------------------------------------------------------------------------------> flushed ----------------> synced
+ |
+Op: Read Header --DispatchQ--> OSD::_dispatch --OpWQ--> PG::do_request --journalq--> Journal --FileStore::OpWQ--> Apply Thread --Finisher--> op_applied -------------------------------------------------------------> Complete
+ | |
+SubOp: --Messenger--> ReadHeader --DispatchQ--> OSD::_dispatch --OpWQ--> PG::do_request --journalq--> Journal --FileStore::OpWQ--> Apply Thread --Finisher--> sub_op_applied -
+ |
+ |-----------------------------> flushed ----------------> synced
+ |------------------------------------------------------------------------------------------|
+ Journal (size)
+ |---------------------------------|
+ WBThrottle
+ |-----------------------------------------------------|
+ FileStore op_queue throttle (number and size)
diff --git a/doc/dev/osd_internals/osdmap_versions.txt b/doc/dev/osd_internals/osdmap_versions.txt
new file mode 100644
index 000000000..12fab5ae3
--- /dev/null
+++ b/doc/dev/osd_internals/osdmap_versions.txt
@@ -0,0 +1,259 @@
+releases:
+
+ <0.48 pre-argonaut, dev
+ 0.48 argonaut
+ 0.56 bobtail
+ 0.61 cuttlefish
+ 0.67 dumpling
+ 0.72 emperor
+ 0.80 firefly
+ 0.87 giant
+ 0.94 hammer
+ 9.1.0 infernalis rc
+ 9.2.0 infernalis
+ 10.2.0 jewel
+ 11.2.0 kraken
+ 12.2.0 luminous
+ 13.2.0 mimic
+ 14.2.0 nautilus (to-be)
+
+osdmap:
+
+type / v / cv / ev / commit / version / date
+
+map / 1 / - / - / 017788a6ecb570038632de31904dd2e1314dc7b7 / 0.11 / 2009
+inc / 1 / - / - /
+ * initial
+map / 2 / - / - / 020350e19a5dc03cd6cedd7494e434295580615f / 0.13 / 2009
+inc / 2 / - / - /
+ * pg_temp
+map / 3 / - / - / 1ebcebf6fff056a0c0bdf82dde69356e271be27e / 0.19 / 2009
+inc / 3 / - / - /
+ * heartbeat_addr
+map / 4 / - / - / 3ced5e7de243edeccfd20a90ec2034206c920795 / 0.19 / 2010
+inc / 4 / - / - /
+ * pools removed from map
+map / 5 / - / 5 / c4892bed6f49df396df3cbf9ed561c7315bd2442 / 0.20 / 2010
+inc / 5 / - / 5 /
+ * pool names moved to first part of encoding
+ * adds CEPH_OSDMAP_INC_VERSION_EXT (for extended part of map)
+ * adds CEPH_OSDMAP_VERSION_EXT (for extended part of map)
+ * adds 'ev' (extended version) during encode() and decode
+map / 5 / - / 5 / bc9cb9311f1b946898b5256eab500856fccf5c83 / 0.22 / 2010
+inc / 5 / - / 6 /
+ * separate up client/osd
+ * increments CEPH_OSDMAP_INC_VERSION_EXT to 6
+ * CEPH_OSDMAP_INC_VERSION stays at 5
+map / 5 / - / 6 / 7f70112052c7fc3ba46f9e475fa575d85e8b16b2 / 0.22 / 2010
+inc / 5 / - / 6 /
+ * add osd_cluster_addr to full map
+ * increments CEPH_OSDMAP_VERSION_EXT to 6
+ * CEPH_OSDMAP_VERSION stays at 5
+map / 5 / - / 7 / 2ced4e24aef64f2bc7d55b73abb888c124512eac / 0.28 / 2011
+inc / 5 / - / 7 /
+ * add cluster_snapshot field
+ * increments CEPH_OSDMAP_VERSION_EXT to 7
+ * increments CEPH_OSDMAP_INC_VERSION_EXT to 7
+ * CEPH_OSDMAP_INC_VERSION stays at 5
+ * CEPH_OSDMAP_VERSION stays at 5
+map / 6 / - / 7 / d1ce410842ca51fad3aa100a52815a39e5fe6af6 / 0.35 / 2011
+inc / 6 / - / 7 /
+ * encode/decode old + new versions
+ * adds encode_client_old() (basically transitioning from uint32 to
+ uint64)
+ * bumps osdmap version to 6, old clients stay at 5
+ * starts using in-function versions (i.e., _u16 v = 6)
+map / 6 / - / 7 / b297d1edecaf31a48cff6c37df2ee266e51cdec1 / 0.38 / 2011
+inc / 6 / - / 7 /
+ * make encoding conditional based on features
+ * essentially checks whether features & CEPH_FEATURE_PGID64 and opts
+ to either use encode_client_old() or encode()
+map / 6 / - / 7 / 0f0c59478894c9ca7fa04fc32e854648192a9fae / 0.38 / 2011
+inc / 6 / - / 7 /
+ * move stuff from osdmap.h to osdmap.cc
+map / 6 / 8 / ca4311e5e39cec8fad85fad3e67eea968707e9eb / 0.47 / 2012
+inc / 6 / 8 /
+ * store uuid per osd
+ * bumps osdmap::incremental extended version to 8; in function
+ * bumps osdmap's extended version to 8; in function
+map / 6 / - / 8 / 5125daa6d78e173a8dbc75723a8fdcd279a44bcd / 0.47 / 2012
+inc / 6 / - / 8 /
+ * drop defines
+ * drops defines for CEPH_OSDMAP_*_VERSION from rados.h
+map / 6 / 9 / e9f051ef3c49a080b24d7811a16aefb64beacbbd / 0.53 / 2012
+inc / 6 / 9 /
+ * add osd_xinfo_t
+ * osdmap::incremental ext version bumped to 9
+ * osdmap's ext version bumped to 9
+ * because we're adding osd_xinfo_t to the map
+map / 6 / - / 10 / 1fee4ccd5277b52292e255daf458330eef5f0255 / 0.64 / 2013
+inc / 6 / - / 10 /
+ * encode front hb addr
+ * osdmap::incremental ext version bumped to 10
+ * osdmap's ext versiont bumped to 10
+ * because we're adding osd_addrs->hb_front_addr to map
+
+// below we have the change to ENCODE_START() for osdmap and others
+// this means client-usable data and extended osd data get to have their
+// own ENCODE_START()'s, hence their versions start at 1 again.
+
+map / 7 / 1 / 1 / 3d7c69fb0986337dc72e466dc39d93e5ab406062 / 0.77 / 2014
+inc / 7 / 1 / 1 / b55c45e85dbd5d2513a4c56b3b74dcafd03f20b1 / 0.77 / 2014
+ * introduces ENCODE_START() approach to osdmap, and the 'features'
+ argument we currently see in ::encode() functions
+ * same, but for osdmap::incremental
+map / 7 / 1 / 1 / b9208b47745fdd53d36b682bebfc01e913347092 / 0.77 / 2014
+inc / 7 / 1 / 2 /
+ * include features argument in incremental.
+map / 7 / 2 / 1 / cee914290c5540eb1fb9d70faac70a581381c29b / 0.78 / 2014
+inc / 7 / 2 / 2 /
+ * add osd_primary_affinity
+map / 7 / 3 / 1 / c4f8f265955d54f33c79cde02c1ab2fe69ab1ab0 / 0.78 / 2014
+inc / 7 / 3 / 2 /
+ * add new/old erasure code profiles
+map / 8 / 3 / 1 / 3dcf5b9636bb9e0cd6484d18f151b457e1a0c328 / 0.91 / 2014
+inc / 8 / 3 / 2 /
+ * encode crc
+map / 8 / 3 / 1 / 04679c5451e353c966f6ed00b33fa97be8072a79 / 9.1.0 / 2015
+inc / 8 / 3 / 2 /
+ * simply ensures encode_features are filled to CEPH_FEATURE_PGID64 when
+ decoding an incremental if struct_v >= 6; else keeps it at zero.
+ * otherwise, if we get an incremental from hammer (which has
+ struct_v = 6) we would be decoding it as if it were a map from before
+ CEPH_FEATURES_PGID64 (which was introduced in 0.35, pre-argonaut)
+map / 8 / 3 / 2 / 5c6b9d9dcd0a225e3a2b154c20a623868c269346 / 12.0.1 / 2017
+inc / 8 / 3 / 3 /
+ * add (near)full_ratio
+ * used to live in pgmap, moving to osdmap for luminous
+ * conditional on SERVER_LUMINOUS feature being present
+ * osdmap::incremental::encode(): conditional on ev >= 3
+ * osdmap::incremental::decode(): conditional on ev >= 3, else -1
+ * osdmap::encode(): conditional on ev >= 2
+ * osdmap::decode(): conditional on ev >= 0, else 0
+map / 8 / 4 / 2 / 27d6f4373bafa24450f6dbb4e4252c2d9c2c1448 / 12.0.2 / 2017
+inc / 8 / 4 / 3 /
+ * add pg_remap and pg_remap_items
+ * first forces a pg to map to a particular value; second replaces
+ specific osds with specific other osds in crush mapping.
+ * inc conditional on SERVER_LUMINOUS feature being present
+ * osdmap::incremental::encode(): conditional on cv >= 4
+ * osdmap::incremental::decode(): conditional on cv >= 4
+ * map conditional on OSDMAP_REMAP feature being present
+ * osdmap::encode(): if not feature, cv = 3; encode on cv >= 4
+ * osdmap::decode(): conditional on cv >= 4
+map / 8 / 4 / 3 / 27d6f4373bafa24450f6dbb4e4252c2d9c2c1448 / 12.0.2 / 2017
+inc / 8 / 4 / 4 /
+ * handle backfillfull_ratio like nearfull and full
+ * inc:
+ * osdmap::incremental::encode(): conditional on ev >= 3
+ * osdmap::incremental::decode(): conditional on ev >= 4, else -1
+ * map:
+ * osdmap::encode(): conditional on ev >= 2
+ * osdmap::decode(): conditional on ev >= 3, else 0
+map / 8 / 4 / 3 / a1c66468232002c9f36033226f5db0a5751e8d18 / 12.0.3 / 2017
+inc / 8 / 4 / 4 /
+ * add require_min_compat_client field
+ * inc:
+ * osdmap::incremental::encode() conditional on ev >= 4
+ * osdmap::incremental::decode() conditional on ev >= 4
+ map:
+ * osdmap::encode() conditional on ev >= 3
+ * osdmap::decode() conditional on ev >= 3
+map / 8 / 4 / 4 / 4a09e9431de3084b1ca98af11b28f822fde4ffbe / 12.0.3 / 2017
+inc / 8 / 4 / 5 /
+ * bumps encoding version for require_min_compat_client
+ * otherwise osdmap::decode() would throw exception when decoding
+ old maps
+ * inc:
+ * osdmap::incremental::encode() no conditional on ev >= 3
+ * osdmap::incremental::decode() conditional on ev >= 5
+ * map:
+ * osdmap::encode() conditional on ev >= 2
+ * osdmap::decode() conditional on ev >= 4
+map / 8 / 4 / 5 / 3d4c4d9d9da07e1456331c43acc998d2008ca8ea / 12.1.0 / 2017
+inc / 8 / 4 / 6 /
+ * add require_osd_release numeric field
+ * new_require_min_compat_client:
+ * osdmap::incremental::encode() conditional on ev >= 5
+ * osdmap::encode() conditional on ev >= 4
+ * require_osd_release:
+ * osdmap::incremental::encode() conditional on ev >= 6
+ * osdmap::incremental::decode() conditional on ev >= 6 (else, -1)
+ * osdmap::encode() conditional on ev >= 5
+ * osdmap::decode() conditional on ev >= 5 (else, -1)
+map / 8 / 4 / 5 / f22997e24bda4e6476e15d5d4ad9737861f9741f / 12.1.0 / 2017
+inc / 8 / 4 / 6 /
+ * switch (require_)min_compat_client to integers instead of strings
+ * osdmap::incremental::encode() conditional on ev >= 6
+ * osdmap::incremental::decode():
+ * if ev == 5, decode string and translate to release number
+ * if ev >= 6, decode integer
+ * osdmap::encode() conditional on ev >= 4
+ * osdmap::decode():
+ * if ev == 4, decode string and translate to release number
+ * if ev >= 5, decode integer
+map / 8 / 4 / 6 / a8fb39b57884d96201fa502b17bc9395ec38c1b3 / 12.1.0 / 2017
+inc / 8 / 5 / 6 /
+ * make incremental's `new_state` 32 bits instead of 8 bits
+ * implies forcing 8 bits on
+ * osdmap::incremental::encode_client_old()
+ * osdmap::incremental::encode_classic()
+ * osdmap::incremental::decode_classic()
+ * osdmap::incremental::encode() conditional on cv >= 5, else force 8b.
+ * osdmap::incremental::decode() conditional on cv >= 5, else force 8b.
+map / 8 / 5 / 6 / 3c1e58215bbb98f71aae30904f9010a57a58da81 / 12.1.0 / 2017
+inc / 8 / 5 / 6 /
+ * same as above
+map / 8 / 6 / 6 / 48158ec579b708772fae82daaa6cb5dcaf5ac5dd / 12.1.0 / 2017
+inc / 8 / 5 / 6 /
+ * add crush_version
+ * osdmap::encode() conditional on cv >= 6
+ * osdmap::decode() conditional on cv >= 6
+map / 8 / 7 / 6 / 553048fbf97af999783deb7e992c8ecfa5e55500 / 13.0.2 / 2017
+inc / 8 / 6 / 6 /
+ * track newly removed and purged snaps in each epoch
+ * new_removed_snaps
+ * new_purged_snaps
+ * osdmap::encode() conditional on cv >= 7
+ * if SERVER_MIMIC not in features, cv = 6
+ * osdmap::decode() conditional cv >= 7
+map / 8 / 8 / 6 / f99c2a9fec65ad3ce275ef24bd167ee03275d3d7 / 14.0.1 / 2018
+inc / 8 / 7 / 6 /
+ * fix pre-addrvec compat
+ * osdmap::encode() conditional on cv >= 8, else encode client addrs
+ one by one in a loop.
+ * osdmap::decode() just bumps version (?)
+map / 8 / 8 / 7 / 9fb1e521c7c75c124b0dbf193e8b65ff1b5f461e / 14.0.1 / 2018
+inc / 8 / 7 / 7 /
+ * make cluster addrs into addrvecs too
+ * this will allow single-step upgrade from msgr1 to msgr2
+map / 8 / 9 / 7 / d414f0b43a69f3c2db8e454d795be881496237c6 / 14.0.1 / 2018
+inc / 8 / 8 / 7 /
+ * store last_up_change and last_in_change
+ * osdmap::encode() conditional on cv >= 9
+ * osdmap::decode() conditional on cv >= 9
+
+
+
+osd_info_t:
+v / commit / version / date / reason
+
+1 / e574c84a6a0c5a5070dc72d5f5d3d17914ef824a / 0.19 / 2010 / add struct_v
+
+osd_xinfo_t:
+v / commit / version / date
+
+1 / e9f051ef3c49a080b24d7811a16aefb64beacbbd / 0.53 / 2012
+ * add osd_xinfo_t
+2 / 31743d50a109a463d664ec9cf764d5405db507bd / 0.75 / 2013
+ * add features bit mask to osd_xinfo_t
+3 / 87722a42c286d4d12190b86b6d06d388e2953ba0 / 0.82 / 2014
+ * remember osd weight when auto-marking osds out
+
+rados.h:
+v / commit / version / date / reason
+
+- / 147c6f51e34a875ab65624df04baa8ef89296ddd / 0.19 / 2010 / move versions
+ 3 / CEPH_OSDMAP_INC_VERSION
+ 3 / CEPH_OSDMAP_VERSION
+ 2 / CEPH_PG_POOL_VERSION
diff --git a/doc/dev/osd_internals/partial_object_recovery.rst b/doc/dev/osd_internals/partial_object_recovery.rst
new file mode 100644
index 000000000..a22f63348
--- /dev/null
+++ b/doc/dev/osd_internals/partial_object_recovery.rst
@@ -0,0 +1,148 @@
+=======================
+Partial Object Recovery
+=======================
+
+Partial Object Recovery improves the efficiency of log-based recovery (vs
+backfill). Original log-based recovery calculates missing_set based on pg_log
+differences.
+
+The whole object should be recovery from one OSD to another
+if the object is indicated modified by pg_log regardless of how much
+content in the object is really modified. That means a 4M object,
+which is just modified 4k inside, should recovery the whole 4M object
+rather than the modified 4k content. In addition, object map should be
+also recovered even if it is not modified at all.
+
+Partial Object Recovery is designed to solve the problem mentioned above.
+In order to achieve the goals, two things should be done:
+
+1. logging where the object is modified is necessary
+2. logging whether the object_map of an object is modified is also necessary
+
+class ObjectCleanRegion is introduced to do what we want.
+clean_offsets is a variable of interval_set<uint64_t>
+and is used to indicate the unmodified content in an object.
+clean_omap is a variable of bool indicating whether object_map is modified.
+new_object means that osd does not exist for an object
+max_num_intervals is an upbound of the number of intervals in clean_offsets
+so that the memory cost of clean_offsets is always bounded.
+
+The shortest clean interval will be trimmed if the number of intervals
+in clean_offsets exceeds the boundary.
+
+ etc. max_num_intervals=2, clean_offsets:{[5~10], [20~5]}
+
+ then new interval [30~10] will evict out the shortest one [20~5]
+
+ finally, clean_offsets becomes {[5~10], [30~10]}
+
+Procedures for Partial Object Recovery
+======================================
+
+Firstly, OpContext and pg_log_entry_t should contain ObjectCleanRegion.
+In do_osd_ops(), finish_copyfrom(), finish_promote(), corresponding content
+in ObjectCleanRegion should mark dirty so that trace the modification of an object.
+Also update ObjectCleanRegion in OpContext to its pg_log_entry_t.
+
+Secondly, pg_missing_set can build and rebuild correctly.
+when calculating pg_missing_set during peering process,
+also merge ObjectCleanRegion in each pg_log_entry_t.
+
+ etc. object aa has pg_log:
+ 26'101 {[0~4096, 8192~MAX], false}
+
+ 26'104 {0~8192, 12288~MAX, false}
+
+ 28'108 {[0~12288, 16384~MAX], true}
+
+ missing_set for object aa: merge pg_log above --> {[0~4096, 16384~MAX], true}.
+ which means 4096~16384 is modified and object_map is also modified on version 28'108
+
+Also, OSD may be crash after merge log.
+Therefore, we need to read_log and rebuild pg_missing_set. For example, pg_log is:
+
+ object aa: 26'101 {[0~4096, 8192~MAX], false}
+
+ object bb: 26'102 {[0~4096, 8192~MAX], false}
+
+ object cc: 26'103 {[0~4096, 8192~MAX], false}
+
+ object aa: 26'104 {0~8192, 12288~MAX, false}
+
+ object dd: 26'105 {[0~4096, 8192~MAX], false}
+
+ object aa: 28'108 {[0~12288, 16384~MAX], true}
+
+Originally, if bb,cc,dd is recovered, and aa is not.
+So we need to rebuild pg_missing_set for object aa,
+and find aa is modified on version 28'108.
+If version in object_info is 26'96 < 28'108,
+we don't need to consider 26'104 and 26'101 because the whole object will be recovered.
+However, Partial Object Recovery should also require us to rebuild ObjectCleanRegion.
+
+Knowing whether the object is modified is not enough.
+
+Therefore, we also need to traverse the pg_log before,
+that says 26'104 and 26'101 also > object_info(26'96)
+and rebuild pg_missing_set for object aa based on those three logs: 28'108, 26'104, 26'101.
+The way how to merge logs is the same as mentioned above
+
+Finally, finish the push and pull process based on pg_missing_set.
+Updating copy_subset in recovery_info based on ObjectCleanRegion in pg_missing_set.
+copy_subset indicates the intervals of content need to pull and push.
+
+The complicated part here is submit_push_data
+and serval cases should be considered separately.
+what we need to consider is how to deal with the object data,
+object data makes up of omap_header, xattrs, omap, data:
+
+case 1: first && complete: since object recovering is finished in a single PushOp,
+we would like to preserve the original object and overwrite on the object directly.
+Object will not be removed and touch a new one.
+
+ issue 1: As object is not removed, old xattrs remain in the old object
+ but maybe updated in new object. Overwriting for the same key or adding new keys is correct,
+ but removing keys will be wrong.
+ In order to solve this issue, We need to remove the all original xattrs in the object, and then update new xattrs.
+
+ issue 2: As object is not removed,
+ object_map may be recovered depending on the clean_omap.
+ Therefore, if recovering clean_omap, we need to remove old omap of the object for the same reason
+ since omap updating may also be a deletion.
+ Thus, in this case, we should do:
+
+ 1) clear xattrs of the object
+ 2) clear omap of the object if omap recovery is needed
+ 3) truncate the object into recovery_info.size
+ 4) recovery omap_header
+ 5) recovery xattrs, and recover omap if needed
+ 6) punch zeros for original object if fiemap tells nothing there
+ 7) overwrite object content which is modified
+ 8) finish recovery
+
+case 2: first && !complete: object recovering should be done in multiple times.
+Here, target_oid will indicate a new temp_object in pgid_TEMP,
+so the issues are a bit difference.
+
+ issue 1: As object is newly created, there is no need to deal with xattrs
+
+ issue 2: As object is newly created,
+ and object_map may not be transmitted depending on clean_omap.
+ Therefore, if clean_omap is true, we need to clone object_map from original object.
+ issue 3: As object is newly created, and unmodified data will not be transmitted.
+ Therefore, we need to clone unmodified data from the original object.
+ Thus, in this case, we should do:
+
+ 1) remove the temp object
+ 2) create a new temp object
+ 3) set alloc_hint for the new temp object
+ 4) truncate new temp object to recovery_info.size
+ 5) recovery omap_header
+ 6) clone object_map from original object if omap is clean
+ 7) clone unmodified object_data from original object
+ 8) punch zeros for the new temp object
+ 9) recovery xattrs, and recover omap if needed
+ 10) overwrite object content which is modified
+ 11) remove the original object
+ 12) move and rename the new temp object to replace the original object
+ 13) finish recovery
diff --git a/doc/dev/osd_internals/pg.rst b/doc/dev/osd_internals/pg.rst
new file mode 100644
index 000000000..397d4ab5d
--- /dev/null
+++ b/doc/dev/osd_internals/pg.rst
@@ -0,0 +1,31 @@
+====
+PG
+====
+
+Concepts
+--------
+
+*Peering Interval*
+ See PG::start_peering_interval.
+ See PG::acting_up_affected
+ See PG::PeeringState::Reset
+
+ A peering interval is a maximal set of contiguous map epochs in which the
+ up and acting sets did not change. PG::PeeringMachine represents a
+ transition from one interval to another as passing through
+ PeeringState::Reset. On PG::PeeringState::AdvMap PG::acting_up_affected can
+ cause the pg to transition to Reset.
+
+
+Peering Details and Gotchas
+---------------------------
+For an overview of peering, see `Peering <../../peering>`_.
+
+ * PG::flushed defaults to false and is set to false in
+ PG::start_peering_interval. Upon transitioning to PG::PeeringState::Started
+ we send a transaction through the pg op sequencer which, upon complete,
+ sends a FlushedEvt which sets flushed to true. The primary cannot go
+ active until this happens (See PG::PeeringState::WaitFlushedPeering).
+ Replicas can go active but cannot serve ops (writes or reads).
+ This is necessary because we cannot read our ondisk state until unstable
+ transactions from the previous interval have cleared.
diff --git a/doc/dev/osd_internals/pg_removal.rst b/doc/dev/osd_internals/pg_removal.rst
new file mode 100644
index 000000000..c5fe0e1ab
--- /dev/null
+++ b/doc/dev/osd_internals/pg_removal.rst
@@ -0,0 +1,56 @@
+==========
+PG Removal
+==========
+
+See OSD::_remove_pg, OSD::RemoveWQ
+
+There are two ways for a pg to be removed from an OSD:
+
+ 1. MOSDPGRemove from the primary
+ 2. OSD::advance_map finds that the pool has been removed
+
+In either case, our general strategy for removing the pg is to
+atomically set the metadata objects (pg->log_oid, pg->biginfo_oid) to
+backfill and asynchronously remove the pg collections. We do not do
+this inline because scanning the collections to remove the objects is
+an expensive operation.
+
+OSDService::deleting_pgs tracks all pgs in the process of being
+deleted. Each DeletingState object in deleting_pgs lives while at
+least one reference to it remains. Each item in RemoveWQ carries a
+reference to the DeletingState for the relevant pg such that
+deleting_pgs.lookup(pgid) will return a null ref only if there are no
+collections currently being deleted for that pg.
+
+The DeletingState for a pg also carries information about the status
+of the current deletion and allows the deletion to be cancelled.
+The possible states are:
+
+ 1. QUEUED: the PG is in the RemoveWQ
+ 2. CLEARING_DIR: the PG's contents are being removed synchronously
+ 3. DELETING_DIR: the PG's directories and metadata being queued for removal
+ 4. DELETED_DIR: the final removal transaction has been queued
+ 5. CANCELED: the deletion has been cancelled
+
+In 1 and 2, the deletion can be cancelled. Each state transition
+method (and check_canceled) returns false if deletion has been
+cancelled and true if the state transition was successful. Similarly,
+try_stop_deletion() returns true if it succeeds in cancelling the
+deletion. Additionally, try_stop_deletion() in the event that it
+fails to stop the deletion will not return until the final removal
+transaction is queued. This ensures that any operations queued after
+that point will be ordered after the pg deletion.
+
+OSD::_create_lock_pg must handle two cases:
+
+ 1. Either there is no DeletingStateRef for the pg, or it failed to cancel
+ 2. We succeeded in cancelling the deletion.
+
+In case 1., we proceed as if there were no deletion occurring, except that
+we avoid writing to the PG until the deletion finishes. In case 2., we
+proceed as in case 1., except that we first mark the PG as backfilling.
+
+Similarly, OSD::osr_registry ensures that the OpSequencers for those
+pgs can be reused for a new pg if created before the old one is fully
+removed, ensuring that operations on the new pg are sequenced properly
+with respect to operations on the old one.
diff --git a/doc/dev/osd_internals/pgpool.rst b/doc/dev/osd_internals/pgpool.rst
new file mode 100644
index 000000000..45a252bd4
--- /dev/null
+++ b/doc/dev/osd_internals/pgpool.rst
@@ -0,0 +1,22 @@
+==================
+PGPool
+==================
+
+PGPool is a structure used to manage and update the status of removed
+snapshots. It does this by maintaining two fields, cached_removed_snaps - the
+current removed snap set and newly_removed_snaps - newly removed snaps in the
+last epoch. In OSD::load_pgs the osd map is recovered from the pg's file store
+and passed down to OSD::_get_pool where a PGPool object is initialised with the
+map.
+
+With each new map we receive we call PGPool::update with the new map. In that
+function we build a list of newly removed snaps
+(pg_pool_t::build_removed_snaps) and merge that with our cached_removed_snaps.
+This function included checks to make sure we only do this update when things
+have changed or there has been a map gap.
+
+When we activate the pg we initialise the snap trim queue from
+cached_removed_snaps and subtract the purged_snaps we have already purged
+leaving us with the list of snaps that need to be trimmed. Trimming is later
+performed asynchronously by the snap_trim_wq.
+
diff --git a/doc/dev/osd_internals/recovery_reservation.rst b/doc/dev/osd_internals/recovery_reservation.rst
new file mode 100644
index 000000000..a24ac1b15
--- /dev/null
+++ b/doc/dev/osd_internals/recovery_reservation.rst
@@ -0,0 +1,83 @@
+====================
+Recovery Reservation
+====================
+
+Recovery reservation extends and subsumes backfill reservation. The
+reservation system from backfill recovery is used for local and remote
+reservations.
+
+When a PG goes active, first it determines what type of recovery is
+necessary, if any. It may need log-based recovery, backfill recovery,
+both, or neither.
+
+In log-based recovery, the primary first acquires a local reservation
+from the OSDService's local_reserver. Then a MRemoteReservationRequest
+message is sent to each replica in order of OSD number. These requests
+will always be granted (i.e., cannot be rejected), but they may take
+some time to be granted if the remotes have already granted all their
+remote reservation slots.
+
+After all reservations are acquired, log-based recovery proceeds as it
+would without the reservation system.
+
+After log-based recovery completes, the primary releases all remote
+reservations. The local reservation remains held. The primary then
+determines whether backfill is necessary. If it is not necessary, the
+primary releases its local reservation and waits in the Recovered state
+for all OSDs to indicate that they are clean.
+
+If backfill recovery occurs after log-based recovery, the local
+reservation does not need to be reacquired since it is still held from
+before. If it occurs immediately after activation (log-based recovery
+not possible/necessary), the local reservation is acquired according to
+the typical process.
+
+Once the primary has its local reservation, it requests a remote
+reservation from the backfill target. This reservation CAN be rejected,
+for instance if the OSD is too full (backfillfull_ratio osd setting).
+If the reservation is rejected, the primary drops its local
+reservation, waits (osd_backfill_retry_interval), and then retries. It
+will retry indefinitely.
+
+Once the primary has the local and remote reservations, backfill
+proceeds as usual. After backfill completes the remote reservation is
+dropped.
+
+Finally, after backfill (or log-based recovery if backfill was not
+necessary), the primary drops the local reservation and enters the
+Recovered state. Once all the PGs have reported they are clean, the
+primary enters the Clean state and marks itself active+clean.
+
+-----------------
+Dump Reservations
+-----------------
+
+An OSD daemon command dumps total local and remote reservations::
+
+ ceph daemon osd.<id> dump_recovery_reservations
+
+
+--------------
+Things to Note
+--------------
+
+We always grab the local reservation first, to prevent a circular
+dependency. We grab remote reservations in order of OSD number for the
+same reason.
+
+The recovery reservation state chart controls the PG state as reported
+to the monitor. The state chart can set:
+
+ - recovery_wait: waiting for local/remote reservations
+ - recovering: recovering
+ - recovery_toofull: recovery stopped, OSD(s) above full ratio
+ - backfill_wait: waiting for remote backfill reservations
+ - backfilling: backfilling
+ - backfill_toofull: backfill stopped, OSD(s) above backfillfull ratio
+
+
+--------
+See Also
+--------
+
+The Active substate of the automatically generated OSD state diagram.
diff --git a/doc/dev/osd_internals/refcount.rst b/doc/dev/osd_internals/refcount.rst
new file mode 100644
index 000000000..4d75ae019
--- /dev/null
+++ b/doc/dev/osd_internals/refcount.rst
@@ -0,0 +1,45 @@
+========
+Refcount
+========
+
+
+Introduction
+============
+
+Dedupliation, as described in ../deduplication.rst, needs a way to
+maintain a target pool of deduplicated chunks with atomic ref
+refcounting. To that end, there exists an osd object class
+refcount responsible for using the object class machinery to
+maintain refcounts on deduped chunks and ultimately remove them
+as the refcount hits 0.
+
+Class Interface
+===============
+
+See cls/refcount/cls_refcount_client*
+
+* cls_refcount_get
+
+ Atomically increments the refcount with specified tag ::
+
+ void cls_refcount_get(librados::ObjectWriteOperation& op, const string& tag, bool implicit_ref = false);
+
+* cls_refcount_put
+
+ Atomically decrements the refcount specified by passed tag ::
+
+ void cls_refcount_put(librados::ObjectWriteOperation& op, const string& tag, bool implicit_ref = false);
+
+* cls_refcount_Set
+
+ Atomically sets the set of refcounts with passed list of tags ::
+
+ void cls_refcount_set(librados::ObjectWriteOperation& op, list<string>& refs);
+
+* cls_refcount_read
+
+ Dumps the current set of ref tags for the object ::
+
+ int cls_refcount_read(librados::IoCtx& io_ctx, string& oid, list<string> *refs, bool implicit_ref = false);
+
+
diff --git a/doc/dev/osd_internals/scrub.rst b/doc/dev/osd_internals/scrub.rst
new file mode 100644
index 000000000..149509799
--- /dev/null
+++ b/doc/dev/osd_internals/scrub.rst
@@ -0,0 +1,41 @@
+
+Scrub internals and diagnostics
+===============================
+
+Scrubbing Behavior Table
+------------------------
+
++-------------------------------------------------+----------+-----------+---------------+----------------------+
+| Flags | none | noscrub | nodeep_scrub | noscrub/nodeep_scrub |
++=================================================+==========+===========+===============+======================+
+| Periodic tick | S | X | S | X |
++-------------------------------------------------+----------+-----------+---------------+----------------------+
+| Periodic tick after osd_deep_scrub_interval | D | D | S | X |
++-------------------------------------------------+----------+-----------+---------------+----------------------+
+| Initiated scrub | S | S | S | S |
++-------------------------------------------------+----------+-----------+---------------+----------------------+
+| Initiated scrub after osd_deep_scrub_interval | D | D | S | S |
++-------------------------------------------------+----------+-----------+---------------+----------------------+
+| Initiated deep scrub | D | D | D | D |
++-------------------------------------------------+----------+-----------+---------------+----------------------+
+
+- X = Do nothing
+- S = Do regular scrub
+- D = Do deep scrub
+
+State variables
+---------------
+
+- Periodic tick state is ``!must_scrub && !must_deep_scrub && !time_for_deep``
+- Periodic tick after ``osd_deep_scrub_interval state is !must_scrub && !must_deep_scrub && time_for_deep``
+- Initiated scrub state is ``must_scrub && !must_deep_scrub && !time_for_deep``
+- Initiated scrub after ``osd_deep_scrub_interval`` state is ``must_scrub && !must_deep_scrub && time_for_deep``
+- Initiated deep scrub state is ``must_scrub && must_deep_scrub``
+
+Scrub Reservations
+------------------
+
+An OSD daemon command dumps total local and remote reservations::
+
+ ceph daemon osd.<id> dump_scrub_reservations
+
diff --git a/doc/dev/osd_internals/snaps.rst b/doc/dev/osd_internals/snaps.rst
new file mode 100644
index 000000000..5ebd0884a
--- /dev/null
+++ b/doc/dev/osd_internals/snaps.rst
@@ -0,0 +1,128 @@
+======
+Snaps
+======
+
+Overview
+--------
+Rados supports two related snapshotting mechanisms:
+
+ 1. *pool snaps*: snapshots are implicitly applied to all objects
+ in a pool
+ 2. *self managed snaps*: the user must provide the current *SnapContext*
+ on each write.
+
+These two are mutually exclusive, only one or the other can be used on
+a particular pool.
+
+The *SnapContext* is the set of snapshots currently defined for an object
+as well as the most recent snapshot (the *seq*) requested from the mon for
+sequencing purposes (a *SnapContext* with a newer *seq* is considered to
+be more recent).
+
+The difference between *pool snaps* and *self managed snaps* from the
+OSD's point of view lies in whether the *SnapContext* comes to the OSD
+via the client's MOSDOp or via the most recent OSDMap.
+
+See OSD::make_writeable
+
+Ondisk Structures
+-----------------
+Each object has in the PG collection a *head* object (or *snapdir*, which we
+will come to shortly) and possibly a set of *clone* objects.
+Each hobject_t has a snap field. For the *head* (the only writeable version
+of an object), the snap field is set to CEPH_NOSNAP. For the *clones*, the
+snap field is set to the *seq* of the *SnapContext* at their creation.
+When the OSD services a write, it first checks whether the most recent
+*clone* is tagged with a snapid prior to the most recent snap represented
+in the *SnapContext*. If so, at least one snapshot has occurred between
+the time of the write and the time of the last clone. Therefore, prior
+to performing the mutation, the OSD creates a new clone for servicing
+reads on snaps between the snapid of the last clone and the most recent
+snapid.
+
+The *head* object contains a *SnapSet* encoded in an attribute, which tracks
+
+ 1. The full set of snaps defined for the object
+ 2. The full set of clones which currently exist
+ 3. Overlapping intervals between clones for tracking space usage
+ 4. Clone size
+
+If the *head* is deleted while there are still clones, a *snapdir* object
+is created instead to house the *SnapSet*.
+
+Additionally, the *object_info_t* on each clone includes a vector of snaps
+for which clone is defined.
+
+Snap Removal
+------------
+To remove a snapshot, a request is made to the *Monitor* cluster to
+add the snapshot id to the list of purged snaps (or to remove it from
+the set of pool snaps in the case of *pool snaps*). In either case,
+the *PG* adds the snap to its *snap_trimq* for trimming.
+
+A clone can be removed when all of its snaps have been removed. In
+order to determine which clones might need to be removed upon snap
+removal, we maintain a mapping from snap to *hobject_t* using the
+*SnapMapper*.
+
+See PrimaryLogPG::SnapTrimmer, SnapMapper
+
+This trimming is performed asynchronously by the snap_trim_wq while the
+PG is clean and not scrubbing.
+
+ #. The next snap in PG::snap_trimq is selected for trimming
+ #. We determine the next object for trimming out of PG::snap_mapper.
+ For each object, we create a log entry and repop updating the
+ object info and the snap set (including adjusting the overlaps).
+ If the object is a clone which no longer belongs to any live snapshots,
+ it is removed here. (See PrimaryLogPG::trim_object() when new_snaps
+ is empty.)
+ #. We also locally update our *SnapMapper* instance with the object's
+ new snaps.
+ #. The log entry containing the modification of the object also
+ contains the new set of snaps, which the replica uses to update
+ its own *SnapMapper* instance.
+ #. The primary shares the info with the replica, which persists
+ the new set of purged_snaps along with the rest of the info.
+
+
+
+Recovery
+--------
+Because the trim operations are implemented using repops and log entries,
+normal PG peering and recovery maintain the snap trimmer operations with
+the caveat that push and removal operations need to update the local
+*SnapMapper* instance. If the purged_snaps update is lost, we merely
+retrim a now empty snap.
+
+SnapMapper
+----------
+*SnapMapper* is implemented on top of map_cacher<string, bufferlist>,
+which provides an interface over a backing store such as the file system
+with async transactions. While transactions are incomplete, the map_cacher
+instance buffers unstable keys allowing consistent access without having
+to flush the filestore. *SnapMapper* provides two mappings:
+
+ 1. hobject_t -> set<snapid_t>: stores the set of snaps for each clone
+ object
+ 2. snapid_t -> hobject_t: stores the set of hobjects with the snapshot
+ as one of its snaps
+
+Assumption: there are lots of hobjects and relatively few snaps. The
+first encoding has a stringification of the object as the key and an
+encoding of the set of snaps as a value. The second mapping, because there
+might be many hobjects for a single snap, is stored as a collection of keys
+of the form stringify(snap)_stringify(object) such that stringify(snap)
+is constant length. These keys have a bufferlist encoding
+pair<snapid, hobject_t> as a value. Thus, creating or trimming a single
+object does not involve reading all objects for any snap. Additionally,
+upon construction, the *SnapMapper* is provided with a mask for filtering
+the objects in the single SnapMapper keyspace belonging to that PG.
+
+Split
+-----
+The snapid_t -> hobject_t key entries are arranged such that for any PG,
+up to 8 prefixes need to be checked to determine all hobjects in a particular
+snap for a particular PG. Upon split, the prefixes to check on the parent
+are adjusted such that only the objects remaining in the PG will be visible.
+The children will immediately have the correct mapping.
diff --git a/doc/dev/osd_internals/stale_read.rst b/doc/dev/osd_internals/stale_read.rst
new file mode 100644
index 000000000..67eac9d7f
--- /dev/null
+++ b/doc/dev/osd_internals/stale_read.rst
@@ -0,0 +1,101 @@
+Preventing Stale Reads
+======================
+
+We write synchronously to all replicas before sending an ack to the
+client, which ensures that we do not introduce potential inconsistency
+in the write path. However, we only read from one replica, and the
+client will use whatever OSDMap is has to identify which OSD to read
+from. In most cases, this is fine: either the client map is correct,
+or the OSD that we think is the primary for the object knows that it
+is not the primary anymore, and can feed the client an updated map
+indicating a newer primary.
+
+They key is to ensure that this is *always* true. In particular, we
+need to ensure that an OSD that is fenced off from its peers and has
+not learned about a map update does not continue to service read
+requests from similarly stale clients at any point after which a new
+primary may have been allowed to make a write.
+
+We accomplish this via a mechanism that works much like a read lease.
+Each pool may have a ``read_lease_interval`` property which defines
+how long this is, although by default we simply set it to
+``osd_pool_default_read_lease_ratio`` (default: .8) times the
+``osd_heartbeat_grace``. (This way the lease will generally have
+expired by the time we mark a failed OSD down.)
+
+readable_until
+--------------
+
+Primary and replica both track a couple of values:
+
+* *readable_until* is how long we are allowed to service (read)
+ requests before *our* "lease" expires.
+* *readable_until_ub* is an upper bound on *readable_until* for any
+ OSD in the acting set.
+
+The primary manages these two values by sending *pg_lease_t* messages
+to replicas that increase the upper bound. Once all acting OSDs have
+acknowledged they've seen the higher bound, the primary increases its
+own *readable_until* and shares that (in a subsequent *pg_lease_t*
+message). The resulting invariant is that any acting OSDs'
+*readable_until* is always <= any acting OSDs' *readable_until_ub*.
+
+In order to avoid any problems with clock skew, we use monotonic
+clocks (which are only accurate locally and unaffected by time
+adjustments) throughout to manage these leases. Peer OSDs calculate
+upper and lower bounds on the deltas between OSD-local clocks,
+allowing the primary to share timestamps based on its local clock
+while replicas translate that to an appropriate bound in for their own
+local clocks.
+
+Prior Intervals
+---------------
+
+Whenever there is an interval change, we need to have an upper bound
+on the *readable_until* values for any OSDs in the prior interval.
+All OSDs from that interval have this value (*readable_until_ub*), and
+share it as part of the pg_history_t during peering.
+
+Because peering may involve OSDs that were not already communicating
+before and may not have bounds on their clock deltas, the bound in
+*pg_history_t* is shared as a simple duration before the upper bound
+expires. This means that the bound slips forward in time due to the
+transit time for the peering message, but that is generally quite
+short, and moving the bound later in time is safe since it is an
+*upper* bound.
+
+PG "laggy" state
+----------------
+
+While the PG is active, *pg_lease_t* and *pg_lease_ack_t* messages are
+regularly exchanged. However, if a client request comes in and the
+lease has expired (*readable_until* has passed), the PG will go into a
+*LAGGY* state and request will be blocked. Once the lease is renewed,
+the request(s) will be requeued.
+
+PG "wait" state
+---------------
+
+If peering completes but the prior interval's OSDs may still be
+readable, the PG will go into the *WAIT* state until sufficient time
+has passed. Any OSD requests will block during that period. Recovery
+may proceed while in this state, since the logical, user-visible
+content of objects does not change.
+
+Dead OSDs
+---------
+
+Generally speaking, we need to wait until prior intervals' OSDs *know*
+that they should no longer be readable. If an OSD is known to have
+crashed (e.g., because the process is no longer running, which we may
+infer because we get a ECONNREFUSED error), then we can infer that it
+is not readable.
+
+Similarly, if an OSD is marked down, gets a map update telling it so,
+and then informs the monitor that it knows it was marked down, we can
+similarly infer that it is not still serving requests for a prior interval.
+
+When a PG is in the *WAIT* state, it will watch new maps for OSDs'
+*dead_epoch* value indicating they are aware of their dead-ness. If
+all down OSDs from prior interval are so aware, we can exit the WAIT
+state early.
diff --git a/doc/dev/osd_internals/watch_notify.rst b/doc/dev/osd_internals/watch_notify.rst
new file mode 100644
index 000000000..8c2ce09ba
--- /dev/null
+++ b/doc/dev/osd_internals/watch_notify.rst
@@ -0,0 +1,81 @@
+============
+Watch Notify
+============
+
+See librados for the watch/notify interface.
+
+Overview
+--------
+The object_info (See osd/osd_types.h) tracks the set of watchers for
+a particular object persistently in the object_info_t::watchers map.
+In order to track notify progress, we also maintain some ephemeral
+structures associated with the ObjectContext.
+
+Each Watch has an associated Watch object (See osd/Watch.h). The
+ObjectContext for a watched object will have a (strong) reference
+to one Watch object per watch, and each Watch object holds a
+reference to the corresponding ObjectContext. This circular reference
+is deliberate and is broken when the Watch state is discarded on
+a new peering interval or removed upon timeout expiration or an
+unwatch operation.
+
+A watch tracks the associated connection via a strong
+ConnectionRef Watch::conn. The associated connection has a
+WatchConState stashed in the OSD::Session for tracking associated
+Watches in order to be able to notify them upon ms_handle_reset()
+(via WatchConState::reset()).
+
+Each Watch object tracks the set of currently un-acked notifies.
+start_notify() on a Watch object adds a reference to a new in-progress
+Notify to the Watch and either:
+
+* if the Watch is *connected*, sends a Notify message to the client
+* if the Watch is *unconnected*, does nothing.
+
+When the Watch becomes connected (in PrimaryLogPG::do_osd_op_effects),
+Notifies are resent to all remaining tracked Notify objects.
+
+Each Notify object tracks the set of un-notified Watchers via
+calls to complete_watcher(). Once the remaining set is empty or the
+timeout expires (cb, registered in init()) a notify completion
+is sent to the client.
+
+Watch Lifecycle
+---------------
+A watch may be in one of 5 states:
+
+1. Non existent.
+2. On disk, but not registered with an object context.
+3. Connected
+4. Disconnected, callback registered with timer
+5. Disconnected, callback in queue for scrub or is_degraded
+
+Case 2 occurs between when an OSD goes active and the ObjectContext
+for an object with watchers is loaded into memory due to an access.
+During Case 2, no state is registered for the watch. Case 2
+transitions to Case 4 in PrimaryLogPG::populate_obc_watchers() during
+PrimaryLogPG::find_object_context. Case 1 becomes case 3 via
+OSD::do_osd_op_effects due to a watch operation. Case 4,5 become case
+3 in the same way. Case 3 becomes case 4 when the connection resets
+on a watcher's session.
+
+Cases 4&5 can use some explanation. Normally, when a Watch enters Case
+4, a callback is registered with the OSDService::watch_timer to be
+called at timeout expiration. At the time that the callback is
+called, however, the pg might be in a state where it cannot write
+to the object in order to remove the watch (i.e., during a scrub
+or while the object is degraded). In that case, we use
+Watch::get_delayed_cb() to generate another Context for use from
+the callbacks_for_degraded_object and Scrubber::callbacks lists.
+In either case, Watch::unregister_cb() does the right thing
+(SafeTimer::cancel_event() is harmless for contexts not registered
+with the timer).
+
+Notify Lifecycle
+----------------
+The notify timeout is simpler: a timeout callback is registered when
+the notify is init()'d. If all watchers ack notifies before the
+timeout occurs, the timeout is canceled and the client is notified
+of the notify completion. Otherwise, the timeout fires, the Notify
+object pings each Watch via cancel_notify to remove itself, and
+sends the notify completion to the client early.
diff --git a/doc/dev/osd_internals/wbthrottle.rst b/doc/dev/osd_internals/wbthrottle.rst
new file mode 100644
index 000000000..9b67efbb6
--- /dev/null
+++ b/doc/dev/osd_internals/wbthrottle.rst
@@ -0,0 +1,28 @@
+==================
+Writeback Throttle
+==================
+
+Previously, the filestore had a problem when handling large numbers of
+small ios. We throttle dirty data implicitly via the journal, but
+a large number of inodes can be dirtied without filling the journal
+resulting in a very long sync time when the sync finally does happen.
+The flusher was not an adequate solution to this problem since it
+forced writeback of small writes too eagerly killing performance.
+
+WBThrottle tracks unflushed io per hobject_t and ::fsyncs in lru
+order once the start_flusher threshold is exceeded for any of
+dirty bytes, dirty ios, or dirty inodes. While any of these exceed
+the hard_limit, we block on throttle() in _do_op.
+
+See src/os/WBThrottle.h, src/osd/WBThrottle.cc
+
+To track the open FDs through the writeback process, there is now an
+fdcache to cache open fds. lfn_open now returns a cached FDRef which
+implicitly closes the fd once all references have expired.
+
+Filestore syncs have a sideeffect of flushing all outstanding objects
+in the wbthrottle.
+
+lfn_unlink clears the cached FDRef and wbthrottle entries for the
+unlinked object when the last link is removed and asserts that all
+outstanding FDRefs for that object are dead.
diff --git a/doc/dev/peering.rst b/doc/dev/peering.rst
new file mode 100644
index 000000000..7ee5debc9
--- /dev/null
+++ b/doc/dev/peering.rst
@@ -0,0 +1,259 @@
+======================
+Peering
+======================
+
+Concepts
+--------
+
+*Peering*
+ the process of bringing all of the OSDs that store
+ a Placement Group (PG) into agreement about the state
+ of all of the objects (and their metadata) in that PG.
+ Note that agreeing on the state does not mean that
+ they all have the latest contents.
+
+*Acting set*
+ the ordered list of OSDs who are (or were as of some epoch)
+ responsible for a particular PG.
+
+*Up set*
+ the ordered list of OSDs responsible for a particular PG for
+ a particular epoch according to CRUSH. Normally this
+ is the same as the *acting set*, except when the *acting set* has been
+ explicitly overridden via *PG temp* in the OSDMap.
+
+*PG temp*
+ a temporary placement group acting set used while backfilling the
+ primary osd. Let say acting is [0,1,2] and we are
+ active+clean. Something happens and acting is now [3,1,2]. osd 3 is
+ empty and can't serve reads although it is the primary. osd.3 will
+ see that and request a *PG temp* of [1,2,3] to the monitors using a
+ MOSDPGTemp message so that osd.1 temporarily becomes the
+ primary. It will select osd.3 as a backfill peer and continue to
+ serve reads and writes while osd.3 is backfilled. When backfilling
+ is complete, *PG temp* is discarded and the acting set changes back
+ to [3,1,2] and osd.3 becomes the primary.
+
+*current interval* or *past interval*
+ a sequence of OSD map epochs during which the *acting set* and *up
+ set* for particular PG do not change
+
+*primary*
+ the (by convention first) member of the *acting set*,
+ who is responsible for coordination peering, and is
+ the only OSD that will accept client initiated
+ writes to objects in a placement group.
+
+*replica*
+ a non-primary OSD in the *acting set* for a placement group
+ (and who has been recognized as such and *activated* by the primary).
+
+*stray*
+ an OSD who is not a member of the current *acting set*, but
+ has not yet been told that it can delete its copies of a
+ particular placement group.
+
+*recovery*
+ ensuring that copies of all of the objects in a PG
+ are on all of the OSDs in the *acting set*. Once
+ *peering* has been performed, the primary can start
+ accepting write operations, and *recovery* can proceed
+ in the background.
+
+*PG info* basic metadata about the PG's creation epoch, the version
+ for the most recent write to the PG, *last epoch started*, *last
+ epoch clean*, and the beginning of the *current interval*. Any
+ inter-OSD communication about PGs includes the *PG info*, such that
+ any OSD that knows a PG exists (or once existed) also has a lower
+ bound on *last epoch clean* or *last epoch started*.
+
+*PG log*
+ a list of recent updates made to objects in a PG.
+ Note that these logs can be truncated after all OSDs
+ in the *acting set* have acknowledged up to a certain
+ point.
+
+*missing set*
+ Each OSD notes update log entries and if they imply updates to
+ the contents of an object, adds that object to a list of needed
+ updates. This list is called the *missing set* for that <OSD,PG>.
+
+*Authoritative History*
+ a complete, and fully ordered set of operations that, if
+ performed, would bring an OSD's copy of a Placement Group
+ up to date.
+
+*epoch*
+ a (monotonically increasing) OSD map version number
+
+*last epoch start*
+ the last epoch at which all nodes in the *acting set*
+ for a particular placement group agreed on an
+ *authoritative history*. At this point, *peering* is
+ deemed to have been successful.
+
+*up_thru*
+ before a primary can successfully complete the *peering* process,
+ it must inform a monitor that is alive through the current
+ OSD map epoch by having the monitor set its *up_thru* in the osd
+ map. This helps peering ignore previous *acting sets* for which
+ peering never completed after certain sequences of failures, such as
+ the second interval below:
+
+ - *acting set* = [A,B]
+ - *acting set* = [A]
+ - *acting set* = [] very shortly after (e.g., simultaneous failure, but staggered detection)
+ - *acting set* = [B] (B restarts, A does not)
+
+*last epoch clean*
+ the last epoch at which all nodes in the *acting set*
+ for a particular placement group were completely
+ up to date (both PG logs and object contents).
+ At this point, *recovery* is deemed to have been
+ completed.
+
+Description of the Peering Process
+----------------------------------
+
+The *Golden Rule* is that no write operation to any PG
+is acknowledged to a client until it has been persisted
+by all members of the *acting set* for that PG. This means
+that if we can communicate with at least one member of
+each *acting set* since the last successful *peering*, someone
+will have a record of every (acknowledged) operation
+since the last successful *peering*.
+This means that it should be possible for the current
+primary to construct and disseminate a new *authoritative history*.
+
+It is also important to appreciate the role of the OSD map
+(list of all known OSDs and their states, as well as some
+information about the placement groups) in the *peering*
+process:
+
+ When OSDs go up or down (or get added or removed)
+ this has the potential to affect the *active sets*
+ of many placement groups.
+
+ Before a primary successfully completes the *peering*
+ process, the OSD map must reflect that the OSD was alive
+ and well as of the first epoch in the *current interval*.
+
+ Changes can only be made after successful *peering*.
+
+Thus, a new primary can use the latest OSD map along with a recent
+history of past maps to generate a set of *past intervals* to
+determine which OSDs must be consulted before we can successfully
+*peer*. The set of past intervals is bounded by *last epoch started*,
+the most recent *past interval* for which we know *peering* completed.
+The process by which an OSD discovers a PG exists in the first place is
+by exchanging *PG info* messages, so the OSD always has some lower
+bound on *last epoch started*.
+
+The high level process is for the current PG primary to:
+
+ 1. get a recent OSD map (to identify the members of the all
+ interesting *acting sets*, and confirm that we are still the
+ primary).
+
+ #. generate a list of *past intervals* since *last epoch started*.
+ Consider the subset of those for which *up_thru* was greater than
+ the first interval epoch by the last interval epoch's OSD map; that is,
+ the subset for which *peering* could have completed before the *acting
+ set* changed to another set of OSDs.
+
+ Successful *peering* will require that we be able to contact at
+ least one OSD from each of *past interval*'s *acting set*.
+
+ #. ask every node in that list for its *PG info*, which includes the most
+ recent write made to the PG, and a value for *last epoch started*. If
+ we learn about a *last epoch started* that is newer than our own, we can
+ prune older *past intervals* and reduce the peer OSDs we need to contact.
+
+ #. if anyone else has (in its PG log) operations that I do not have,
+ instruct them to send me the missing log entries so that the primary's
+ *PG log* is up to date (includes the newest write)..
+
+ #. for each member of the current *acting set*:
+
+ a. ask it for copies of all PG log entries since *last epoch start*
+ so that I can verify that they agree with mine (or know what
+ objects I will be telling it to delete).
+
+ If the cluster failed before an operation was persisted by all
+ members of the *acting set*, and the subsequent *peering* did not
+ remember that operation, and a node that did remember that
+ operation later rejoined, its logs would record a different
+ (divergent) history than the *authoritative history* that was
+ reconstructed in the *peering* after the failure.
+
+ Since the *divergent* events were not recorded in other logs
+ from that *acting set*, they were not acknowledged to the client,
+ and there is no harm in discarding them (so that all OSDs agree
+ on the *authoritative history*). But, we will have to instruct
+ any OSD that stores data from a divergent update to delete the
+ affected (and now deemed to be apocryphal) objects.
+
+ #. ask it for its *missing set* (object updates recorded
+ in its PG log, but for which it does not have the new data).
+ This is the list of objects that must be fully replicated
+ before we can accept writes.
+
+ #. at this point, the primary's PG log contains an *authoritative history* of
+ the placement group, and the OSD now has sufficient
+ information to bring any other OSD in the *acting set* up to date.
+
+ #. if the primary's *up_thru* value in the current OSD map is not greater than
+ or equal to the first epoch in the *current interval*, send a request to the
+ monitor to update it, and wait until receive an updated OSD map that reflects
+ the change.
+
+ #. for each member of the current *acting set*:
+
+ a. send them log updates to bring their PG logs into agreement with
+ my own (*authoritative history*) ... which may involve deciding
+ to delete divergent objects.
+
+ #. await acknowledgment that they have persisted the PG log entries.
+
+ #. at this point all OSDs in the *acting set* agree on all of the meta-data,
+ and would (in any future *peering*) return identical accounts of all
+ updates.
+
+ a. start accepting client write operations (because we have unanimous
+ agreement on the state of the objects into which those updates are
+ being accepted). Note, however, that if a client tries to write to an
+ object it will be promoted to the front of the recovery queue, and the
+ write willy be applied after it is fully replicated to the current *acting set*.
+
+ #. update the *last epoch started* value in our local *PG info*, and instruct
+ other *active set* OSDs to do the same.
+
+ #. start pulling object data updates that other OSDs have, but I do not. We may
+ need to query OSDs from additional *past intervals* prior to *last epoch started*
+ (the last time *peering* completed) and following *last epoch clean* (the last epoch that
+ recovery completed) in order to find copies of all objects.
+
+ #. start pushing object data updates to other OSDs that do not yet have them.
+
+ We push these updates from the primary (rather than having the replicas
+ pull them) because this allows the primary to ensure that a replica has
+ the current contents before sending it an update write. It also makes
+ it possible for a single read (from the primary) to be used to write
+ the data to multiple replicas. If each replica did its own pulls,
+ the data might have to be read multiple times.
+
+ #. once all replicas store the all copies of all objects (that
+ existed prior to the start of this epoch) we can update *last
+ epoch clean* in the *PG info*, and we can dismiss all of the
+ *stray* replicas, allowing them to delete their copies of objects
+ for which they are no longer in the *acting set*.
+
+ We could not dismiss the *strays* prior to this because it was possible
+ that one of those *strays* might hold the sole surviving copy of an
+ old object (all of whose copies disappeared before they could be
+ replicated on members of the current *acting set*).
+
+State Model
+-----------
+
+.. graphviz:: peering_graph.generated.dot
diff --git a/doc/dev/perf.rst b/doc/dev/perf.rst
new file mode 100644
index 000000000..57742eec4
--- /dev/null
+++ b/doc/dev/perf.rst
@@ -0,0 +1,55 @@
+Using perf
+==========
+
+Top::
+
+ sudo perf top -p `pidof ceph-osd`
+
+To capture some data with call graphs::
+
+ sudo perf record -p `pidof ceph-osd` -F 99 --call-graph dwarf -- sleep 60
+
+To view by caller (where you can see what each top function calls)::
+
+ sudo perf report --call-graph caller
+
+To view by callee (where you can see who calls each top function)::
+
+ sudo perf report --call-graph callee
+
+:note: If the caller/callee views look the same you may be
+ suffering from a kernel bug; upgrade to 4.8 or later.
+
+Common Issues
+-------------
+
+Ceph use `RelWithDebInfo` as its default `CMAKE_BUILD_TYPE`. Hence `-O2 -g` is
+used to compile the tree in this case. And the `-O2` optimization level
+enables `-fomit-frame-pointer` by default. But this prevents stack profilers
+from accessing the complete stack information. So one can disable this option
+when launching `cmake` ::
+
+ cmake -DCMAKE_CXX_FLAGS="-fno-omit-frame-pointer"
+
+or when building the tree::
+
+ make CMAKE_CXX_FLAGS="-fno-omit-frame-pointer"
+
+
+Flamegraphs
+-----------
+
+First, get things set up::
+
+ cd ~/src
+ git clone https://github.com/brendangregg/FlameGraph
+
+Run ceph, then record some perf data::
+
+ sudo perf record -p `pidof ceph-osd` -F 99 --call-graph dwarf -- sleep 60
+
+Then generate the flamegraph::
+
+ sudo perf script | ~/src/FlameGraph/stackcollapse-perf.pl > /tmp/folded
+ ~/src/FlameGraph/flamegraph.pl /tmp/folded > /tmp/perf.svg
+ firefox /tmp/perf.svg
diff --git a/doc/dev/perf_counters.rst b/doc/dev/perf_counters.rst
new file mode 100644
index 000000000..2f49f772f
--- /dev/null
+++ b/doc/dev/perf_counters.rst
@@ -0,0 +1,198 @@
+===============
+ Perf counters
+===============
+
+The perf counters provide generic internal infrastructure for gauges and counters. The counted values can be both integer and float. There is also an "average" type (normally float) that combines a sum and num counter which can be divided to provide an average.
+
+The intention is that this data will be collected and aggregated by a tool like ``collectd`` or ``statsd`` and fed into a tool like ``graphite`` for graphing and analysis. Also, note the :doc:`../mgr/prometheus`.
+
+Access
+------
+
+The perf counter data is accessed via the admin socket. For example::
+
+ ceph daemon osd.0 perf schema
+ ceph daemon osd.0 perf dump
+
+
+Collections
+-----------
+
+The values are grouped into named collections, normally representing a subsystem or an instance of a subsystem. For example, the internal ``throttle`` mechanism reports statistics on how it is throttling, and each instance is named something like::
+
+
+ throttle-msgr_dispatch_throttler-hbserver
+ throttle-msgr_dispatch_throttler-client
+ throttle-filestore_bytes
+ ...
+
+
+Schema
+------
+
+The ``perf schema`` command dumps a json description of which values are available, and what their type is. Each named value as a ``type`` bitfield, with the following bits defined.
+
++------+-------------------------------------+
+| bit | meaning |
++======+=====================================+
+| 1 | floating point value |
++------+-------------------------------------+
+| 2 | unsigned 64-bit integer value |
++------+-------------------------------------+
+| 4 | average (sum + count pair), where |
++------+-------------------------------------+
+| 8 | counter (vs gauge) |
++------+-------------------------------------+
+
+Every value will have either bit 1 or 2 set to indicate the type
+(float or integer).
+
+If bit 8 is set (counter), the value is monotonically increasing and
+the reader may want to subtract off the previously read value to get
+the delta during the previous interval.
+
+If bit 4 is set (average), there will be two values to read, a sum and
+a count. If it is a counter, the average for the previous interval
+would be sum delta (since the previous read) divided by the count
+delta. Alternatively, dividing the values outright would provide the
+lifetime average value. Normally these are used to measure latencies
+(number of requests and a sum of request latencies), and the average
+for the previous interval is what is interesting.
+
+Instead of interpreting the bit fields, the ``metric type`` has a
+value of either ``guage`` or ``counter``, and the ``value type``
+property will be one of ``real``, ``integer``, ``real-integer-pair``
+(for a sum + real count pair), or ``integer-integer-pair`` (for a
+sum + integer count pair).
+
+Here is an example of the schema output::
+
+ {
+ "throttle-bluestore_throttle_bytes": {
+ "val": {
+ "type": 2,
+ "metric_type": "gauge",
+ "value_type": "integer",
+ "description": "Currently available throttle",
+ "nick": ""
+ },
+ "max": {
+ "type": 2,
+ "metric_type": "gauge",
+ "value_type": "integer",
+ "description": "Max value for throttle",
+ "nick": ""
+ },
+ "get_started": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Number of get calls, increased before wait",
+ "nick": ""
+ },
+ "get": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Gets",
+ "nick": ""
+ },
+ "get_sum": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Got data",
+ "nick": ""
+ },
+ "get_or_fail_fail": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Get blocked during get_or_fail",
+ "nick": ""
+ },
+ "get_or_fail_success": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Successful get during get_or_fail",
+ "nick": ""
+ },
+ "take": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Takes",
+ "nick": ""
+ },
+ "take_sum": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Taken data",
+ "nick": ""
+ },
+ "put": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Puts",
+ "nick": ""
+ },
+ "put_sum": {
+ "type": 10,
+ "metric_type": "counter",
+ "value_type": "integer",
+ "description": "Put data",
+ "nick": ""
+ },
+ "wait": {
+ "type": 5,
+ "metric_type": "gauge",
+ "value_type": "real-integer-pair",
+ "description": "Waiting latency",
+ "nick": ""
+ }
+ }
+
+
+Dump
+----
+
+The actual dump is similar to the schema, except that average values are grouped. For example::
+
+ {
+ "throttle-msgr_dispatch_throttler-hbserver" : {
+ "get_or_fail_fail" : 0,
+ "get_sum" : 0,
+ "max" : 104857600,
+ "put" : 0,
+ "val" : 0,
+ "take" : 0,
+ "get_or_fail_success" : 0,
+ "wait" : {
+ "avgcount" : 0,
+ "sum" : 0
+ },
+ "get" : 0,
+ "take_sum" : 0,
+ "put_sum" : 0
+ },
+ "throttle-msgr_dispatch_throttler-client" : {
+ "get_or_fail_fail" : 0,
+ "get_sum" : 82760,
+ "max" : 104857600,
+ "put" : 2637,
+ "val" : 0,
+ "take" : 0,
+ "get_or_fail_success" : 0,
+ "wait" : {
+ "avgcount" : 0,
+ "sum" : 0
+ },
+ "get" : 2637,
+ "take_sum" : 0,
+ "put_sum" : 82760
+ }
+ }
+
diff --git a/doc/dev/perf_histograms.rst b/doc/dev/perf_histograms.rst
new file mode 100644
index 000000000..c277ac209
--- /dev/null
+++ b/doc/dev/perf_histograms.rst
@@ -0,0 +1,677 @@
+=================
+ Perf histograms
+=================
+
+The perf histograms build on perf counters infrastructure. Histograms are built for a number of counters and simplify gathering data on which groups of counter values occur most often over time.
+Perf histograms are currently unsigned 64-bit integer counters, so they're mostly useful for time and sizes. Data dumped by perf histogram can then be feed into other analysis tools/scripts.
+
+Access
+------
+
+The perf histogram data are accessed via the admin socket. For example::
+
+ ceph daemon osd.0 perf histogram schema
+ ceph daemon osd.0 perf histogram dump
+
+
+Collections
+-----------
+
+The histograms are grouped into named collections, normally representing a subsystem or an instance of a subsystem. For example, the internal ``throttle`` mechanism reports statistics on how it is throttling, and each instance is named something like::
+
+
+ op_r_latency_out_bytes_histogram
+ op_rw_latency_in_bytes_histogram
+ op_rw_latency_out_bytes_histogram
+ ...
+
+
+Schema
+------
+
+The ``perf histogram schema`` command dumps a json description of which values are available, and what their type is. Each named value as a ``type`` bitfield, with the 5-th bit always set and following bits defined.
+
++------+-------------------------------------+
+| bit | meaning |
++======+=====================================+
+| 1 | floating point value |
++------+-------------------------------------+
+| 2 | unsigned 64-bit integer value |
++------+-------------------------------------+
+| 4 | average (sum + count pair) |
++------+-------------------------------------+
+| 8 | counter (vs gauge) |
++------+-------------------------------------+
+
+In other words, histogram of type "18" is a histogram of unsigned 64-bit integer values (16 + 2).
+
+Here is an example of the schema output::
+
+ {
+ "AsyncMessenger::Worker-0": {},
+ "AsyncMessenger::Worker-1": {},
+ "AsyncMessenger::Worker-2": {},
+ "mutex-WBThrottle::lock": {},
+ "objecter": {},
+ "osd": {
+ "op_r_latency_out_bytes_histogram": {
+ "type": 18,
+ "description": "Histogram of operation latency (including queue time) + da ta read",
+ "nick": ""
+ },
+ "op_w_latency_in_bytes_histogram": {
+ "type": 18,
+ "description": "Histogram of operation latency (including queue time) + da ta written",
+ "nick": ""
+ },
+ "op_rw_latency_in_bytes_histogram": {
+ "type": 18,
+ "description": "Histogram of rw operation latency (including queue time) + data written",
+ "nick": ""
+ },
+ "op_rw_latency_out_bytes_histogram": {
+ "type": 18,
+ "description": "Histogram of rw operation latency (including queue time) + data read",
+ "nick": ""
+ }
+ }
+ }
+
+
+Dump
+----
+
+The actual dump is similar to the schema, except that there are actual value groups. For example::
+
+ "osd": {
+ "op_r_latency_out_bytes_histogram": {
+ "axes": [
+ {
+ "name": "Latency (usec)",
+ "min": 0,
+ "quant_size": 100000,
+ "buckets": 32,
+ "scale_type": "log2",
+ "ranges": [
+ {
+ "max": -1
+ },
+ {
+ "min": 0,
+ "max": 99999
+ },
+ {
+ "min": 100000,
+ "max": 199999
+ },
+ {
+ "min": 200000,
+ "max": 399999
+ },
+ {
+ "min": 400000,
+ "max": 799999
+ },
+ {
+ "min": 800000,
+ "max": 1599999
+ },
+ {
+ "min": 1600000,
+ "max": 3199999
+ },
+ {
+ "min": 3200000,
+ "max": 6399999
+ },
+ {
+ "min": 6400000,
+ "max": 12799999
+ },
+ {
+ "min": 12800000,
+ "max": 25599999
+ },
+ {
+ "min": 25600000,
+ "max": 51199999
+ },
+ {
+ "min": 51200000,
+ "max": 102399999
+ },
+ {
+ "min": 102400000,
+ "max": 204799999
+ },
+ {
+ "min": 204800000,
+ "max": 409599999
+ },
+ {
+ "min": 409600000,
+ "max": 819199999
+ },
+ {
+ "min": 819200000,
+ "max": 1638399999
+ },
+ {
+ "min": 1638400000,
+ "max": 3276799999
+ },
+ {
+ "min": 3276800000,
+ "max": 6553599999
+ },
+ {
+ "min": 6553600000,
+ "max": 13107199999
+ },
+ {
+ "min": 13107200000,
+ "max": 26214399999
+ },
+ {
+ "min": 26214400000,
+ "max": 52428799999
+ },
+ {
+ "min": 52428800000,
+ "max": 104857599999
+ },
+ {
+ "min": 104857600000,
+ "max": 209715199999
+ },
+ {
+ "min": 209715200000,
+ "max": 419430399999
+ },
+ {
+ "min": 419430400000,
+ "max": 838860799999
+ },
+ {
+ "min": 838860800000,
+ "max": 1677721599999
+ },
+ {
+ "min": 1677721600000,
+ "max": 3355443199999
+ },
+ {
+ "min": 3355443200000,
+ "max": 6710886399999
+ },
+ {
+ "min": 6710886400000,
+ "max": 13421772799999
+ },
+ {
+ "min": 13421772800000,
+ "max": 26843545599999
+ },
+ {
+ "min": 26843545600000,
+ "max": 53687091199999
+ },
+ },
+ {
+ "min": 53687091200000
+ }
+ ]
+ },
+ {
+ "name": "Request size (bytes)",
+ "min": 0,
+ "quant_size": 512,
+ "buckets": 32,
+ "scale_type": "log2",
+ "ranges": [
+ {
+ "max": -1
+ },
+ {
+ "min": 0,
+ "max": 511
+ },
+ {
+ "min": 512,
+ "max": 1023
+ },
+ {
+ "min": 1024,
+ "max": 2047
+ },
+ {
+ "min": 2048,
+ "max": 4095
+ },
+ {
+ "min": 4096,
+ "max": 8191
+ },
+ {
+ "min": 8192,
+ "max": 16383
+ },
+ {
+ "min": 16384,
+ "max": 32767
+ },
+ {
+ "min": 32768,
+ "max": 65535
+ },
+ {
+ "min": 65536,
+ "max": 131071
+ },
+ {
+ "min": 131072,
+ "max": 262143
+ },
+ {
+ "min": 262144,
+ "max": 524287
+ },
+ {
+ "min": 524288,
+ "max": 1048575
+ },
+ {
+ "min": 1048576,
+ "max": 2097151
+ },
+ {
+ "min": 2097152,
+ "max": 4194303
+ },
+ {
+ "min": 4194304,
+ "max": 8388607
+ },
+ {
+ "min": 8388608,
+ "max": 16777215
+ },
+ {
+ "min": 16777216,
+ "max": 33554431
+ },
+ {
+ "min": 33554432,
+ "max": 67108863
+ },
+ {
+ "min": 67108864,
+ "max": 134217727
+ },
+ {
+ "min": 134217728,
+ "max": 268435455
+ },
+ {
+ "min": 268435456,
+ "max": 536870911
+ },
+ {
+ "min": 536870912,
+ "max": 1073741823
+ },
+ {
+ "min": 1073741824,
+ "max": 2147483647
+ },
+ {
+ "min": 2147483648,
+ "max": 4294967295
+ },
+ {
+ "min": 4294967296,
+ "max": 8589934591
+ },
+ {
+ "min": 8589934592,
+ "max": 17179869183
+ },
+ {
+ "min": 17179869184,
+ "max": 34359738367
+ },
+ {
+ "min": 34359738368,
+ "max": 68719476735
+ },
+ {
+ "min": 68719476736,
+ "max": 137438953471
+ },
+ {
+ "min": 137438953472,
+ "max": 274877906943
+ },
+ {
+ "min": 274877906944
+ }
+ ]
+ }
+ ],
+ "values": [
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ],
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0
+ ]
+ ]
+ }
+ },
+
+This represents the 2d histogram, consisting of 9 history entrires and 32 value groups per each history entry.
+"Ranges" element denote value bounds for each of value groups. "Buckets" denote amount of value groups ("buckets"),
+"Min" is a minimum accepted valaue, "quant_size" is quantization unit and "scale_type" is either "log2" (logarhitmic
+scale) or "linear" (linear scale).
+You can use histogram_dump.py tool (see src/tools/histogram_dump.py) for quick visualisation of existing histogram
+data.
diff --git a/doc/dev/placement-group.rst b/doc/dev/placement-group.rst
new file mode 100644
index 000000000..e29be2fa6
--- /dev/null
+++ b/doc/dev/placement-group.rst
@@ -0,0 +1,210 @@
+============================
+ PG (Placement Group) notes
+============================
+
+Miscellaneous copy-pastes from emails, when this gets cleaned up it
+should move out of /dev.
+
+Overview
+========
+
+PG = "placement group". When placing data in the cluster, objects are
+mapped into PGs, and those PGs are mapped onto OSDs. We use the
+indirection so that we can group objects, which reduces the amount of
+per-object metadata we need to keep track of and processes we need to
+run (it would be prohibitively expensive to track eg the placement
+history on a per-object basis). Increasing the number of PGs can
+reduce the variance in per-OSD load across your cluster, but each PG
+requires a bit more CPU and memory on the OSDs that are storing it. We
+try and ballpark it at 100 PGs/OSD, although it can vary widely
+without ill effects depending on your cluster. You hit a bug in how we
+calculate the initial PG number from a cluster description.
+
+There are a couple of different categories of PGs; the 6 that exist
+(in the original emailer's ``ceph -s`` output) are "local" PGs which
+are tied to a specific OSD. However, those aren't actually used in a
+standard Ceph configuration.
+
+
+Mapping algorithm (simplified)
+==============================
+
+| > How does the Object->PG mapping look like, do you map more than one object on
+| > one PG, or do you sometimes map an object to more than one PG? How about the
+| > mapping of PGs to OSDs, does one PG belong to exactly one OSD?
+| >
+| > Does one PG represent a fixed amount of storage space?
+
+Many objects map to one PG.
+
+Each object maps to exactly one PG.
+
+One PG maps to a single list of OSDs, where the first one in the list
+is the primary and the rest are replicas.
+
+Many PGs can map to one OSD.
+
+A PG represents nothing but a grouping of objects; you configure the
+number of PGs you want, number of OSDs * 100 is a good starting point
+, and all of your stored objects are pseudo-randomly evenly distributed
+to the PGs. So a PG explicitly does NOT represent a fixed amount of
+storage; it represents 1/pg_num'th of the storage you happen to have
+on your OSDs.
+
+Ignoring the finer points of CRUSH and custom placement, it goes
+something like this in pseudocode::
+
+ locator = object_name
+ obj_hash = hash(locator)
+ pg = obj_hash % num_pg
+ OSDs_for_pg = crush(pg) # returns a list of OSDs
+ primary = osds_for_pg[0]
+ replicas = osds_for_pg[1:]
+
+If you want to understand the crush() part in the above, imagine a
+perfectly spherical datacenter in a vacuum ;) that is, if all OSDs
+have weight 1.0, and there is no topology to the data center (all OSDs
+are on the top level), and you use defaults, etc, it simplifies to
+consistent hashing; you can think of it as::
+
+ def crush(pg):
+ all_osds = ['osd.0', 'osd.1', 'osd.2', ...]
+ result = []
+ # size is the number of copies; primary+replicas
+ while len(result) < size:
+ r = hash(pg)
+ chosen = all_osds[ r % len(all_osds) ]
+ if chosen in result:
+ # OSD can be picked only once
+ continue
+ result.append(chosen)
+ return result
+
+User-visible PG States
+======================
+
+.. todo:: diagram of states and how they can overlap
+
+*creating*
+ the PG is still being created
+
+*active*
+ requests to the PG will be processed
+
+*clean*
+ all objects in the PG are replicated the correct number of times
+
+*down*
+ a replica with necessary data is down, so the pg is offline
+
+*recovery_unfound*
+ recovery could not finish because object(s) are unfound.
+
+*backfill_unfound*
+ backfill could not finish because object(s) are unfound.
+
+*premerge*
+ the PG is in a quiesced-IO state due to an impending PG merge. That
+ happens when pg_num_pending < pg_num, and applies to the PGs with
+ pg_num_pending <= ps < pg_num as well as the corresponding peer PG
+ that it is merging with.
+
+*scrubbing*
+ the PG is being checked for inconsistencies
+
+*degraded*
+ some objects in the PG are not replicated enough times yet
+
+*inconsistent*
+ replicas of the PG are not consistent (e.g. objects are
+ the wrong size, objects are missing from one replica *after* recovery
+ finished, etc.)
+
+*peering*
+ the PG is undergoing the :doc:`/dev/peering` process
+
+*repair*
+ the PG is being checked and any inconsistencies found will be repaired (if possible)
+
+*recovering*
+ objects are being migrated/synchronized with replicas
+
+*backfill_wait*
+ the PG is waiting in line to start backfill
+
+*incomplete*
+ a pg is missing a necessary period of history from its
+ log. If you see this state, report a bug, and try to start any
+ failed OSDs that may contain the needed information.
+
+*stale*
+ the PG is in an unknown state - the monitors have not received
+ an update for it since the PG mapping changed.
+
+*remapped*
+ the PG is temporarily mapped to a different set of OSDs from what
+ CRUSH specified
+
+*deep*
+ In conjunction with *scrubbing* the scrub is a deep scrub
+
+*backfilling*
+ a special case of recovery, in which the entire contents of
+ the PG are scanned and synchronized, instead of inferring what
+ needs to be transferred from the PG logs of recent operations
+
+*backfill_toofull*
+ backfill reservation rejected, OSD too full
+
+*recovery_wait*
+ the PG is waiting for the local/remote recovery reservations
+
+*undersized*
+ the PG can't select enough OSDs given its size
+
+*activating*
+ the PG is peered but not yet active
+
+*peered*
+ the PG peered but can't go active
+
+*snaptrim*
+ the PG is trimming snaps
+
+*snaptrim_wait*
+ the PG is queued to trim snaps
+
+*recovery_toofull*
+ recovery reservation rejected, OSD too full
+
+*snaptrim_error*
+ the PG could not complete snap trimming due to errors
+
+*forced_recovery*
+ the PG has been marked for highest priority recovery
+
+*forced_backfill*
+ the PG has been marked for highest priority backfill
+
+*failed_repair*
+ an attempt to repair the PG has failed. Manual intervention is required.
+
+
+OMAP STATISTICS
+===============
+
+Omap statistics are gathered during deep scrub and displayed in the output of
+the following commands::
+
+ ceph pg dump
+ ceph pg dump all
+ ceph pg dump summary
+ ceph pg dump pgs
+ ceph pg dump pools
+ ceph pg ls
+
+As these statistics are not updated continuously they may be quite inaccurate in
+an environment where deep scrubs are run infrequently and/or there is a lot of
+omap activity. As such they should not be relied on for exact accuracy but
+rather used as a guide. Running a deep scrub and checking these statistics
+immediately afterwards should give a good indication of current omap usage.
diff --git a/doc/dev/quick_guide.rst b/doc/dev/quick_guide.rst
new file mode 100644
index 000000000..8e27dfb4b
--- /dev/null
+++ b/doc/dev/quick_guide.rst
@@ -0,0 +1,140 @@
+=================================
+ Developer Guide (Quick)
+=================================
+
+This guide will describe how to build and test Ceph for development.
+
+Development
+-----------
+
+The ``run-make-check.sh`` script will install Ceph dependencies,
+compile everything in debug mode and run a number of tests to verify
+the result behaves as expected.
+
+.. prompt:: bash $
+
+ ./run-make-check.sh
+
+Optionally if you want to work on a specific component of Ceph,
+install the dependencies and build Ceph in debug mode with required cmake flags.
+
+Example:
+
+.. prompt:: bash $
+
+ ./install-deps.sh
+ ./do_cmake.sh -DWITH_MANPAGE=OFF -DWITH_BABELTRACE=OFF -DWITH_MGR_DASHBOARD_FRONTEND=OFF
+
+Running a development deployment
+--------------------------------
+Ceph contains a script called ``vstart.sh`` (see also :doc:`/dev/dev_cluster_deployement`) which allows developers to quickly test their code using
+a simple deployment on your development system. Once the build finishes successfully, start the ceph
+deployment using the following command:
+
+.. prompt:: bash $
+
+ cd ceph/build # Assuming this is where you ran cmake
+ make vstart
+ ../src/vstart.sh -d -n -x
+
+You can also configure ``vstart.sh`` to use only one monitor and one metadata server by using the following:
+
+.. prompt:: bash $
+
+ MON=1 MDS=1 ../src/vstart.sh -d -n -x
+
+The system creates two pools on startup: `cephfs_data_a` and `cephfs_metadata_a`. Let's get some stats on
+the current pools:
+
+.. code-block:: console
+
+ $ bin/ceph osd pool stats
+ *** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH ***
+ pool cephfs_data_a id 1
+ nothing is going on
+
+ pool cephfs_metadata_a id 2
+ nothing is going on
+
+ $ bin/ceph osd pool stats cephfs_data_a
+ *** DEVELOPER MODE: setting PATH, PYTHONPATH and LD_LIBRARY_PATH ***
+ pool cephfs_data_a id 1
+ nothing is going on
+
+ $ bin/rados df
+ POOL_NAME USED OBJECTS CLONES COPIES MISSING_ON_PRIMARY UNFOUND DEGRADED RD_OPS RD WR_OPS WR
+ cephfs_data_a 0 0 0 0 0 0 0 0 0 0 0
+ cephfs_metadata_a 2246 21 0 63 0 0 0 0 0 42 8192
+
+ total_objects 21
+ total_used 244G
+ total_space 1180G
+
+
+Make a pool and run some benchmarks against it:
+
+.. prompt:: bash $
+
+ bin/ceph osd pool create mypool
+ bin/rados -p mypool bench 10 write -b 123
+
+Place a file into the new pool:
+
+.. prompt:: bash $
+
+ bin/rados -p mypool put objectone <somefile>
+ bin/rados -p mypool put objecttwo <anotherfile>
+
+List the objects in the pool:
+
+.. prompt:: bash $
+
+ bin/rados -p mypool ls
+
+Once you are done, type the following to stop the development ceph deployment:
+
+.. prompt:: bash $
+
+ ../src/stop.sh
+
+Resetting your vstart environment
+---------------------------------
+
+The vstart script creates out/ and dev/ directories which contain
+the cluster's state. If you want to quickly reset your environment,
+you might do something like this:
+
+.. prompt:: bash [build]$
+
+ ../src/stop.sh
+ rm -rf out dev
+ MDS=1 MON=1 OSD=3 ../src/vstart.sh -n -d
+
+Running a RadosGW development environment
+-----------------------------------------
+
+Set the ``RGW`` environment variable when running vstart.sh to enable the RadosGW.
+
+.. prompt:: bash $
+
+ cd build
+ RGW=1 ../src/vstart.sh -d -n -x
+
+You can now use the swift python client to communicate with the RadosGW.
+
+.. prompt:: bash $
+
+ swift -A http://localhost:8000/auth -U test:tester -K testing list
+ swift -A http://localhost:8000/auth -U test:tester -K testing upload mycontainer ceph
+ swift -A http://localhost:8000/auth -U test:tester -K testing list
+
+
+Run unit tests
+--------------
+
+The tests are located in `src/tests`. To run them type:
+
+.. prompt:: bash $
+
+ make check
+
diff --git a/doc/dev/rados-client-protocol.rst b/doc/dev/rados-client-protocol.rst
new file mode 100644
index 000000000..920c65f39
--- /dev/null
+++ b/doc/dev/rados-client-protocol.rst
@@ -0,0 +1,117 @@
+RADOS client protocol
+=====================
+
+This is very incomplete, but one must start somewhere.
+
+Basics
+------
+
+Requests are MOSDOp messages. Replies are MOSDOpReply messages.
+
+An object request is targeted at an hobject_t, which includes a pool,
+hash value, object name, placement key (usually empty), and snapid.
+
+The hash value is a 32-bit hash value, normally generated by hashing
+the object name. The hobject_t can be arbitrarily constructed,
+though, with any hash value and name. Note that in the MOSDOp these
+components are spread across several fields and not logically
+assembled in an actual hobject_t member (mainly historical reasons).
+
+A request can also target a PG. In this case, the *ps* value matches
+a specific PG, the object name is empty, and (hopefully) the ops in
+the request are PG ops.
+
+Either way, the request ultimately targets a PG, either by using the
+explicit pgid or by folding the hash value onto the current number of
+pgs in the pool. The client sends the request to the primary for the
+associated PG.
+
+Each request is assigned a unique tid.
+
+Resends
+-------
+
+If there is a connection drop, the client will resend any outstanding
+requests.
+
+Any time there is a PG mapping change such that the primary changes,
+the client is responsible for resending the request. Note that
+although there may be an interval change from the OSD's perspective
+(triggering PG peering), if the primary doesn't change then the client
+need not resend.
+
+There are a few exceptions to this rule:
+
+ * There is a last_force_op_resend field in the pg_pool_t in the
+ OSDMap. If this changes, then the clients are forced to resend any
+ outstanding requests. (This happens when tiering is adjusted, for
+ example.)
+ * Some requests are such that they are resent on *any* PG interval
+ change, as defined by pg_interval_t's is_new_interval() (the same
+ criteria used by peering in the OSD).
+ * If the PAUSE OSDMap flag is set and unset.
+
+Each time a request is sent to the OSD the *attempt* field is incremented. The
+first time it is 0, the next 1, etc.
+
+Backoff
+-------
+
+Ordinarily the OSD will simply queue any requests it can't immediately
+process in memory until such time as it can. This can become
+problematic because the OSD limits the total amount of RAM consumed by
+incoming messages: if either of the thresholds for the number of
+messages or the number of bytes is reached, new messages will not be
+read off the network socket, causing backpressure through the network.
+
+In some cases, though, the OSD knows or expects that a PG or object
+will be unavailable for some time and does not want to consume memory
+by queuing requests. In these cases it can send a MOSDBackoff message
+to the client.
+
+A backoff request has four properties:
+
+#. the op code (block, unblock, or ack-block)
+#. *id*, a unique id assigned within this session
+#. hobject_t begin
+#. hobject_t end
+
+There are two types of backoff: a *PG* backoff will plug all requests
+targeting an entire PG at the client, as described by a range of the
+hash/hobject_t space [begin,end), while an *object* backoff will plug
+all requests targeting a single object (begin == end).
+
+When the client receives a *block* backoff message, it is now
+responsible for *not* sending any requests for hobject_ts described by
+the backoff. The backoff remains in effect until the backoff is
+cleared (via an 'unblock' message) or the OSD session is closed. A
+*ack_block* message is sent back to the OSD immediately to acknowledge
+receipt of the backoff.
+
+When an unblock is
+received, it will reference a specific id that the client previous had
+blocked. However, the range described by the unblock may be smaller
+than the original range, as the PG may have split on the OSD. The unblock
+should *only* unblock the range specified in the unblock message. Any requests
+that fall within the unblock request range are reexamined and, if no other
+installed backoff applies, resent.
+
+On the OSD, Backoffs are also tracked across ranges of the hash space, and
+exist in three states:
+
+#. new
+#. acked
+#. deleting
+
+A newly installed backoff is set to *new* and a message is sent to the
+client. When the *ack-block* message is received it is changed to the
+*acked* state. The OSD may process other messages from the client that
+are covered by the backoff in the *new* state, but once the backoff is
+*acked* it should never see a blocked request unless there is a bug.
+
+If the OSD wants to a remove a backoff in the *acked* state it can
+simply remove it and notify the client. If the backoff is in the
+*new* state it must move it to the *deleting* state and continue to
+use it to discard client requests until the *ack-block* message is
+received, at which point it can finally be removed. This is necessary to
+preserve the order of operations processed by the OSD.
diff --git a/doc/dev/radosgw/admin/adminops_nonimplemented.rst b/doc/dev/radosgw/admin/adminops_nonimplemented.rst
new file mode 100644
index 000000000..e579bd5aa
--- /dev/null
+++ b/doc/dev/radosgw/admin/adminops_nonimplemented.rst
@@ -0,0 +1,495 @@
+==================
+ Admin Operations
+==================
+
+An admin API request will be done on a URI that starts with the configurable 'admin'
+resource entry point. Authorization for the admin API duplicates the S3 authorization
+mechanism. Some operations require that the user holds special administrative capabilities.
+The response entity type (XML or JSON) may be specified as the 'format' option in the
+request and defaults to JSON if not specified.
+
+Get Object
+==========
+
+Get an existing object. NOTE: Does not require owner to be non-suspended.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/bucket?object&format=json HTTP/1.1
+ Host {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket containing the object to be retrieved.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``object``
+
+:Description: The object to be retrieved.
+:Type: String
+:Example: ``foo.txt``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, returns the desired object.
+
+``object``
+
+:Description: The desired object.
+:Type: Object
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``NoSuchObject``
+
+:Description: Specified object does not exist.
+:Code: 404 Not Found
+
+Head Object
+===========
+
+Verify the existence of an object. If the object exists,
+metadata headers for the object will be returned.
+
+Syntax
+~~~~~~
+
+::
+
+ HEAD /{admin}/bucket?object HTTP/1.1
+ Host {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``bucket``
+
+:Description: The bucket containing the object to be retrieved.
+:Type: String
+:Example: ``foo_bucket``
+:Required: Yes
+
+``object``
+
+:Description: The object to be retrieved.
+:Type: String
+:Example: ``foo.txt``
+:Required: Yes
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+None.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``NoSuchObject``
+
+:Description: Specified object does not exist.
+:Code: 404 Not Found
+
+Get Zone Info
+=============
+
+Get cluster information.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/zone&format=json HTTP/1.1
+ Host {fqdn}
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, returns cluster pool configuration.
+
+``zone``
+
+:Description: Contains current cluster pool configuration.
+:Type: Container
+
+``domain_root``
+
+:Description: root of all buckets.
+:Type: String
+:Parent: ``cluster``
+
+``control_pool``
+
+:Description:
+:Type: String
+:Parent: ``cluster``
+
+``gc_pool``
+
+:Description: Garbage collection pool.
+:Type: String
+:Parent: ``cluster``
+
+``log_pool``
+
+:Description: Log pool.
+:Type: String
+:Parent: ``cluster``
+
+``intent_log_pool``
+
+:Description: Intent log pool.
+:Type: String
+:Parent: ``cluster``
+
+``usage_log_pool``
+
+:Description: Usage log pool.
+:Type: String
+:Parent: ``cluster``
+
+``user_keys_pool``
+
+:Description: User key pool.
+:Type: String
+:Parent: ``cluster``
+
+``user_email_pool``
+
+:Description: User email pool.
+:Type: String
+:Parent: ``cluster``
+
+``user_swift_pool``
+
+:Description: Pool of swift users.
+:Type: String
+:Parent: ``cluster``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+None.
+
+Example Response
+~~~~~~~~~~~~~~~~
+
+::
+
+ HTTP/1.1 200
+ Content-Type: application/json
+
+ {
+ "domain_root": ".rgw",
+ "control_pool": ".rgw.control",
+ "gc_pool": ".rgw.gc",
+ "log_pool": ".log",
+ "intent_log_pool": ".intent-log",
+ "usage_log_pool": ".usage",
+ "user_keys_pool": ".users",
+ "user_email_pool": ".users.email",
+ "user_swift_pool": ".users.swift",
+ "user_uid_pool ": ".users.uid"
+ }
+
+
+
+Add Placement Pool
+==================
+
+Make a pool available for data placement.
+
+Syntax
+~~~~~~
+
+::
+
+ PUT /{admin}/pool?format=json HTTP/1.1
+ Host {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``pool``
+
+:Description: The pool to be made available for data placement.
+:Type: String
+:Example: ``foo_pool``
+:Required: Yes
+
+``create``
+
+:Description: Creates the data pool if it does not exist.
+:Type: Boolean
+:Example: False [False]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+TBD.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+TBD.
+
+Remove Placement Pool
+=====================
+
+Make a pool unavailable for data placement.
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/pool?format=json HTTP/1.1
+ Host {fqdn}
+
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``pool``
+
+:Description: The existing pool to be made available for data placement.
+:Type: String
+:Example: ``foo_pool``
+:Required: Yes
+
+``destroy``
+
+:Description: Destroys the pool after removing it from the active set.
+:Type: Boolean
+:Example: False [False]
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+TBD.
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+TBD.
+
+List Available Data Placement Pools
+===================================
+
+List current pools available for data placement.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/pool?format=json HTTP/1.1
+ Host {fqdn}
+
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If successful, returns a list of pools available for data placement.
+
+``pools``
+
+:Description: Contains currently available pools for data placement.
+:Type: Container
+
+
+
+List Expired Garbage Collection Items
+=====================================
+
+List objects scheduled for garbage collection.
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/garbage?format=json HTTP/1.1
+ Host {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+None.
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If expired garbage collection items exist, a list of such objects
+will be returned.
+
+``garbage``
+
+:Description: Expired garbage collection items.
+:Type: Container
+
+``object``
+
+:Description: A container garbage collection object information.
+:Type: Container
+:Parent: ``garbage``
+
+``name``
+
+:Description: The name of the object.
+:Type: String
+:Parent: ``object``
+
+``expired``
+
+:Description: The date at which the object expired.
+:Type: String
+:Parent: ``object``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+TBD.
+
+Manually Processes Garbage Collection Items
+===========================================
+
+List objects scheduled for garbage collection.
+
+Syntax
+~~~~~~
+
+::
+
+ DELETE /{admin}/garbage?format=json HTTP/1.1
+ Host {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+None.
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If expired garbage collection items exist, a list of removed objects
+will be returned.
+
+``garbage``
+
+:Description: Expired garbage collection items.
+:Type: Container
+
+``object``
+
+:Description: A container garbage collection object information.
+:Type: Container
+:Parent: ``garbage``
+
+``name``
+
+:Description: The name of the object.
+:Type: String
+:Parent: ``object``
+
+``expired``
+
+:Description: The date at which the object expired.
+:Type: String
+:Parent: ``object``
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+TBD.
+
+Show Log Objects
+================
+
+Show log objects
+
+Syntax
+~~~~~~
+
+::
+
+ GET /{admin}/log?format=json HTTP/1.1
+ Host {fqdn}
+
+Request Parameters
+~~~~~~~~~~~~~~~~~~
+
+``object``
+
+:Description: The log object to return.
+:Type: String:
+:Example: ``2012-10-11-09-4165.2-foo_bucket``
+:Required: No
+
+Response Entities
+~~~~~~~~~~~~~~~~~
+
+If no object is specified, returns the full list of log objects.
+
+``log-objects``
+
+:Description: A list of log objects.
+:Type: Container
+
+``object``
+
+:Description: The name of the log object.
+:Type: String
+
+``log``
+
+:Description: The contents of the log object.
+:Type: Container
+
+Special Error Responses
+~~~~~~~~~~~~~~~~~~~~~~~
+
+None.
+
+Standard Error Responses
+========================
+
+``AccessDenied``
+
+:Description: Access denied.
+:Code: 403 Forbidden
+
+``InternalError``
+
+:Description: Internal server error.
+:Code: 500 Internal Server Error
+
+``NoSuchUser``
+
+:Description: User does not exist.
+:Code: 404 Not Found
+
+``NoSuchBucket``
+
+:Description: Bucket does not exist.
+:Code: 404 Not Found
+
+``NoSuchKey``
+
+:Description: No such access key.
+:Code: 404 Not Found
diff --git a/doc/dev/radosgw/index.rst b/doc/dev/radosgw/index.rst
new file mode 100644
index 000000000..5f77609d2
--- /dev/null
+++ b/doc/dev/radosgw/index.rst
@@ -0,0 +1,13 @@
+=======================================
+ RADOS Gateway developer documentation
+=======================================
+
+.. rubric:: Contents
+
+.. toctree::
+ :maxdepth: 1
+
+
+ usage
+ Admin Ops Nonimplemented <admin/adminops_nonimplemented>
+ s3_compliance
diff --git a/doc/dev/radosgw/s3_compliance.rst b/doc/dev/radosgw/s3_compliance.rst
new file mode 100644
index 000000000..50aeda36a
--- /dev/null
+++ b/doc/dev/radosgw/s3_compliance.rst
@@ -0,0 +1,304 @@
+===============================
+Rados Gateway S3 API Compliance
+===============================
+
+.. warning::
+ This document is a draft, it might not be accurate
+
+----------------------
+Naming code reference
+----------------------
+
+Here comes a BNF definition on how to name a feature in the code for referencing purpose : ::
+
+ name ::= request_type "_" ( header | operation ) ( "_" header_option )?
+
+ request_type ::= "req" | "res"
+
+ header ::= string
+
+ operation ::= method resource
+
+ method ::= "GET" | "PUT" | "POST" | "DELETE" | "OPTIONS" | "HEAD"
+
+ resource ::= string
+
+ header_option ::= string
+
+----------------------
+Common Request Headers
+----------------------
+
+S3 Documentation reference : http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html
+
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Header | Supported? | Code Links | Tests links |
++======================+============+=========================================================================================================+=============+
+| Authorization | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1962 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L2051 | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Content-Length | Yes | | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Content-Type | Yes | | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Content-MD5 | Yes | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1249 | |
+| | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1306 | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Date | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_auth_s3.cc#L164 | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Expect | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest.cc#L1227 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L802 | |
+| | | https://github.com/ceph/ceph/blob/76040d90f7eb9f9921a3b8dcd0f821ac2cd9c492/src/rgw/rgw_main.cc#L372 | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Host | ? | | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| x-amz-date | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_auth_s3.cc#L169 | |
+| | | should take precedence over DATE as mentioned here -> | |
+| | | http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonRequestHeaders.html | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| x-amz-security-token | No | | |
++----------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+
+-----------------------
+Common Response Headers
+-----------------------
+
+S3 Documentation reference : http://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html
+
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Header | Supported? | Code Links | Tests links |
++=====================+============+=========================================================================================================+=============+
+| Content-Length | Yes | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Connection | ? | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Date | ? | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| ETag | Yes | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1312 | |
+| | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1436 | |
+| | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L2222 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L118 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L268 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L516 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1336 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1486 | |
+| | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1548 | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Server | No | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| x-amz-delete-marker | No | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| x-amz-id-2 | No | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| x-amz-request-id | Yes | https://github.com/ceph/ceph/commit/b711e3124f8f73c17ebd19b38807a1b77f201e44 | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| x-amz-version-id | No | | |
++---------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+
+-------------------------
+Operations on the Service
+-------------------------
+
+S3 Documentation reference : http://docs.aws.amazon.com/AmazonS3/latest/API/RESTServiceOps.html
+
++------+-----------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Type | Operation | Supported? | Code links | Tests links |
++======+===========+============+=========================================================================================================+=============+
+| GET | Service | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L2094 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1676 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L185 | |
++------+-----------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+
+---------------------
+Operations on Buckets
+---------------------
+
+S3 Documentation reference : http://docs.aws.amazon.com/AmazonS3/latest/API/RESTBucketOps.html
+
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| Type | Operation | Supported? | Code links | Tests links |
++========+========================+============+============================================================================================================+=============+
+| DELETE | Bucket | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1728 | |
+| | | | https://github.com/ceph/ceph/blob/e91042171939b6bf82a56a1015c5cae792d228ad/src/rgw/rgw_rest_bucket.cc#L250 | |
+| | | | https://github.com/ceph/ceph/blob/e91042171939b6bf82a56a1015c5cae792d228ad/src/rgw/rgw_rest_bucket.cc#L212 | |
+| | | | https://github.com/ceph/ceph/blob/25948319c4d256c4aeb0137eb88947e54d14cc79/src/rgw/rgw_bucket.cc#L856 | |
+| | | | https://github.com/ceph/ceph/blob/25948319c4d256c4aeb0137eb88947e54d14cc79/src/rgw/rgw_bucket.cc#L513 | |
+| | | | https://github.com/ceph/ceph/blob/25948319c4d256c4aeb0137eb88947e54d14cc79/src/rgw/rgw_bucket.cc#L286 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L461 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| DELETE | Bucket cors | ? | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1731 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1916 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| DELETE | Bucket lifecycle | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| DELETE | Bucket policy | ? | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| DELETE | Bucket tagging | ? | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| DELETE | Bucket website | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1676 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L185 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket acl | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1697 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1728 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1344 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket cors | ? | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1698 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1845 | |
+| | | | https://github.com/ceph/ceph/blob/76040d90f7eb9f9921a3b8dcd0f821ac2cd9c492/src/rgw/rgw_main.cc#L345 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket lifecycle | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket location | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket policy | ? | https://github.com/ceph/ceph/blob/e91042171939b6bf82a56a1015c5cae792d228ad/src/rgw/rgw_rest_bucket.cc#L232 | |
+| | | | https://github.com/ceph/ceph/blob/e91042171939b6bf82a56a1015c5cae792d228ad/src/rgw/rgw_rest_bucket.cc#L58 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket logging | ? | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1695 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L287 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket notification | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket tagging | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket Object versions | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket requestPayment | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket versioning | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | Bucket website | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| GET | List Multipart uploads | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1701 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest.cc#L877 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L2355 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L2363 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| HEAD | Bucket | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1713 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1689 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L826 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L834 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1725 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L382 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L437 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L901 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L945 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket acl | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1721 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1354 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1373 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1739 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1753 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket cors | ? | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1723 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1398 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1858 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1866 | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket lifecycle | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket policy | ? | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket logging | ? | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket notification | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket tagging | ? | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket requestPayment | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket versioning | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Bucket website | No | | |
++--------+------------------------+------------+------------------------------------------------------------------------------------------------------------+-------------+
+
+---------------------
+Operations on Objects
+---------------------
+
+S3 Documentation reference : http://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectOps.html
+
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| Type | Operation | Supported? | Code links | Tests links |
++=========+===========================+============+=========================================================================================================+=============+
+| DELETE | Object | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1796 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1516 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1524 | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| DELETE | Multiple objects | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1739 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1616 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1626 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1641 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1667 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1516 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1524 | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| GET | Object | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1767 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L71 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L397 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L424 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L497 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L562 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L626 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L641 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L706 | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| GET | Object acl | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| GET | Object torrent | No | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| HEAD | Object | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1777 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L71 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L397 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L424 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L497 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L562 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L626 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L641 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L706 | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| OPTIONS | Object | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1814 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1418 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1951 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1968 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1993 | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| POST | Object | Yes | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1742 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L631 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L694 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L700 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L707 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L759 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L771 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L781 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L795 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L929 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1037 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1059 | |
+| | | | https://github.com/ceph/ceph/blob/8a2eb18494005aa968b71f18121da8ebab48e950/src/rgw/rgw_rest_s3.cc#L1134 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1344 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1360 | |
+| | | | https://github.com/ceph/ceph/blob/b139a7cd34b4e203ab164ada7a8fa590b50d8b13/src/rgw/rgw_op.cc#L1365 | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| POST | Object restore | ? | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Object | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Object acl | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Object copy | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Initate multipart upload | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Upload Part | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Upload Part copy | ? | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Complete multipart upload | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | Abort multipart upload | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
+| PUT | List parts | Yes | | |
++---------+---------------------------+------------+---------------------------------------------------------------------------------------------------------+-------------+
diff --git a/doc/dev/radosgw/usage.rst b/doc/dev/radosgw/usage.rst
new file mode 100644
index 000000000..6c856fc7f
--- /dev/null
+++ b/doc/dev/radosgw/usage.rst
@@ -0,0 +1,84 @@
+============================
+Usage Design Overview
+============================
+
+
+
+
+Testing
+-------
+
+The current usage testing does the following:
+
+Following these operations:
+
+ - Create a few buckets
+ - Remove buckets
+ - Create a bucket
+ - Put object
+ - Remove object
+
+Test:
+
+1. Verify that 'usage show' with delete_obj category isn't empty after no more than 45 seconds (wait to flush)
+2. Check the following
+
+ - 'usage show'
+
+ - does not error out
+ - num of entries > 0
+ - num of summary entries > 0
+ - for every entry in categories check successful_ops > 0
+ - check that correct uid in the user summary
+
+
+ - 'usage show' with specified uid (--uid=<uid>')
+
+ - num of entries > 0
+ - num of summary entries > 0
+ - for every entry in categories check successful_ops > 0
+ - check that correct uid in the user summary
+
+ - 'usage show' with specified uid and specified categories (create_bucket,
+ put_obj, delete_obj, delete_bucket)
+
+ - for each category:
+ - does not error out
+ - num of entries > 0
+ - user in user summary is correct user
+ - length of categories entries under user summary is exactly 1
+ - name of category under user summary is correct name
+ - successful ops for the category > 0
+
+ - 'usage trim' with specified uid
+ - does not error
+ - check following 'usage show' shows complete usage info cleared for user
+
+
+Additional required testing:
+
+ - test multiple users
+
+ Do the same as in (2), with multiple users being set up.
+
+ - test with multiple buckets (> 1000 * factor, e.g., 2000)
+
+ Create multiple buckets, put objects in each. Account the number written data and verify
+ that usage reports show the expected number (up to a certain delta).
+
+ - verify usage show with a date/time range
+
+ Take timestamp of the beginning of the test, and the end of the test. Round timestamps to the
+ nearest hour (downward from start of test, upward from the end of test). List data starting
+ at end-time, make sure that no data is being shown. List data ending at start-time, make sure
+ that no data is shown. List data beginning at start-time, make sure that correct data is
+ displayed. List data ending end end-time, make sure that correct data is displayed. List
+ data beginning in begin-time, ending in end-time, make sure that correct data is displayed.
+
+ - verify usage trim with a date/time range
+
+ Take timestamp of the beginning of the test, and the end of the test. Round timestamps to the
+ nearest hour (downward from start of test, upward from the end of test). Trim data starting
+ at end-time, make sure that no data has been trimmed. Trim data ending at start-time, make sure
+ that no data has been trimmed. Trim data beginning in begin-time, ending in end-time, make sure
+ that all data has been trimmed.
diff --git a/doc/dev/rbd-diff.rst b/doc/dev/rbd-diff.rst
new file mode 100644
index 000000000..083c13165
--- /dev/null
+++ b/doc/dev/rbd-diff.rst
@@ -0,0 +1,146 @@
+RBD Incremental Backup
+======================
+
+This is a simple streaming file format for representing a diff between
+two snapshots (or a snapshot and the head) of an RBD image.
+
+Header
+~~~~~~
+
+"rbd diff v1\\n"
+
+Metadata records
+~~~~~~~~~~~~~~~~
+
+Every record has a one byte "tag" that identifies the record type,
+followed by some other data.
+
+Metadata records come in the first part of the image. Order is not
+important, as long as all the metadata records come before the data
+records.
+
+From snap
+---------
+
+- u8: 'f'
+- le32: snap name length
+- snap name
+
+To snap
+-------
+
+- u8: 't'
+- le32: snap name length
+- snap name
+
+Size
+----
+
+- u8: 's'
+- le64: (ending) image size
+
+Data Records
+~~~~~~~~~~~~
+
+These records come in the second part of the sequence.
+
+Updated data
+------------
+
+- u8: 'w'
+- le64: offset
+- le64: length
+- length bytes of actual data
+
+Zero data
+---------
+
+- u8: 'z'
+- le64: offset
+- le64: length
+
+
+Final Record
+~~~~~~~~~~~~
+
+End
+---
+
+- u8: 'e'
+
+
+Header
+~~~~~~
+
+"rbd diff v2\\n"
+
+Metadata records
+~~~~~~~~~~~~~~~~
+
+Every record has a one byte "tag" that identifies the record type,
+followed by length of data, and then some other data.
+
+Metadata records come in the first part of the image. Order is not
+important, as long as all the metadata records come before the data
+records.
+
+In v2, we have the following metadata in each section:
+(1 Bytes) tag.
+(8 Bytes) length.
+(n Bytes) data.
+
+In this way, we can skip the unrecognized tag.
+
+From snap
+---------
+
+- u8: 'f'
+- le64: length of appending data (4 + length)
+- le32: snap name length
+- snap name
+
+To snap
+-------
+
+- u8: 't'
+- le64: length of appending data (4 + length)
+- le32: snap name length
+- snap name
+
+Size
+----
+
+- u8: 's'
+- le64: length of appending data (8)
+- le64: (ending) image size
+
+Data Records
+~~~~~~~~~~~~
+
+These records come in the second part of the sequence.
+
+Updated data
+------------
+
+- u8: 'w'
+- le64: length of appending data (8 + 8 + length)
+- le64: offset
+- le64: length
+- length bytes of actual data
+
+Zero data
+---------
+
+- u8: 'z'
+- le64: length of appending data (8 + 8)
+- le64: offset
+- le64: length
+
+
+Final Record
+~~~~~~~~~~~~
+
+End
+---
+
+- u8: 'e'
diff --git a/doc/dev/rbd-export.rst b/doc/dev/rbd-export.rst
new file mode 100644
index 000000000..2edb637f6
--- /dev/null
+++ b/doc/dev/rbd-export.rst
@@ -0,0 +1,104 @@
+RBD Export & Import
+===================
+
+This is a file format of an RBD image or snapshot. It's a sparse format
+for the full image. There are three recording sections in the file.
+
+(1) Header.
+(2) Metadata.
+(3) Diffs.
+
+Header
+~~~~~~
+
+"rbd image v2\\n"
+
+Metadata records
+~~~~~~~~~~~~~~~~
+
+Every record has a one byte "tag" that identifies the record type,
+followed by length of data, and then some other data.
+
+Metadata records come in the first part of the image. Order is not
+important, as long as all the metadata records come before the data
+records.
+
+In v2, we have the following metadata in each section:
+(1 Bytes) tag.
+(8 Bytes) length.
+(n Bytes) data.
+
+In this way, we can skip the unrecognized tag.
+
+Image order
+-----------
+
+- u8: 'O'
+- le64: length of appending data (8)
+- le64: image order
+
+Image format
+------------
+
+- u8: 'F'
+- le64: length of appending data (8)
+- le64: image format
+
+Image Features
+--------------
+
+- u8: 'T'
+- le64: length of appending data (8)
+- le64: image features
+
+Image Stripe unit
+-----------------
+
+- u8: 'U'
+- le64: length of appending data (8)
+- le64: image striping unit
+
+Image Stripe count
+------------------
+
+- u8: 'C'
+- le64: length of appending data (8)
+- le64: image striping count
+
+ImageMeta Key and Value
+-----------------------
+
+- u8: 'M'
+- le64: length of appending data (length of key + length of value + 4 * 2)
+- string: image-meta key
+- string: image-meta value
+
+Final Record
+~~~~~~~~~~~~
+
+End
+---
+
+- u8: 'E'
+
+
+Diffs records
+~~~~~~~~~~~~~
+
+Record the all snapshots and the HEAD in this section.
+
+Snap Protection status
+----------------------
+
+Record the snapshot's protection status if `--export-format=2`.
+- u8: 'p'
+- le64: length of appending data (8)
+- u8: snap protection status (0 for false, 1 for true)
+
+Others
+------
+
+- le64: number of diffs
+- Diffs ...
+
+Detail please refer to rbd-diff.rst
diff --git a/doc/dev/rbd-layering.rst b/doc/dev/rbd-layering.rst
new file mode 100644
index 000000000..e6e224ce4
--- /dev/null
+++ b/doc/dev/rbd-layering.rst
@@ -0,0 +1,281 @@
+============
+RBD Layering
+============
+
+RBD layering refers to the creation of copy-on-write clones of block
+devices. This allows for fast image creation, for example to clone a
+golden master image of a virtual machine into a new instance. To
+simplify the semantics, you can only create a clone of a snapshot -
+snapshots are always read-only, so the rest of the image is
+unaffected, and there's no possibility of writing to them
+accidentally.
+
+From a user's perspective, a clone is just like any other rbd image.
+You can take snapshots of them, read/write them, resize them, etc.
+There are no restrictions on clones from a user's viewpoint.
+
+Note: the terms `child` and `parent` below mean an rbd image created
+by cloning, and the rbd image snapshot a child was cloned from.
+
+Command line interface
+----------------------
+
+Before cloning a snapshot, you must mark it as protected, to prevent
+it from being deleted while child images refer to it:
+::
+
+ $ rbd snap protect pool/image@snap
+
+Then you can perform the clone:
+::
+
+ $ rbd clone [--parent] pool/parent@snap [--image] pool2/child1
+
+You can create a clone with different object sizes from the parent:
+::
+
+ $ rbd clone --order 25 pool/parent@snap pool2/child2
+
+To delete the parent, you must first mark it unprotected, which checks
+that there are no children left:
+::
+
+ $ rbd snap unprotect pool/image@snap
+ Cannot unprotect: Still in use by pool2/image2
+ $ rbd children pool/image@snap
+ pool2/child1
+ pool2/child2
+ $ rbd flatten pool2/child1
+ $ rbd rm pool2/child2
+ $ rbd snap rm pool/image@snap
+ Cannot remove a protected snapshot: pool/image@snap
+ $ rbd snap unprotect pool/image@snap
+
+Then the snapshot can be deleted like normal:
+::
+
+ $ rbd snap rm pool/image@snap
+
+Implementation
+--------------
+
+Data Flow
+^^^^^^^^^
+
+In the initial implementation, called 'trivial layering', there will
+be no tracking of which objects exist in a clone. A read that hits a
+non-existent object will attempt to read from the parent snapshot, and
+this will continue recursively until an object exists or an image with
+no parent is found. This is done through the normal read path from
+the parent, so differing object sizes between parents and children
+do not matter.
+
+Before a write to an object is performed, the object is checked for
+existence. If it doesn't exist, a copy-up operation is performed,
+which means reading the relevant range of data from the parent
+snapshot and writing it (plus the original write) to the child
+image. To prevent races with multiple writes trying to copy-up the
+same object, this copy-up operation will include an atomic create. If
+the atomic create fails, the original write is done instead. This
+copy-up operation is implemented as a class method so that extra
+metadata can be stored by it in the future. In trivial layering, the
+copy-up operation copies the entire range needed to the child object
+(that is, the full size of the child object). A future optimization
+could make this copy-up more fine-grained.
+
+Another future optimization could be storing a bitmap of which objects
+actually exist in a child. This would obviate the check for existence
+before each write, and let reads go directly to the parent if needed.
+
+These optimizations are discussed in:
+
+http://marc.info/?l=ceph-devel&m=129867273303846
+
+Parent/Child relationships
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Children store a reference to their parent in their header, as a tuple
+of (pool id, image id, snapshot id). This is enough information to
+open the parent and read from it.
+
+In addition to knowing which parent a given image has, we want to be
+able to tell if a protected snapshot still has children. This is
+accomplished with a new per-pool object, `rbd_children`, which maps
+(parent pool id, parent image id, parent snapshot id) to a list of
+child image ids. This is stored in the same pool as the child image
+because the client creating a clone already has read/write access to
+everything in this pool, but may not have write access to the parent's
+pool. This lets a client with read-only access to one pool clone a
+snapshot from that pool into a pool they have full access to. It
+increases the cost of unprotecting an image, since this needs to check
+for children in every pool, but this is a rare operation. It would
+likely only be done before removing old images, which is already much
+more expensive because it involves deleting every data object in the
+image.
+
+Protection
+^^^^^^^^^^
+
+Internally, protection_state is a field in the header object that
+can be in three states. "protected", "unprotected", and
+"unprotecting". The first two are set as the result of "rbd
+protect/unprotect". The "unprotecting" state is set while the "rbd
+unprotect" command checks for any child images. Only snapshots in the
+"protected" state may be cloned, so the "unprotected" state prevents
+a race like:
+
+1. A: walk through all pools, look for clones, find none
+2. B: create a clone
+3. A: unprotect parent
+4. A: rbd snap rm pool/parent@snap
+
+Resizing
+^^^^^^^^
+
+Resizing an rbd image is like truncating a sparse file. New space is
+treated as zeroes, and shrinking an rbd image deletes the contents
+beyond the old bounds. This means that if you have a 10G image full of
+data, and you resize it down to 5G and then up to 10G again, the last
+5G is treated as zeroes (and any objects that held that data were
+removed when the image was shrunk).
+
+Layering complicates this because the absence of an object no longer
+implies it should be treated as zeroes - if the object is part of a
+clone, it may mean that some data needs to be read from the parent.
+
+To preserve the resizing behavior for clones, we need to keep track of
+which objects could be stored in the parent. We can track this as the
+amount of overlap the child has with the parent, since resizing only
+changes the end of an image. When a child is created, its overlap
+is the size of the parent snapshot. On each subsequent resize, the
+overlap is `min(overlap, new_size)`. That is, shrinking the image
+may shrinks the overlap, but increasing the image's size does not
+change the overlap.
+
+Objects that do not exist past the overlap are treated as zeroes.
+Objects that do not exist before that point fall back to reading
+from the parent.
+
+Since this overlap changes over time, we store it as part of the
+metadata for a snapshot as well.
+
+Renaming
+^^^^^^^^
+
+Currently the rbd header object (that stores all the metadata about an
+image) is named after the name of the image. This makes renaming
+disrupt clients who have the image open (such as children reading from
+a parent). To avoid this, we can name the header object by the
+id of the image, which does not change. That is, the name of the
+header object could be `rbd_header.$id`, where $id is a unique id for
+the image in the pool.
+
+When a client opens an image, all it knows is the name. There is
+already a per-pool `rbd_directory` object that maps image names to
+ids, but if we relied on it to get the id, we could not open any
+images in that pool if that single object was unavailable. To avoid
+this dependency, we can store the id of an image in an object called
+`rbd_id.$image_name`, where $image_name is the name of the image. The
+per-pool `rbd_directory` object is still useful for listing all images
+in a pool, however.
+
+Header changes
+--------------
+
+The header needs a few new fields:
+
+* int64_t parent_pool_id
+* string parent_image_id
+* uint64_t parent_snap_id
+* uint64_t overlap (how much of the image may be referring to the parent)
+
+These are stored in a "parent" key, which is only present if the image
+has a parent.
+
+cls_rbd
+^^^^^^^
+
+Some new methods are needed:
+::
+
+ /***************** methods on the rbd header *********************/
+ /**
+ * Sets the parent and overlap keys.
+ * Fails if any of these keys exist, since the image already
+ * had a parent.
+ */
+ set_parent(uint64_t pool_id, string image_id, uint64_t snap_id)
+
+ /**
+ * returns the parent pool id, image id, snap id, and overlap, or -ENOENT
+ * if parent_pool_id does not exist or is -1
+ */
+ get_parent(uint64_t snapid)
+
+ /**
+ * Removes the parent key
+ */
+ remove_parent() // after all parent data is copied to the child
+
+ /*************** methods on the rbd_children object *****************/
+
+ add_child(uint64_t parent_pool_id, string parent_image_id,
+ uint64_t parent_snap_id, string image_id);
+ remove_child(uint64_t parent_pool_id, string parent_image_id,
+ uint64_t parent_snap_id, string image_id);
+ /**
+ * List ids of a given parent
+ */
+ get_children(uint64_t parent_pool_id, string parent_image_id,
+ uint64_t parent_snap_id, uint64_t max_return,
+ string start);
+ /**
+ * list parent
+ */
+ get_parents(uint64_t max_return, uint64_t start_pool_id,
+ string start_image_id, string start_snap_id);
+
+
+ /************ methods on the rbd_id.$image_name object **************/
+
+ set_id(string id)
+ get_id()
+
+ /************** methods on the rbd_directory object *****************/
+
+ dir_get_id(string name);
+ dir_get_name(string id);
+ dir_list(string start_after, uint64_t max_return);
+ dir_add_image(string name, string id);
+ dir_remove_image(string name, string id);
+ dir_rename_image(string src, string dest, string id);
+
+Two existing methods will change if the image supports
+layering:
+::
+
+ snapshot_add - stores current overlap and has_parent with
+ other snapshot metadata (images that don't have
+ layering enabled aren't affected)
+
+ set_size - will adjust the parent overlap down as needed.
+
+librbd
+^^^^^^
+
+Opening a child image opens its parent (and this will continue
+recursively as needed). This means that an ImageCtx will contain a
+pointer to the parent image context. Differing object sizes won't
+matter, since reading from the parent will go through the parent
+image context.
+
+Discard will need to change for layered images so that it only
+truncates objects, and does not remove them. If we removed objects, we
+could not tell if we needed to read them from the parent.
+
+A new clone method will be added, which takes the same arguments as
+create except size (size of the parent image is used).
+
+Instead of expanding the rbd_info struct, we will break the metadata
+retrieval into several API calls. Right now, the only users of
+rbd_stat() other than 'rbd info' only use it to retrieve image size.
diff --git a/doc/dev/release-checklists.rst b/doc/dev/release-checklists.rst
new file mode 100644
index 000000000..e9fe94d82
--- /dev/null
+++ b/doc/dev/release-checklists.rst
@@ -0,0 +1,107 @@
+==================
+Release checklists
+==================
+
+Dev Kickoff
+===========
+
+These steps should be taken when starting a new major release, just after
+the previous release has been tagged (vX.2.0) and that tag has been merged
+back into master.
+
+X is the release we are just starting development on. X-1 is the one
+that was just released (X-1).2.0.
+
+Versions and tags
+-----------------
+
+- [x] Update CMakeLists.txt VERSION (right at the top to X.0.0)
+- [x] Update src/ceph_release with the new release name, number, and type ('dev')
+- [ ] Initial tag vX.0.0 (so that we can distinguish from (and sort
+ after) the backported (X-1).2.Z versions.
+
+
+Define release names and constants
+----------------------------------
+
+Make sure X (and, ideally, X+1) is defined:
+
+- [x] src/common/ceph_releases.h (`ceph_release_t`)
+- [x] src/common/ceph_strings.cc (`ceph_release_name()`)
+- [x] src/include/rados.h (`CEPH_RELEASE_*` and `MAX`)
+- [x] src/mon/mon_types.h (`ceph::features::mon::FEATURE_*` and related structs and helpers; note that monmaptool CLI test output will need adjustment)
+- [x] src/mds/cephfs_features.h (`CEPHFS_CURRENT_RELEASE`)
+
+Scripts
+~~~~~~~
+
+- [x] src/script/backport-resolve-issue (`releases()`, `ver_to_release()`... but for X-1)
+- [x] src/script/ceph-release-notes (X-1)
+- [x] ceph-build.git scripts/build_utils.sh `release_from_version()`
+
+Misc
+~~~~
+- [x] update src/ceph-volume/ceph_volume/__init__.py (`__release__`)
+
+Feature bits
+------------
+
+- [x] ensure that `SERVER_X` is defined
+- [x] change any features `DEPRECATED` in release X-3 are now marked `RETIRED`.
+- [ ] look for features that (1) were present in X-2 and (2) have no
+ client dependency and mark them `DEPRECATED` as of X.
+
+
+Compatsets
+----------
+
+- [x] mon/Monitor.h (`CEPH_MON_FEATURE_INCOMPAT_X`)
+- [x] mon/Monitor.cc (include in `get_supported_features()`)
+- [x] mon/Monitor.cc (`apply_monmap_to_compatset_features()`)
+- [x] mon/Monitor.cc (`calc_quorum_requirements()`)
+
+Mon
+---
+
+- [x] qa/standalone/mon/misc adjust `TEST_mon_features` (add X cases and adjust `--mon-debug-no-require-X`)
+- [x] mon/MgrMonitor.cc adjust `always_on_modules`
+- [x] common/options.cc define `mon_debug_no_require_X`
+- [x] common/options.cc remove `mon_debug_no_require_X-2`
+- [x] mon/OSDMonitor.cc `create_initial`: adjust new `require_osd_release`, and add associated `mon_debug_no_require_X`
+- [x] mon/OSDMonitor.cc `preprocess_boot`: adjust "disallow boot of " condition to disallow X if `require_osd_release` < X-2.
+- [x] mon/OSDMonitor.cc: adjust "osd require-osd-release" to (1) allow setting X, and (2) check that all mons *and* OSDs have X
+- [x] mon/MonCommands.h: adjust "osd require-osd-release" allows options to include X
+- [x] qa/workunits/cephtool/test.sh: adjust `require-osd-release` test
+
+
+Code cleanup
+------------
+
+- [ ] search code for "after X-1" or "X" for conditional checks
+- [ ] search code for X-2 and X-3 (`CEPH_FEATURE_SERVER_*` and
+ `ceph_release_t::*`)
+- [ ] search code for `require_osd_release`
+- [ ] search code for `min_mon_release`
+
+QA suite
+--------
+
+- [ ] create qa/suites/upgrade/(X-1)-x
+- [x] remove qa/suites/upgrade/(X-3)-x-*
+- [x] remove qa/suites/rados/upgrade/(X-3)-x-singleton symlink
+- [x] create qa/releases/X.yaml
+- [x] create qa/suites/rados/thrash-old-clients/1-install/(X-1).yaml
+
+
+
+First release candidate
+=======================
+
+- [ ] src/ceph_release: change type to `rc`
+
+
+First stable release
+====================
+
+- [ ] src/ceph_release: change type `stable`
+- [ ] generate new object corpus for encoding/decoding tests - see :doc:`corpus`
diff --git a/doc/dev/release-process.rst b/doc/dev/release-process.rst
new file mode 100644
index 000000000..3750759b8
--- /dev/null
+++ b/doc/dev/release-process.rst
@@ -0,0 +1,225 @@
+======================
+ Ceph Release Process
+======================
+
+Prerequisites
+=============
+
+Signing Machine
+---------------
+The signing machine is a virtual machine in the `Sepia lab
+<https://wiki.sepia.ceph.com/doku.php?id=start>`_. SSH access to the signing
+machine is limited to the usual Infrastructure Admins along with a few other
+component leads (e.g., nfs-ganesha, ceph-iscsi).
+
+The ``ubuntu`` user on the machine has some `build scripts <https://github.com/ceph/ceph-build/tree/main/scripts>`_ that help with pulling, pushing, and signing packages.
+
+The GPG signing key permanently lives on a `Nitrokey Pro <https://shop.nitrokey.com/shop/product/nkpr2-nitrokey-pro-2-3>`_ and is passed through to the VM via RHV. This helps to ensure that the key cannot be exported or leave the datacenter in any way.
+
+New Major Releases
+------------------
+For each new major (alphabetical) release, you must create one ``ceph-release`` RPM for each RPM repo (e.g., one for el8 and one for el9). `chacra <https://github.com/ceph/chacra>`_ is a python service we use to store DEB and RPM repos. The chacra repos are configured to include this ceph-release RPM, but it must be built separately. You must make sure that chacra is properly configured to include this RPM for each particular release.
+
+1. Update chacra so it is aware of the new Ceph release. See `this PR <https://github.com/ceph/chacra/pull/219>`_ for an example.
+2. Redeploy chacra (e.g., ``ansible-playbook chacra.ceph.com.yml``)
+3. Run https://jenkins.ceph.com/view/all/job/ceph-release-rpm/
+
+Summarized build process
+========================
+
+1. QE finishes testing and finds a stopping point. That commit is pushed to the ``$release-release`` branch in ceph.git (e.g., ``quincy-release``). This allows work to continue in the working ``$release`` branch without having to freeze it during the release process.
+2. The Ceph Council approves and notifies the "Build Lead".
+3. The "Build Lead" starts the `Jenkins multijob <https://jenkins.ceph.com/view/all/job/ceph>`_, which triggers all builds.
+4. Packages are pushed to chacra.ceph.com.
+5. Packages are pulled from chacra.ceph.com to the Signer VM.
+6. Packages are signed.
+7. Packages are pushed to download.ceph.com.
+8. Release containers are built and pushed to quay.io.
+
+Hotfix Release Process Deviation
+--------------------------------
+
+A hotfix release has a couple differences.
+
+1. Check out the most recent tag. For example, if we're releasing a hotfix on top of 17.2.3, ``git checkout -f -B quincy-release origin/v17.2.3``
+2. ``git cherry-pick -x`` the necessary hotfix commits
+3. ``git push -f origin quincy-release``
+4. Notify the "Build Lead" to start the build.
+5. The "Build Lead" should set ``RELEASE_TYPE=HOTFIX`` instead of ``STABLE``.
+
+Security Release Process Deviation
+----------------------------------
+
+A security/CVE release is similar to a hotfix release with two differences:
+
+ 1. The fix should be pushed to the `ceph-private <https://github.com/ceph/ceph-private>`_ repo instead of ceph.git (requires GitHub Admin Role).
+ 2. The tags (e.g., v17.2.4) must be manually pushed to ceph.git by the "Build Lead."
+
+1. Check out the most recent tag. For example, if we're releasing a security fix on top of 17.2.3, ``git checkout -f -B quincy-release origin/v17.2.3``
+2. ``git cherry-pick -x`` the necessary security fix commits
+3. ``git remote add security git@github.com:ceph/ceph-private.git``
+4. ``git push -f security quincy-release``
+5. Notify the "Build Lead" to start the build.
+6. The "Build Lead" should set ``RELEASE_TYPE=SECURITY`` instead of ``STABLE``.
+7. Finally, the `ceph-tag <https://github.com/ceph/ceph-build/blob/main/ansible/roles/ceph-release/tasks/push.yml>`_ steps need to be manually run by the "Build Lead" as close to the Announcement time as possible::
+
+ # Example using quincy pretending 17.2.4 is the security release version
+ # Add the ceph-releases repo (also requires GitHub Admin Role). The `ceph-setup <https://jenkins.ceph.com/job/ceph-setup>`_ job will have already created and pushed the tag to ceph-releases.git.
+ git remote add releases git@github.com:ceph/ceph-releases.git
+ git fetch --all
+ # Check out the version commit
+ git checkout -f -B quincy-release releases/quincy-release
+ git push -f origin quincy-release
+ git push origin v17.2.4
+ # Now create a Pull Request of quincy-release targeting quincy to merge the version commit and security fixes back into the quincy branch
+
+1. Preparing the release branch
+===============================
+
+Once QE has determined a stopping point in the working (e.g., ``quincy``) branch, that commit should be pushed to the corresponding ``quincy-release`` branch.
+
+Notify the "Build Lead" that the release branch is ready.
+
+2. Starting the build
+=====================
+
+We'll use a stable/regular 15.2.17 release of Octopus as an example throughout this document.
+
+1. Browse to https://jenkins.ceph.com/view/all/job/ceph/build?delay=0sec
+2. Log in with GitHub OAuth
+3. Set the parameters as necessary::
+
+ BRANCH=octopus
+ TAG=checked
+ VERSION=15.2.17
+ RELEASE_TYPE=STABLE
+ ARCHS=x86_64 arm64
+
+4. Use https://docs.ceph.com/en/latest/start/os-recommendations/?highlight=debian#platforms to determine the ``DISTROS`` parameter. For example,
+
+ +-------------------+-------------------------------------------+
+ | Release | Distro Codemap |
+ +===================+===========================================+
+ | octopus (15.X.X) | ``focal bionic centos7 centos8 buster`` |
+ +-------------------+-------------------------------------------+
+ | pacific (16.X.X) | ``focal bionic centos8 buster bullseye`` |
+ +-------------------+-------------------------------------------+
+ | quincy (17.X.X) | ``focal centos8 centos9 bullseye`` |
+ +-------------------+-------------------------------------------+
+
+5. Click ``Build``.
+
+3. Release Notes
+================
+
+Packages take hours to build. Use those hours to create the Release Notes and Announcements:
+
+1. ceph.git Release Notes (e.g., `v15.2.17's ceph.git (docs.ceph.com) PR <https://github.com/ceph/ceph/pull/47198>`_)
+2. ceph.io Release Notes (e.g., `v15.2.17's ceph.io.git (www.ceph.io) PR <https://github.com/ceph/ceph.io/pull/427>`_)
+3. E-mail announcement
+
+See `the Ceph Tracker wiki page that explains how to write the release notes <https://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO_write_the_release_notes>`_.
+
+4. Signing and Publishing the Build
+===================================
+
+#. Obtain the sha1 of the version commit from the `build job <https://jenkins.ceph.com/view/all/job/ceph>`_ or the ``sha1`` file created by the `ceph-setup <https://jenkins.ceph.com/job/ceph-setup/>`_ job.
+
+#. Download the packages from chacra.ceph.com to the signing virtual machine. These packages get downloaded to ``/opt/repos`` where the `Sepia Lab Long Running (Ceph) Cluster <https://wiki.sepia.ceph.com/doku.php?id=services:longrunningcluster>`_ is mounted.
+
+ .. prompt:: bash $
+
+ ssh ubuntu@signer.front.sepia.ceph.com
+ sync-pull ceph [pacific|quincy|etc] <sha1>
+
+ Example::
+
+ $ sync-pull ceph octopus 8a82819d84cf884bd39c17e3236e0632ac146dc4
+ sync for: ceph octopus
+ ********************************************
+ Found the most packages (332) in ubuntu/bionic.
+ No JSON object could be decoded
+ No JSON object could be decoded
+ ubuntu@chacra.ceph.com:/opt/repos/ceph/octopus/8a82819d84cf884bd39c17e3236e0632ac146dc4/ubuntu/bionic/flavors/default/* /opt/repos/ceph/octopus-15.2.17/debian/jessie/
+ --------------------------------------------
+ receiving incremental file list
+ db/
+ db/checksums.db
+ 180.22K 100% 2.23MB/s 0:00:00 (xfr#1, to-chk=463/467)
+ db/contents.cache.db
+ 507.90K 100% 1.95MB/s 0:00:00 (xfr#2, to-chk=462/467)
+ db/packages.db
+
+ etc...
+
+#. Sign the DEBs:
+
+ .. prompt:: bash
+
+ merfi gpg /opt/repos/ceph/octopus-15.2.17/debian
+
+ Example::
+
+ $ merfi gpg /opt/repos/ceph/octopus-15.2.17/debian
+ --> Starting path collection, looking for files to sign
+ --> 18 matching paths found
+ --> will sign with the following commands:
+ --> gpg --batch --yes --armor --detach-sig --output Release.gpg Release
+ --> gpg --batch --yes --clearsign --output InRelease Release
+ --> signing: /opt/repos/ceph/octopus-15.2.17/debian/jessie/dists/bionic/Release
+ --> Running command: gpg --batch --yes --armor --detach-sig --output Release.gpg Release
+ --> Running command: gpg --batch --yes --clearsign --output InRelease Release
+ --> signing: /opt/repos/ceph/octopus-15.2.17/debian/jessie/dists/focal/Release
+ --> Running command: gpg --batch --yes --armor --detach-sig --output Release.gpg Release
+ --> Running command: gpg --batch --yes --clearsign --output InRelease Release
+
+ etc...
+
+#. Sign the RPMs:
+
+ .. prompt:: bash
+
+ sign-rpms octopus
+
+ Example::
+
+ $ sign-rpms octopus
+ Checking packages in: /opt/repos/ceph/octopus-15.2.17/centos/7
+ signing: /opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-release-1-1.el7.src.rpm
+ /opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-release-1-1.el7.src.rpm:
+ signing: /opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-15.2.17-0.el7.src.rpm
+ /opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-15.2.17-0.el7.src.rpm:
+ signing: /opt/repos/ceph/octopus-15.2.17/centos/7/noarch/ceph-mgr-modules-core-15.2.17-0.el7.noarch.rpm
+
+ etc...
+
+5. Publish the packages to download.ceph.com:
+
+ .. prompt:: bash $
+
+ sync-push octopus
+
+5. Build Containers
+===================
+
+Start the following two jobs:
+
+#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs/
+#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs-arm64/
+
+6. Announce the Release
+=======================
+
+Version Commit PR
+-----------------
+
+The `ceph-tag Jenkins job <https://jenkins.ceph.com/job/ceph-tag>`_ creates a Pull Request in ceph.git that targets the release branch.
+
+If this was a regular release (not a hotfix release or a security release), the only commit in that Pull Request should be the version commit. For example, see `v15.2.17's version commit PR <https://github.com/ceph/ceph/pull/47520>`_.
+
+Request a review and then merge the Pull Request.
+
+Announcing
+----------
+
+Publish the Release Notes on ceph.io before announcing the release by email, because the e-mail announcement references the ceph.io blog post.
diff --git a/doc/dev/seastore.rst b/doc/dev/seastore.rst
new file mode 100644
index 000000000..eb89d8219
--- /dev/null
+++ b/doc/dev/seastore.rst
@@ -0,0 +1,323 @@
+==========
+ SeaStore
+==========
+
+Goals and Basics
+================
+
+* Target NVMe devices. Not primarily concerned with pmem or HDD.
+* make use of SPDK for user-space driven IO
+* Use Seastar futures programming model to facilitate
+ run-to-completion and a sharded memory/processing model
+* Allow zero- (or minimal) data copying on read and write paths when
+ combined with a seastar-based messenger using DPDK
+
+Motivation and background
+-------------------------
+
+All flash devices are internally structured in terms of segments that
+can be written efficiently but must be erased in their entirety. The
+NVMe device generally has limited knowledge about what data in a
+segment is still "live" (hasn't been logically discarded), making the
+inevitable garbage collection within the device inefficient. We can
+design an on-disk layout that is friendly to GC at lower layers and
+drive garbage collection at higher layers.
+
+In principle a fine-grained discard could communicate our intent to
+the device, but in practice discard is poorly implemented in the
+device and intervening software layers.
+
+The basic idea is that all data will be stream out sequentially to
+large segments on the device. In the SSD hardware, segments are
+likely to be on the order of 100's of MB to tens of GB.
+
+SeaStore's logical segments would ideally be perfectly aligned with
+the hardware segments. In practice, it may be challenging to
+determine geometry and to sufficiently hint to the device that LBAs
+being written should be aligned to the underlying hardware. In the
+worst case, we can structure our logical segments to correspond to
+e.g. 5x the physical segment size so that we have about ~20% of our
+data misaligned.
+
+When we reach some utilization threshold, we mix cleaning work in with
+the ongoing write workload in order to evacuate live data from
+previously written segments. Once they are completely free we can
+discard the entire segment so that it can be erased and reclaimed by
+the device.
+
+The key is to mix a small bit of cleaning work with every write
+transaction to avoid spikes and variance in write latency.
+
+Data layout basics
+------------------
+
+One or more cores/shards will be reading and writing to the device at
+once. Each shard will have its own independent data it is operating
+on and stream to its own open segments. Devices that support streams
+can be hinted accordingly so that data from different shards is not
+mixed on the underlying media.
+
+Persistent Memory
+-----------------
+
+As the initial sequential design above matures, we'll introduce
+persistent memory support for metadata and caching structures.
+
+Design
+======
+
+The design is based heavily on both f2fs and btrfs. Each reactor
+manages its own root. Prior to reusing a segment, we rewrite any live
+blocks to an open segment.
+
+Because we are only writing sequentially to open segments, we must
+“clean” one byte of an existing segment for every byte written at
+steady state. Generally, we’ll need to reserve some portion of the
+usable capacity in order to ensure that write amplification remains
+acceptably low (20% for 2x? -- TODO: find prior work). As a design
+choice, we want to avoid a background gc scheme as it tends to
+complicate estimating operation cost and tends to introduce
+non-deterministic latency behavior. Thus, we want a set of structures
+that permits us to relocate blocks from existing segments inline with
+ongoing client IO.
+
+To that end, at a high level, we’ll maintain 2 basic metadata trees.
+First, we need a tree mapping ghobject_t->onode_t (onode_by_hobject).
+Second, we need a way to find live blocks within a segment and a way
+to decouple internal references from physical locations (lba_tree).
+
+Each onode contains xattrs directly as well as the top of the omap and
+extent trees (optimization: we ought to be able to fit small enough
+objects into the onode).
+
+Segment Layout
+--------------
+
+The backing storage is abstracted into a set of segments. Each
+segment can be in one of 3 states: empty, open, closed. The byte
+contents of a segment are a sequence of records. A record is prefixed
+by a header (including length and checksums) and contains a sequence
+of deltas and/or blocks. Each delta describes a logical mutation for
+some block. Each included block is an aligned extent addressable by
+<segment_id_t, segment_offset_t>. A transaction can be implemented by
+constructing a record combining deltas and updated blocks and writing
+it to an open segment.
+
+Note that segments will generally be large (something like >=256MB),
+so there will not typically be very many of them.
+
+record: [ header | delta | delta... | block | block ... ]
+segment: [ record ... ]
+
+See src/crimson/os/seastore/journal.h for Journal implementation
+See src/crimson/os/seastore/seastore_types.h for most seastore structures.
+
+Each shard will keep open N segments for writes
+
+- HDD: N is probably 1 on one shard
+- NVME/SSD: N is probably 2/shard, one for "journal" and one for
+ finished data records as their lifetimes are different.
+
+I think the exact number to keep open and how to partition writes
+among them will be a tuning question -- gc/layout should be flexible.
+Where practical, the goal is probably to partition blocks by expected
+lifetime so that a segment either has long lived or short lived
+blocks.
+
+The backing physical layer is exposed via a segment based interface.
+See src/crimson/os/seastore/segment_manager.h
+
+Journal and Atomicity
+---------------------
+
+One open segment is designated to be the journal. A transaction is
+represented by an atomically written record. A record will contain
+blocks written as part of the transaction as well as deltas which
+are logical mutations to existing physical extents. Transaction deltas
+are always written to the journal. If the transaction is associated
+with blocks written to other segments, final record with the deltas
+should be written only once the other blocks are persisted. Crash
+recovery is done by finding the segment containing the beginning of
+the current journal, loading the root node, replaying the deltas, and
+loading blocks into the cache as needed.
+
+See src/crimson/os/seastore/journal.h
+
+Block Cache
+-----------
+
+Every block is in one of two states:
+
+- clean: may be in cache or not, reads may cause cache residence or
+ not
+- dirty: the current version of the record requires overlaying deltas
+ from the journal. Must be fully present in the cache.
+
+Periodically, we need to trim the journal (else, we’d have to replay
+journal deltas from the beginning of time). To do this, we need to
+create a checkpoint by rewriting the root blocks and all currently
+dirty blocks. Note, we can do journal checkpoints relatively
+infrequently, and they needn’t block the write stream.
+
+Note, deltas may not be byte range modifications. Consider a btree
+node structured with keys to the left and values to the right (common
+trick for improving point query/key scan performance). Inserting a
+key/value into that node at the min would involve moving a bunch of
+bytes, which would be expensive (or verbose) to express purely as a
+sequence of byte operations. As such, each delta indicates the type
+as well as the location of the corresponding extent. Each block
+type can therefore implement CachedExtent::apply_delta as appopriate.
+
+See src/os/crimson/seastore/cached_extent.h.
+See src/os/crimson/seastore/cache.h.
+
+GC
+---
+
+Prior to reusing a segment, we must relocate all live blocks. Because
+we only write sequentially to empty segments, for every byte we write
+to currently open segments, we need to clean a byte of an existing
+closed segment. As a design choice, we’d like to avoid background
+work as it complicates estimating operation cost and has a tendency to
+create non-deterministic latency spikes. Thus, under normal operation
+each seastore reactor will be inserting enough work to clean a segment
+at the same rate as incoming operations.
+
+In order to make this cheap for sparse segments, we need a way to
+positively identify dead blocks. Thus, for every block written, an
+entry will be added to the lba tree with a pointer to the previous lba
+in the segment. Any transaction that moves a block or modifies the
+reference set of an existing one will include deltas/blocks required
+to update the lba tree to update or remove the previous block
+allocation. The gc state thus simply needs to maintain an iterator
+(of a sort) into the lba tree segment linked list for segment
+currently being cleaned and a pointer to the next record to be
+examined -- records not present in the allocation tree may still
+contain roots (like allocation tree blocks) and so the record metadata
+must be checked for a flag indicating root blocks.
+
+For each transaction, we evaluate a heuristic function of the
+currently available space and currently live space in order to
+determine whether we need to do cleaning work (could be simply a range
+of live/used space ratios).
+
+TODO: there is not yet a GC implementation
+
+Logical Layout
+==============
+
+Using the above block and delta semantics, we build two root level trees:
+- onode tree: maps hobject_t to onode_t
+- lba_tree: maps lba_t to lba_range_t
+
+Each of the above structures is comprised of blocks with mutations
+encoded in deltas. Each node of the above trees maps onto a block.
+Each block is either physically addressed (root blocks and the
+lba_tree nodes) or is logically addressed (everything else).
+Physically addressed blocks are located by a paddr_t: <segment_id_t,
+segment_off_t> tuple and are marked as physically addressed in the
+record. Logical blocks are addressed by laddr_t and require a lookup in
+the lba_tree to address.
+
+Because the cache/transaction machinery lives below the level of the
+lba tree, we can represent atomic mutations of the lba tree and other
+structures by simply including both in a transaction.
+
+LBAManager/BtreeLBAManager
+--------------------------
+
+Implementations of the LBAManager interface are responsible for managing
+the logical->physical mapping -- see crimson/os/seastore/lba_manager.h.
+
+The BtreeLBAManager implements this interface directly on top of
+Journal and SegmentManager using a wandering btree approach.
+
+Because SegmentManager does not let us predict the location of a
+committed record (a property of both SMR and Zone devices), references
+to blocks created within the same transaction will necessarily be
+*relative* addresses. The BtreeLBAManager maintains an invariant by
+which the in-memory copy of any block will contain only absolute
+addresses when !is_pending() -- on_commit and complete_load fill in
+absolute addresses based on the actual block addr and on_delta_write
+does so based on the just committed record. When is_pending(), if
+is_initial_pending references in memory are block_relative (because
+they will be written to the original block location) and
+record_relative otherwise (value will be written to delta).
+
+TransactionManager
+------------------
+
+The TransactionManager is responsible for presenting a unified
+interface on top of the Journal, SegmentManager, Cache, and
+LBAManager. Users can allocate and mutate extents based on logical
+addresses with segment cleaning handled in the background.
+
+See crimson/os/seastore/transaction_manager.h
+
+Next Steps
+==========
+
+Journal
+-------
+
+- Support for scanning a segment to find physically addressed blocks
+- Add support for trimming the journal and releasing segments.
+
+Cache
+-----
+
+- Support for rewriting dirty blocks
+
+ - Need to add support to CachedExtent for finding/updating
+ dependent blocks
+ - Need to add support for adding dirty block writout to
+ try_construct_record
+
+LBAManager
+----------
+
+- Add support for pinning
+- Add segment -> laddr for use in GC
+- Support for locating remaining used blocks in segments
+
+GC
+---
+
+- Initial implementation
+- Support in BtreeLBAManager for tracking used blocks in segments
+- Heuristic for identifying segments to clean
+
+Other
+------
+
+- Add support for periodically generating a journal checkpoint.
+- Onode tree
+- Extent tree
+- Remaining ObjectStore integration
+
+ObjectStore considerations
+==========================
+
+Splits, merges, and sharding
+----------------------------
+
+One of the current ObjectStore requirements is to be able to split a
+collection (PG) in O(1) time. Starting in mimic, we also need to be
+able to merge two collections into one (i.e., exactly the reverse of a
+split).
+
+However, the PGs that we split into would hash to different shards of
+the OSD in the current sharding scheme. One can imagine replacing
+that sharding scheme with a temporary mapping directing the smaller
+child PG to the right shard since we generally then migrate that PG to
+another OSD anyway, but this wouldn't help us in the merge case where
+the constituent pieces may start out on different shards and
+ultimately need to be handled in the same collection (and be operated
+on via single transactions).
+
+This suggests that we likely need a way for data written via one shard
+to "switch ownership" and later be read and managed by a different
+shard.
+
+
+
diff --git a/doc/dev/sepia.rst b/doc/dev/sepia.rst
new file mode 100644
index 000000000..3064900c5
--- /dev/null
+++ b/doc/dev/sepia.rst
@@ -0,0 +1,8 @@
+Sepia community test lab
+========================
+
+The Ceph community maintains a test lab that is open to active contributors to
+the Ceph project. Please see the `Sepia wiki`_ for more information.
+
+.. _Sepia wiki: https://wiki.sepia.ceph.com/doku.php
+
diff --git a/doc/dev/session_authentication.rst b/doc/dev/session_authentication.rst
new file mode 100644
index 000000000..48fab623d
--- /dev/null
+++ b/doc/dev/session_authentication.rst
@@ -0,0 +1,160 @@
+==============================================
+Session Authentication for the Cephx Protocol
+==============================================
+Peter Reiher
+7/30/12
+
+The original Cephx protocol authenticated the client to the authenticator and set up a session
+key used to authenticate the client to the server it needs to talk to. It did not, however,
+authenticate the ongoing messages between the client and server. Based on the fact that they
+share a secret key, these ongoing session messages can be easily authenticated by using the
+key to sign the messages.
+
+This document describes changes to the code that allow such ongoing session authentication.
+The changes allow for future changes that permit other authentication protocols (and the
+existing null NONE and UNKNOWN protocols) to handle signatures, but the only protocol that
+actually does signatures, at the time of the writing, is the Cephx protocol.
+
+Introduction
+-------------
+
+This code comes into play after the Cephx protocol has completed. At this point, the client and
+server share a secret key. This key will be used for authentication. For other protocols, there
+may or may not be such a key in place, and perhaps the actual procedures used to perform
+signing will be different, so the code is written to be general.
+
+The "session" here is represented by an established pipe. For such pipes, there should be a
+``session\_security`` structure attached to the pipe. Whenever a message is to be sent on the
+pipe, code that handles the signature for this kind of session security will be called. On the
+other end of the pipe, code that checks this kind of session security's message signatures will
+be called. Messages that fail the signature check will not be processed further. That implies
+that the sender had better be in agreement with the receiver on the session security being used,
+since otherwise messages will be uniformly dropped between them.
+
+The code is also prepared to handle encryption and decryption of session messages, which would
+add secrecy to the integrity provided by the signatures. No protocol currently implemented
+encrypts the ongoing session messages, though.
+
+For this functionality to work, several steps are required. First, the sender and receiver must have
+a successful run of the cephx protocol to establish a shared key. They must store that key somewhere
+that the pipe can get at later, to permit messages to be signed with it. Sent messages must be
+signed, and received messages must have their signatures checked.
+
+The signature could be computed in a variety of ways, but currently its size is limited to 64 bits.
+A message's signature is placed in its footer, in a field called ``sig``.
+
+The signature code in Cephx can be turned on and off at runtime, using a Ceph boolean option called
+``cephx\_sign\_messages``. It is currently set to false, by default, so no messages will be signed. It
+must be changed to true to cause signatures to be calculated and checked.
+
+Storing the Key
+---------------
+
+The key is needed to create signatures on the sending end and check signatures on the receiving end.
+In the future, if asymmetric crypto is an option, it's possible that two keys (a private one for
+this end of the pipe and a public one for the other end) would need to be stored. At this time,
+messages going in both directions will be signed with the same key, so only that key needs to be
+saved.
+
+The key is saved when the pipe is established. On the client side, this happens in ``connect()``,
+which is located in ``msg/Pipe.cc``. The key is obtained from a run of the Cephx protocol,
+which results in a successfully checked authorizer structure. If there is such an authorizer
+available, the code calls ``get\_auth\_session\_handler()`` to create a new authentication session handler
+and stores it in the pipe data structure. On the server side, a similar thing is done in
+``accept()`` after the authorizer provided by the client has been verified.
+
+Once these things are done on either end of the connection, session authentication can start.
+
+These routines (``connect()`` and ``accept()``) are also used to handle situations where a new
+session is being set up. At this stage, no authorizer has been created yet, so there's no key.
+Special cases in the code that calls the signature code skip these calls when the
+``CEPH\_AUTH\_UNKNOWN`` protocol is in use. This protocol label is on the pre-authorizer
+messages in a session, indicating that negotiation on an authentication protocol is ongoing and
+thus signature is not possible. There will be a reliable authentication operation later in this
+session before anything sensitive should be passed, so this is not a security problem.
+
+Signing Messages
+----------------
+
+Messages are signed in the ``write\_message`` call located in ``msg/Pipe.cc``. The actual
+signature process is to encrypt the CRCs for the message using the shared key. Thus, we must
+defer signing until all CRCs have been computed. The header CRC is computed last, so we
+call ``sign\_message()`` as soon as we've calculated that CRC.
+
+``sign\_message()`` is a virtual function defined in ``auth/AuthSessionHandler.h``. Thus,
+a specific version of it must be written for each authentication protocol supported. Currently,
+only UNKNOWN, NONE and CEPHX are supported. So there is a separate version of ``sign\_message()`` in
+``auth/unknown/AuthUnknownSessionHandler.h``, ``auth/none/AuthNoneSessionHandler.h`` and
+``auth/cephx/CephxSessionHandler.cc``. The UNKNOWN and NONE versions simply return 0, indicating
+success.
+
+The CEPHX version is more extensive. It is found in ``auth/cephx/CephxSessionHandler.cc``.
+The first thing done is to determine if the run time option to handle signatures (see above) is on.
+If not, the Cephx version of ``sign\_message()`` simply returns success without actually calculating
+a signature or inserting it into the message.
+
+If the run time option is enabled, ``sign\_message()`` copies all of the message's CRCs (one from the
+header and three from the footer) into a buffer. It calls ``encode\_encrypt()`` on the buffer,
+using the key obtained from the pipe's ``session\_security`` structure. 64 bits of the encrypted
+result are put into the message footer's signature field and a footer flag is set to indicate that
+the message was signed. (This flag is a sanity check. It is not regarded as definitive
+evidence that the message was signed. The presence of a ``session\_security`` structure at the
+receiving end requires a signature regardless of the value of this flag.) If this all goes well,
+``sign\_message()`` returns 0. If there is a problem anywhere along the line and no signature
+was computed, it returns ``SESSION\_SIGNATURE\_FAILURE``.
+
+Checking Signatures
+-------------------
+
+The signature is checked by a routine called ``check\_message\_signature()``. This is also a
+virtual function, defined in ``auth/AuthSessionHandler.h``. So again there are specific versions
+for supported authentication protocols, such as UNKNOWN, NONE and CEPHX. Again, the UNKNOWN and
+NONE versions are stored in ``auth/unknown/AuthUnknownSessionHandler.h`` and
+``auth/none/AuthNoneSessionHandler.h``, respectively, and again they simply return 0, indicating
+success.
+
+The CEPHX version of ``check\_message\_signature()`` performs a real signature check. This routine
+(stored in ``auth/cephx/CephxSessionHandler.cc``) exits with success if the run time option has
+disabled signatures. Otherwise, it takes the CRCs from the header and footer, encrypts the result,
+and compares it to the signature stored in the footer. Since an earlier routine has checked that
+the CRCs actually match the contents of the message, it is unnecessary to recompute the CRCs
+on the raw data in the message. The encryption is performed with the same ``encode\_encrypt()``
+routine used on the sending end, using the key stored in the local ``session\_security``
+data structure.
+
+If everything checks out, the CEPHX routine returns 0, indicating success. If there is a
+problem, the routine returns ``SESSION\_SIGNATURE\_FAILURE``.
+
+Adding New Session Authentication Methods
+-----------------------------------------
+
+For the purpose of session authentication only (not the basic authentication of client and
+server currently performed by the Cephx protocol), in addition to adding a new protocol, that
+protocol must have a ``sign\_message()`` routine and a ``check\_message\_signature`` routine.
+These routines will take a message pointer as a parameter and return 0 on success. The procedure
+used to sign and check will be specific to the new method, but probably there will be a
+``session\_security`` structure attached to the pipe that contains a cryptographic key. This
+structure will be either an ``AuthSessionHandler`` (found in ``auth/AuthSessionHandler.h``)
+or a structure derived from that type.
+
+Adding Encryption to Sessions
+-----------------------------
+
+The existing code is partially, but not fully, set up to allow sessions to have their packets
+encrypted. Part of adding encryption would be similar to adding a new authentication method.
+But one would also need to add calls to the encryption and decryption routines in ``write\_message()``
+and ``read\_message()``. These calls would probably go near where the current calls for
+authentication are made. You should consider whether you want to replace the existing calls
+with something more general that does whatever the chosen form of session security requires,
+rather than explicitly saying ``sign`` or ``encrypt``.
+
+Session Security Statistics
+---------------------------
+
+The existing Cephx authentication code keeps statistics on how many messages were signed, how
+many message signature were checked, and how many checks succeeded and failed. It is prepared
+to keep similar statistics on encryption and decryption. These statistics can be accessed through
+the call ``printAuthSessionHandlerStats`` in ``auth/AuthSessionHandler.cc``.
+
+If new authentication or encryption methods are added, they should include code that keeps these
+statistics.
diff --git a/doc/dev/testing.rst b/doc/dev/testing.rst
new file mode 100644
index 000000000..1d99848a7
--- /dev/null
+++ b/doc/dev/testing.rst
@@ -0,0 +1,40 @@
+Testing notes
+=============
+
+
+build-integration-branch
+------------------------
+
+Setup
+^^^^^
+
+#. Create a github token at `<https://github.com/settings/tokens>`_
+ and put it in ``~/.github_token``. Note that only the
+ ``public_repo`` under the ``repo`` section needs to be checked.
+
+#. Create a ceph repo label `wip-yourname-testing` if you don't
+ already have one at `<https://github.com/ceph/ceph/labels>`_.
+
+#. Create the ``ci`` remote::
+
+ git remote add ci git@github.com:ceph/ceph-ci
+
+Using
+^^^^^
+
+#. Tag some subset of `needs-qa` commits with your label (usually `wip-yourname-testing`).
+
+#. Create the integration branch::
+
+ git checkout master
+ git pull
+ ../src/script/build-integration-branch wip-yourname-testing
+
+#. Smoke test::
+
+ make && ctest -j12
+
+#. Push to ceph-ci::
+
+ git push ci $(git rev-parse --abbrev-ref HEAD)
+
diff --git a/doc/dev/versions.rst b/doc/dev/versions.rst
new file mode 100644
index 000000000..34ed74724
--- /dev/null
+++ b/doc/dev/versions.rst
@@ -0,0 +1,42 @@
+==================
+Public OSD Version
+==================
+
+We maintain two versions on disk: an eversion_t pg_log.head and a
+version_t info.user_version. Each object is tagged with both the pg
+version and user_version it was last modified with. The PG version is
+modified by manipulating OpContext::at_version and then persisting it
+to the pg log as transactions, and is incremented in all the places it
+used to be. The user_version is modified by manipulating the new
+OpContext::user_at_version and is also persisted via the pg log
+transactions.
+user_at_version is modified only in PrimaryLogPG::prepare_transaction
+when the op was a "user modify" (a non-watch write), and the durable
+user_version is updated according to the following rules:
+1) set user_at_version to the maximum of ctx->new_obs.oi.user_version+1
+and info.last_user_version+1.
+2) set user_at_version to the maximum of itself and
+ctx->at_version.version.
+3) ctx->new_obs.oi.user_version = ctx->user_at_version (to change the
+object's user_version)
+
+This set of update semantics mean that for traditional pools the
+user_version will be equal to the past reassert_version, while for
+caching pools the object and PG user-version will be able to cross
+pools without making a total mess of things.
+In order to support old clients, we keep the old reassert_version but
+rename it to "bad_replay_version"; we fill it in as before: for writes
+it is set to the at_version (and is the proper replay version); for
+watches it is set to our user version; for ENOENT replies it is set to
+the replay version's epoch but the user_version's version. We also now
+fill in the version_t portion of the bad_replay_version on read ops as
+well as write ops, which should be fine for all old clients.
+
+For new clients, we prevent them from reading bad_replay_version and
+add two proper members: user_version and replay_version; user_version
+is filled in on every operation (reads included) while replay_version
+is filled in for writes.
+
+The objclass function get_current_version() now always returns the
+pg->info.last_user_version, which means it is guaranteed to contain
+the version of the last user update in the PG (including on reads!).
diff --git a/doc/dev/vstart-ganesha.rst b/doc/dev/vstart-ganesha.rst
new file mode 100644
index 000000000..4e77deb8b
--- /dev/null
+++ b/doc/dev/vstart-ganesha.rst
@@ -0,0 +1,45 @@
+==============================
+NFS CephFS-RGW Developer Guide
+==============================
+
+CephFS exports are supported since Octopus and RGW exports are supported since
+Quincy.
+
+Configuring NFS Ganesha to export CephFS with vstart
+====================================================
+
+1) Using ``cephadm``
+
+ .. code:: bash
+
+ $ MDS=1 MON=1 OSD=3 NFS=1 ../src/vstart.sh -n -d --cephadm
+
+ This will deploy a single NFS Ganesha daemon using ``vstart.sh``, where the
+ daemon will listen on the default NFS Ganesha port. Also cephfs export is
+ created.
+
+2) Using test orchestrator
+
+ .. code:: bash
+
+ $ MDS=1 MON=1 OSD=3 NFS=1 ../src/vstart.sh -n -d
+
+ Environment variable ``NFS`` is the number of NFS Ganesha daemons to be
+ deployed, each listening on a random port.
+
+ .. note:: NFS Ganesha packages must be pre-installed for this to work.
+
+Configuring NFS Ganesha to export RGW with vstart
+=================================================
+
+1) Using ``cephadm``
+
+ .. code:: bash
+
+ $ MON=1 OSD=3 RGW=1 NFS=1 ../src/vstart.sh -n -d --cephadm
+
+ This will deploy a single NFS Ganesha daemon using ``vstart.sh``, where the
+ daemon will listen on the default NFS Ganesha port. Also rgw export is
+ created.
+
+ .. note:: boto python module must be pre-installed for this to work.
diff --git a/doc/dev/wireshark.rst b/doc/dev/wireshark.rst
new file mode 100644
index 000000000..e03b3621c
--- /dev/null
+++ b/doc/dev/wireshark.rst
@@ -0,0 +1,41 @@
+=====================
+ Wireshark Dissector
+=====================
+
+Wireshark has support for the Ceph protocol and it will be shipped in the 1.12.1
+release.
+
+Using
+=====
+
+To use the Wireshark dissector you must build it from `git`__, the process is
+outlined in great detail in the `Building and Installing`__ section of the
+`Wireshark Users Guide`__.
+
+__ `Wireshark git`_
+__ WSUG_BI_
+__ WSUG_
+
+Developing
+==========
+
+The Ceph dissector lives in `Wireshark git`_ at
+``epan/dissectors/packet-ceph.c``. At the top of that file there are some
+comments explaining how to insert new functionality or to update the encoding
+of existing types.
+
+Before you start hacking on Wireshark code you should look at the
+``doc/README.developer`` and ``doc/README.dissector`` documents as they explain
+the basics of writing dissectors. After reading those two documents you should
+be prepared to work on the Ceph dissector. `The Wireshark
+developers guide`__ also contains a lot of useful information but it is less
+directed and is more useful as a reference then an introduction.
+
+__ WSDG_
+
+.. _WSUG: https://www.wireshark.org/docs/wsug_html_chunked/
+.. _WSDG: https://www.wireshark.org/docs/wsdg_html_chunked/
+.. _WSUG_BI: https://www.wireshark.org/docs/wsug_html_chunked/ChapterBuildInstall.html
+.. _Wireshark git: https://www.wireshark.org/develop.html
+
+.. vi: textwidth=80 noexpandtab
diff --git a/doc/dev/zoned-storage.rst b/doc/dev/zoned-storage.rst
new file mode 100644
index 000000000..27f592b4c
--- /dev/null
+++ b/doc/dev/zoned-storage.rst
@@ -0,0 +1,134 @@
+=======================
+ Zoned Storage Support
+=======================
+
+http://zonedstorage.io
+
+Zoned Storage is a class of storage devices that enables host and storage
+devices to cooperate to achieve higher storage capacities, increased throughput,
+and lower latencies. The zoned storage interface is available through the SCSI
+Zoned Block Commands (ZBC) and Zoned Device ATA Command Set (ZAC) standards on
+Shingled Magnetic Recording (SMR) hard disks today and is also being adopted for
+NVMe Solid State Disks with the upcoming NVMe Zoned Namespaces (ZNS) standard.
+
+This project aims to enable Ceph to work on zoned storage drives and at the same
+time explore research problems related to adopting this new interface. The
+first target is to enable non-ovewrite workloads (e.g. RGW) on host-managed SMR
+(HM-SMR) drives and explore cleaning (garbage collection) policies. HM-SMR
+drives are high capacity hard drives with the ZBC/ZAC interface. The longer
+term goal is to support ZNS SSDs, as they become available, as well as overwrite
+workloads.
+
+The first patch in these series enabled writing data to HM-SMR drives. This
+patch introduces ZonedFreelistManger, a FreelistManager implementation that
+passes enough information to ZonedAllocator to correctly initialize state of
+zones by tracking the write pointer and the number of dead bytes per zone. We
+have to introduce a new FreelistManager implementation because with zoned
+devices a region of disk can be in three states (empty, used, and dead), whereas
+current BitmapFreelistManager tracks only two states (empty and used). It is
+not possible to accurately initialize the state of zones in ZonedAllocator by
+tracking only two states. The third planned patch will introduce a rudimentary
+cleaner to form a baseline for further research.
+
+Currently we can perform basic RADOS benchmarks on an OSD running on an HM-SMR
+drives, restart the OSD, and read the written data, and write new data, as can
+be seen below.
+
+Please contact Abutalib Aghayev <agayev@psu.edu> for questions.
+
+::
+
+ $ sudo zbd report -i -n /dev/sdc
+ Device /dev/sdc:
+ Vendor ID: ATA HGST HSH721414AL T240
+ Zone model: host-managed
+ Capacity: 14000.520 GB (27344764928 512-bytes sectors)
+ Logical blocks: 3418095616 blocks of 4096 B
+ Physical blocks: 3418095616 blocks of 4096 B
+ Zones: 52156 zones of 256.0 MB
+ Maximum number of open zones: no limit
+ Maximum number of active zones: no limit
+ 52156 / 52156 zones
+ $ MON=1 OSD=1 MDS=0 sudo ../src/vstart.sh --new --localhost --bluestore --bluestore-devs /dev/sdc --bluestore-zoned
+ <snipped verbose output>
+ $ sudo ./bin/ceph osd pool create bench 32 32
+ pool 'bench' created
+ $ sudo ./bin/rados bench -p bench 10 write --no-cleanup
+ hints = 1
+ Maintaining 16 concurrent writes of 4194304 bytes to objects of size 4194304 for up to 10 seconds or 0 objects
+ Object prefix: benchmark_data_h0.cc.journaling712.narwhal.p_29846
+ sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
+ 0 0 0 0 0 0 - 0
+ 1 16 45 29 115.943 116 0.384175 0.407806
+ 2 16 86 70 139.949 164 0.259845 0.391488
+ 3 16 125 109 145.286 156 0.31727 0.404727
+ 4 16 162 146 145.953 148 0.826671 0.409003
+ 5 16 203 187 149.553 164 0.44815 0.404303
+ 6 16 242 226 150.621 156 0.227488 0.409872
+ 7 16 281 265 151.384 156 0.411896 0.408686
+ 8 16 320 304 151.956 156 0.435135 0.411473
+ 9 16 359 343 152.401 156 0.463699 0.408658
+ 10 15 396 381 152.356 152 0.409554 0.410851
+ Total time run: 10.3305
+ Total writes made: 396
+ Write size: 4194304
+ Object size: 4194304
+ Bandwidth (MB/sec): 153.333
+ Stddev Bandwidth: 13.6561
+ Max bandwidth (MB/sec): 164
+ Min bandwidth (MB/sec): 116
+ Average IOPS: 38
+ Stddev IOPS: 3.41402
+ Max IOPS: 41
+ Min IOPS: 29
+ Average Latency(s): 0.411226
+ Stddev Latency(s): 0.180238
+ Max latency(s): 1.00844
+ Min latency(s): 0.108616
+ $ sudo ../src/stop.sh
+ $ # Notice the lack of "--new" parameter to vstart.sh
+ $ MON=1 OSD=1 MDS=0 sudo ../src/vstart.sh --localhost --bluestore --bluestore-devs /dev/sdc --bluestore-zoned
+ <snipped verbose output>
+ $ sudo ./bin/rados bench -p bench 10 rand
+ hints = 1
+ sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
+ 0 0 0 0 0 0 - 0
+ 1 16 61 45 179.903 180 0.117329 0.244067
+ 2 16 116 100 199.918 220 0.144162 0.292305
+ 3 16 174 158 210.589 232 0.170941 0.285481
+ 4 16 251 235 234.918 308 0.241175 0.256543
+ 5 16 316 300 239.914 260 0.206044 0.255882
+ 6 15 392 377 251.206 308 0.137972 0.247426
+ 7 15 458 443 252.984 264 0.0800146 0.245138
+ 8 16 529 513 256.346 280 0.103529 0.239888
+ 9 16 587 571 253.634 232 0.145535 0.2453
+ 10 15 646 631 252.254 240 0.837727 0.246019
+ Total time run: 10.272
+ Total reads made: 646
+ Read size: 4194304
+ Object size: 4194304
+ Bandwidth (MB/sec): 251.558
+ Average IOPS: 62
+ Stddev IOPS: 10.005
+ Max IOPS: 77
+ Min IOPS: 45
+ Average Latency(s): 0.249385
+ Max latency(s): 0.888654
+ Min latency(s): 0.0103208
+ $ sudo ./bin/rados bench -p bench 10 write --no-cleanup
+ hints = 1
+ Maintaining 16 concurrent writes of 4194304 bytes to objects of size 4194304 for up to 10 seconds or 0 objects
+ Object prefix: benchmark_data_h0.aa.journaling712.narwhal.p_64416
+ sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s)
+ 0 0 0 0 0 0 - 0
+ 1 16 46 30 119.949 120 0.52627 0.396166
+ 2 16 82 66 131.955 144 0.48087 0.427311
+ 3 16 123 107 142.627 164 0.3287 0.420614
+ 4 16 158 142 141.964 140 0.405177 0.425993
+ 5 16 192 176 140.766 136 0.514565 0.425175
+ 6 16 224 208 138.635 128 0.69184 0.436672
+ 7 16 261 245 139.967 148 0.459929 0.439502
+ 8 16 301 285 142.468 160 0.250846 0.434799
+ 9 16 336 320 142.189 140 0.621686 0.435457
+ 10 16 374 358 143.166 152 0.460593 0.436384
+