Diffstat (limited to '')
-rw-r--r-- | src/common/options/osd.yaml.in | 1415 |
1 file changed, 1415 insertions, 0 deletions
diff --git a/src/common/options/osd.yaml.in b/src/common/options/osd.yaml.in
new file mode 100644
index 000000000..7291ce11d
--- /dev/null
+++ b/src/common/options/osd.yaml.in
@@ -0,0 +1,1415 @@
+# -*- mode: YAML -*-
+---
+
+options:
+- name: osd_numa_prefer_iface
+  type: bool
+  level: advanced
+  desc: prefer IP on network interface on same numa node as storage
+  default: true
+  see_also:
+  - osd_numa_auto_affinity
+  flags:
+  - startup
+- name: osd_numa_auto_affinity
+  type: bool
+  level: advanced
+  desc: automatically set affinity to numa node when storage and network match
+  default: true
+  flags:
+  - startup
+- name: osd_numa_node
+  type: int
+  level: advanced
+  desc: set affinity to a numa node (-1 for none)
+  default: -1
+  see_also:
+  - osd_numa_auto_affinity
+  flags:
+  - startup
+- name: set_keepcaps
+  type: bool
+  level: advanced
+  desc: set the keepcaps flag before changing UID, preserving the permitted capability set
+  long_desc: When ceph switches from root to the ceph uid, all capabilities in all sets are erased. If
+    a component that is capability aware needs a specific capability, the keepcaps flag maintains
+    the permitted capability set, allowing the capabilities in the effective set to be activated as needed.
+  default: false
+  flags:
+  - startup
+- name: osd_smart_report_timeout
+  type: uint
+  level: advanced
+  desc: Timeout (in seconds) for smartctl to run, default is set to 5
+  default: 5
+# verify backend can support configured max object name length
+- name: osd_check_max_object_name_len_on_startup
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
+- name: osd_max_backfills
+  type: uint
+  level: advanced
+  desc: Maximum number of concurrent local and remote backfills or recoveries per
+    OSD
+  long_desc: There can be osd_max_backfills local reservations AND the same remote
+    reservations per OSD. So a value of 1 lets this OSD participate as 1 PG primary
+    in recovery and 1 shard of another recovering PG.
+  fmt_desc: The maximum number of backfills allowed to or from a single OSD.
+    Note that this is applied separately for read and write operations.
+  default: 1
+  flags:
+  - runtime
+  with_legacy: true
+# Minimum recovery priority (255 = max, smaller = lower)
+- name: osd_min_recovery_priority
+  type: int
+  level: advanced
+  desc: Minimum priority below which recovery is not performed
+  long_desc: The purpose here is to prevent the cluster from doing *any* lower priority
+    work (e.g., rebalancing) below this threshold and focus solely on higher priority
+    work (e.g., replicating degraded objects).
+  default: 0
+  with_legacy: true
+- name: osd_backfill_retry_interval
+  type: float
+  level: advanced
+  desc: how frequently to retry backfill reservations after being denied (e.g., due
+    to a full OSD)
+  fmt_desc: The number of seconds to wait before retrying backfill requests.
+  default: 30
+  with_legacy: true
+- name: osd_recovery_retry_interval
+  type: float
+  level: advanced
+  desc: how frequently to retry recovery reservations after being denied (e.g., due
+    to a full OSD)
+  default: 30
+  with_legacy: true
+- name: osd_recovery_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op. This setting
+    overrides _ssd, _hdd, and _hybrid if non-zero.
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op.
+    Increasing this value will slow down recovery operation while
+    client operations will be less impacted.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op for HDDs
+  fmt_desc: Time in seconds to sleep before next recovery or backfill op
+    for HDDs.
+  default: 0.1
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op for SSDs
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
+    for SSDs.
+  default: 0
+  see_also:
+  - osd_recovery_sleep
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_recovery_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next recovery or backfill op when data is
+    on HDD and journal is on SSD
+  fmt_desc: Time in seconds to sleep before the next recovery or backfill op
+    when OSD data is on HDD and OSD journal / WAL+DB is on SSD.
+  default: 0.025
+  see_also:
+  - osd_recovery_sleep
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim. This setting overrides _ssd,
+    _hdd, and _hybrid if non-zero.
+  fmt_desc: Time in seconds to sleep before next snap trim op.
+    Increasing this value will slow down snap trimming.
+    This option overrides backend specific variants.
+  default: 0
+  flags:
+  - runtime
+  with_legacy: true
+- name: osd_snap_trim_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim for HDDs
+  default: 5
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim for SSDs
+  fmt_desc: Time in seconds to sleep before next snap trim op
+    for SSD OSDs (including NVMe).
+  default: 0
+  flags:
+  - runtime
+- name: osd_snap_trim_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next snap trim when data is on HDD and journal
+    is on SSD
+  fmt_desc: Time in seconds to sleep before next snap trim op
+    when OSD data is on an HDD and the OSD journal or WAL+DB is on an SSD.
+  default: 2
+  flags:
+  - runtime
+- name: osd_scrub_invalid_stats
+  type: bool
+  level: advanced
+  default: true
+  with_legacy: true
+- name: osd_max_scrubs
+  type: int
+  level: advanced
+  desc: Maximum concurrent scrubs on a single OSD
+  fmt_desc: The maximum number of simultaneous scrub operations for
+    a Ceph OSD Daemon.
+  default: 1
+  with_legacy: true
+- name: osd_scrub_during_recovery
+  type: bool
+  level: advanced
+  desc: Allow scrubbing when PGs on the OSD are undergoing recovery
+  fmt_desc: Allow scrub during recovery. Setting this to ``false`` will disable
+    scheduling new scrub (and deep-scrub) while there is active recovery.
+    Already running scrubs will be continued. This might be useful to reduce
+    load on busy clusters.
+  default: false
+  with_legacy: true
+- name: osd_repair_during_recovery
+  type: bool
+  level: advanced
+  desc: Allow requested repairing when PGs on the OSD are undergoing recovery
+  default: false
+  with_legacy: true
+- name: osd_scrub_begin_hour
+  type: int
+  level: advanced
+  desc: Restrict scrubbing to this hour of the day or later
+  long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day.
+  fmt_desc: This restricts scrubbing to this hour of the day or later.
+    Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0``
+    to allow scrubbing the entire day.
Along with ``osd_scrub_end_hour``, they define a time + window, in which the scrubs can happen. + But a scrub will be performed + no matter whether the time window allows or not, as long as the placement + group's scrub interval exceeds ``osd_scrub_max_interval``. + default: 0 + see_also: + - osd_scrub_end_hour + min: 0 + max: 23 + with_legacy: true +- name: osd_scrub_end_hour + type: int + level: advanced + desc: Restrict scrubbing to hours of the day earlier than this + long_desc: Use osd_scrub_begin_hour=0 and osd_scrub_end_hour=0 for the entire day. + fmt_desc: This restricts scrubbing to the hour earlier than this. + Use ``osd_scrub_begin_hour = 0`` and ``osd_scrub_end_hour = 0`` to allow scrubbing + for the entire day. Along with ``osd_scrub_begin_hour``, they define a time + window, in which the scrubs can happen. But a scrub will be performed + no matter whether the time window allows or not, as long as the placement + group's scrub interval exceeds ``osd_scrub_max_interval``. + default: 0 + see_also: + - osd_scrub_begin_hour + min: 0 + max: 23 + with_legacy: true +- name: osd_scrub_begin_week_day + type: int + level: advanced + desc: Restrict scrubbing to this day of the week or later + long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0 + for the entire week. + fmt_desc: This restricts scrubbing to this day of the week or later. + 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0`` + and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week. + Along with ``osd_scrub_end_week_day``, they define a time window in which + scrubs can happen. But a scrub will be performed + no matter whether the time window allows or not, when the PG's + scrub interval exceeds ``osd_scrub_max_interval``. + default: 0 + see_also: + - osd_scrub_end_week_day + min: 0 + max: 6 + with_legacy: true +- name: osd_scrub_end_week_day + type: int + level: advanced + desc: Restrict scrubbing to days of the week earlier than this + long_desc: 0 = Sunday, 1 = Monday, etc. Use osd_scrub_begin_week_day=0 osd_scrub_end_week_day=0 + for the entire week. + fmt_desc: This restricts scrubbing to days of the week earlier than this. + 0 = Sunday, 1 = Monday, etc. Use ``osd_scrub_begin_week_day = 0`` + and ``osd_scrub_end_week_day = 0`` to allow scrubbing for the entire week. + Along with ``osd_scrub_begin_week_day``, they define a time + window, in which the scrubs can happen. But a scrub will be performed + no matter whether the time window allows or not, as long as the placement + group's scrub interval exceeds ``osd_scrub_max_interval``. + default: 0 + see_also: + - osd_scrub_begin_week_day + min: 0 + max: 6 + with_legacy: true +- name: osd_scrub_load_threshold + type: float + level: advanced + desc: Allow scrubbing when system load divided by number of CPUs is below this value + fmt_desc: The normalized maximum load. Ceph will not scrub when the system load + (as defined by ``getloadavg() / number of online CPUs``) is higher than this number. + Default is ``0.5``. + default: 0.5 + with_legacy: true +# if load is low +- name: osd_scrub_min_interval + type: float + level: advanced + desc: Scrub each PG no more often than this interval + fmt_desc: The minimal interval in seconds for scrubbing the Ceph OSD Daemon + when the Ceph Storage Cluster load is low. 
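+# Illustrative usage (example values, not recommendations): the scrub time
+# window and interval options above are typically tuned through the config
+# database, assuming the standard `ceph config` CLI is available, e.g.:
+#   ceph config set osd osd_scrub_begin_hour 22
+#   ceph config set osd osd_scrub_end_hour 6
+#   ceph config set osd osd_scrub_min_interval 86400   # 1 day, in seconds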
+  default: 1_day
+  see_also:
+  - osd_scrub_max_interval
+  with_legacy: true
+# regardless of load
+- name: osd_scrub_max_interval
+  type: float
+  level: advanced
+  desc: Scrub each PG no less often than this interval
+  fmt_desc: The maximum interval in seconds for scrubbing the Ceph OSD Daemon
+    irrespective of cluster load.
+  default: 7_day
+  see_also:
+  - osd_scrub_min_interval
+  with_legacy: true
+# randomize the scheduled scrub in the span of [min,min*(1+randomize_ratio))
+- name: osd_scrub_interval_randomize_ratio
+  type: float
+  level: advanced
+  desc: Ratio of scrub interval to randomly vary
+  long_desc: This prevents a scrub 'stampede' by randomly varying the scrub intervals
+    so that they are soon uniformly distributed over the week
+  fmt_desc: Add a random delay to ``osd_scrub_min_interval`` when scheduling
+    the next scrub job for a PG. The delay is a random
+    value less than ``osd_scrub_min_interval`` \*
+    ``osd_scrub_interval_randomize_ratio``. The default setting
+    spreads scrubs throughout the allowed time
+    window of ``[1, 1.5]`` \* ``osd_scrub_min_interval``.
+  default: 0.5
+  see_also:
+  - osd_scrub_min_interval
+  with_legacy: true
+# the probability to back off the scheduled scrub
+- name: osd_scrub_backoff_ratio
+  type: float
+  level: dev
+  desc: Backoff ratio for scheduling scrubs
+  long_desc: This is the percentage of ticks that do NOT schedule scrubs, 66% means
+    that 1 out of 3 ticks will schedule scrubs
+  default: 0.66
+  with_legacy: true
+- name: osd_scrub_chunk_min
+  type: int
+  level: advanced
+  desc: Minimum number of objects to deep-scrub in a single chunk
+  fmt_desc: The minimal number of object store chunks to scrub during single operation.
+    Ceph blocks writes to single chunk during scrub.
+  default: 5
+  see_also:
+  - osd_scrub_chunk_max
+  with_legacy: true
+- name: osd_scrub_chunk_max
+  type: int
+  level: advanced
+  desc: Maximum number of objects to deep-scrub in a single chunk
+  fmt_desc: The maximum number of object store chunks to scrub during single operation.
+  default: 25
+  see_also:
+  - osd_scrub_chunk_min
+  with_legacy: true
+- name: osd_shallow_scrub_chunk_min
+  type: int
+  level: advanced
+  desc: Minimum number of objects to scrub in a single chunk
+  fmt_desc: The minimum number of object store chunks to scrub during single operation.
+    Not applicable to deep scrubs.
+    Ceph blocks writes to single chunk during scrub.
+  default: 50
+  see_also:
+  - osd_shallow_scrub_chunk_max
+  - osd_scrub_chunk_min
+  with_legacy: true
+- name: osd_shallow_scrub_chunk_max
+  type: int
+  level: advanced
+  desc: Maximum number of objects to scrub in a single chunk
+  fmt_desc: The maximum number of object store chunks to scrub during single operation.
+    Not applicable to deep scrubs.
+  default: 100
+  see_also:
+  - osd_shallow_scrub_chunk_min
+  - osd_scrub_chunk_max
+  with_legacy: true
+# sleep between [deep]scrub ops
+- name: osd_scrub_sleep
+  type: float
+  level: advanced
+  desc: Duration to inject a delay during scrubbing
+  fmt_desc: Time to sleep before scrubbing the next group of chunks. Increasing this value will slow
+    down the overall rate of scrubbing so that client operations will be less impacted.
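+# Illustrative usage: if scrubbing visibly competes with client I/O, the sleep
+# and chunk sizes above can be adjusted at runtime, assuming the standard
+# `ceph config` CLI, e.g.:
+#   ceph config set osd osd_scrub_sleep 0.1
+#   ceph config set osd osd_scrub_chunk_max 15
+# Example values only; `ceph config rm osd osd_scrub_sleep` reverts to the default.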
+ default: 0 + flags: + - runtime + with_legacy: true +# more sleep between [deep]scrub ops +- name: osd_scrub_extended_sleep + type: float + level: advanced + desc: Duration to inject a delay during scrubbing out of scrubbing hours + default: 0 + see_also: + - osd_scrub_begin_hour + - osd_scrub_end_hour + - osd_scrub_begin_week_day + - osd_scrub_end_week_day + with_legacy: true +# whether auto-repair inconsistencies upon deep-scrubbing +- name: osd_scrub_auto_repair + type: bool + level: advanced + desc: Automatically repair damaged objects detected during scrub + fmt_desc: Setting this to ``true`` will enable automatic PG repair when errors + are found by scrubs or deep-scrubs. However, if more than + ``osd_scrub_auto_repair_num_errors`` errors are found a repair is NOT performed. + default: false + with_legacy: true +# only auto-repair when number of errors is below this threshold +- name: osd_scrub_auto_repair_num_errors + type: uint + level: advanced + desc: Maximum number of detected errors to automatically repair + fmt_desc: Auto repair will not occur if more than this many errors are found. + default: 5 + see_also: + - osd_scrub_auto_repair + with_legacy: true +- name: osd_scrub_max_preemptions + type: uint + level: advanced + desc: Set the maximum number of times we will preempt a deep scrub due to a client + operation before blocking client IO to complete the scrub + default: 5 + min: 0 + max: 30 +- name: osd_deep_scrub_interval + type: float + level: advanced + desc: Deep scrub each PG (i.e., verify data checksums) at least this often + fmt_desc: The interval for "deep" scrubbing (fully reading all data). The + ``osd_scrub_load_threshold`` does not affect this setting. + default: 7_day + with_legacy: true +- name: osd_deep_scrub_randomize_ratio + type: float + level: advanced + desc: Scrubs will randomly become deep scrubs at this rate (0.15 -> 15% of scrubs + are deep) + long_desc: This prevents a deep scrub 'stampede' by spreading deep scrubs so they + are uniformly distributed over the week + default: 0.15 + with_legacy: true +- name: osd_deep_scrub_stride + type: size + level: advanced + desc: Number of bytes to read from an object at a time during deep scrub + fmt_desc: Read size when doing a deep scrub. 
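+# Illustrative usage: deep-scrub cadence and automatic repair are often tuned
+# together, assuming the standard `ceph config` CLI, e.g.:
+#   ceph config set osd osd_deep_scrub_interval 1209600   # 14 days (example)
+#   ceph config set osd osd_scrub_auto_repair true
+# Example values only; osd_scrub_auto_repair_num_errors still caps automatic repairs.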
+ default: 512_K + with_legacy: true +- name: osd_deep_scrub_keys + type: int + level: advanced + desc: Number of keys to read from an object at a time during deep scrub + default: 1024 + with_legacy: true +# objects must be this old (seconds) before we update the whole-object digest on scrub +- name: osd_deep_scrub_update_digest_min_age + type: int + level: advanced + desc: Update overall object digest only if object was last modified longer ago than + this + default: 2_hr + with_legacy: true +- name: osd_deep_scrub_large_omap_object_key_threshold + type: uint + level: advanced + desc: Warn when we encounter an object with more omap keys than this + default: 200000 + services: + - osd + - mds + see_also: + - osd_deep_scrub_large_omap_object_value_sum_threshold + with_legacy: true +- name: osd_deep_scrub_large_omap_object_value_sum_threshold + type: size + level: advanced + desc: Warn when we encounter an object with more omap key bytes than this + default: 1_G + services: + - osd + see_also: + - osd_deep_scrub_large_omap_object_key_threshold + with_legacy: true +# when scrubbing blocks on a locked object +- name: osd_blocked_scrub_grace_period + type: int + level: advanced + desc: Time (seconds) before issuing a cluster-log warning + long_desc: Waiting too long for an object in the scrubbed chunk to be unlocked. + default: 120 + with_legacy: true +# timely updates to the 'pg dump' output, esp. re scrub scheduling +- name: osd_stats_update_period_scrubbing + type: int + level: advanced + desc: Stats update period (seconds) when scrubbing + long_desc: A PG actively scrubbing (or blocked while scrubbing) publishes its + stats (inc. scrub/block duration) every this many seconds. + default: 15 + with_legacy: false +- name: osd_stats_update_period_not_scrubbing + type: int + level: advanced + desc: Stats update period (seconds) when not scrubbing + long_desc: A PG we are a primary of, publishes its + stats (inc. scrub/block duration) every this many seconds. + default: 120 + with_legacy: false +# when replicas are slow to respond to scrub resource reservations +# Note: disable by using a very large value +- name: osd_scrub_slow_reservation_response + type: millisecs + level: advanced + desc: Duration before issuing a cluster-log warning + long_desc: Waiting too long for a replica to respond (after at least half of the + replicas have responded). + default: 2200 + min: 500 + see_also: + - osd_scrub_reservation_timeout + with_legacy: false +# when a replica does not respond to scrub resource request +# Note: disable by using a very large value +- name: osd_scrub_reservation_timeout + type: millisecs + level: advanced + desc: Duration before aborting the scrub session + long_desc: Waiting too long for some replicas to respond to + scrub reservation requests. + default: 5000 + min: 2000 + see_also: + - osd_scrub_slow_reservation_response + with_legacy: false +# where rados plugins are stored +- name: osd_class_dir + type: str + level: advanced + default: @CMAKE_INSTALL_LIBDIR@/rados-classes + fmt_desc: The class path for RADOS class plug-ins. 
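+# Illustrative usage: the class allow-lists below accept either an explicit list
+# or '*' (allow all), as noted in their comments; assuming the standard
+# `ceph config` CLI, e.g.:
+#   ceph config set osd osd_class_load_list '*'
+# Example only, not a recommendation.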
+ with_legacy: true +- name: osd_open_classes_on_start + type: bool + level: advanced + default: true + with_legacy: true +# list of object classes allowed to be loaded (allow all: *) +- name: osd_class_load_list + type: str + level: advanced + default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex + user version cas cmpomap queue 2pc_queue fifo + with_legacy: true +# list of object classes with default execute perm (allow all: *) +- name: osd_class_default_list + type: str + level: advanced + default: cephfs hello journal lock log numops otp rbd refcount rgw rgw_gc timeindex + user version cas cmpomap queue 2pc_queue fifo + with_legacy: true +- name: osd_agent_max_ops + type: int + level: advanced + desc: maximum concurrent tiering operations for tiering agent + fmt_desc: The maximum number of simultaneous flushing ops per tiering agent + in the high speed mode. + default: 4 + with_legacy: true +- name: osd_agent_max_low_ops + type: int + level: advanced + desc: maximum concurrent low-priority tiering operations for tiering agent + fmt_desc: The maximum number of simultaneous flushing ops per tiering agent + in the low speed mode. + default: 2 + with_legacy: true +- name: osd_agent_min_evict_effort + type: float + level: advanced + desc: minimum effort to expend evicting clean objects + default: 0.1 + min: 0 + max: 0.99 + with_legacy: true +- name: osd_agent_quantize_effort + type: float + level: advanced + desc: size of quantize unit for eviction effort + default: 0.1 + with_legacy: true +- name: osd_agent_delay_time + type: float + level: advanced + desc: how long agent should sleep if it has no work to do + default: 5 + with_legacy: true +# decay atime and hist histograms after how many objects go by +- name: osd_agent_hist_halflife + type: int + level: advanced + desc: halflife of agent atime and temp histograms + default: 1000 + with_legacy: true +# decay atime and hist histograms after how many objects go by +- name: osd_agent_slop + type: float + level: advanced + desc: slop factor to avoid switching tiering flush and eviction mode + default: 0.02 + with_legacy: true +- name: osd_find_best_info_ignore_history_les + type: bool + level: dev + desc: ignore last_epoch_started value when peering AND PROBABLY LOSE DATA + long_desc: THIS IS AN EXTREMELY DANGEROUS OPTION THAT SHOULD ONLY BE USED AT THE + DIRECTION OF A DEVELOPER. It makes peering ignore the last_epoch_started value + when peering, which can allow the OSD to believe an OSD has an authoritative view + of a PG's contents even when it is in fact old and stale, typically leading to + data loss (by believing a stale PG is up to date). + default: false + with_legacy: true +- name: osd_uuid + type: uuid + level: advanced + desc: uuid label for a new OSD + fmt_desc: The universally unique identifier (UUID) for the Ceph OSD Daemon. + note: The ``osd_uuid`` applies to a single Ceph OSD Daemon. The ``fsid`` + applies to the entire cluster. + flags: + - create + with_legacy: true +- name: osd_data + type: str + level: advanced + desc: path to OSD data + fmt_desc: The path to the OSDs data. You must create the directory when + deploying Ceph. You should mount a drive for OSD data at this + mount point. We do not recommend changing the default. + default: /var/lib/ceph/osd/$cluster-$id + flags: + - no_mon_update + with_legacy: true +- name: osd_journal + type: str + level: advanced + desc: path to OSD journal (when FileStore backend is in use) + fmt_desc: The path to the OSD's journal. 
This may be a path to a file or a + block device (such as a partition of an SSD). If it is a file, + you must create the directory to contain it. We recommend using a + separate fast device when the ``osd_data`` drive is an HDD. + default: /var/lib/ceph/osd/$cluster-$id/journal + flags: + - no_mon_update + with_legacy: true +- name: osd_journal_size + type: size + level: advanced + desc: size of FileStore journal (in MiB) + fmt_desc: The size of the journal in megabytes. + default: 5_K + flags: + - create + with_legacy: true +- name: osd_journal_flush_on_shutdown + type: bool + level: advanced + desc: flush FileStore journal contents during clean OSD shutdown + default: true + with_legacy: true +- name: osd_compact_on_start + type: bool + level: advanced + desc: compact OSD's object store's OMAP on start + default: false +# flags for specific control purpose during osd mount() process. +# e.g., can be 1 to skip over replaying journal +# or 2 to skip over mounting omap or 3 to skip over both. +# This might be helpful in case the journal is totally corrupted +# and we still want to bring the osd daemon back normally, etc. +- name: osd_os_flags + type: uint + level: dev + desc: flags to skip filestore omap or journal initialization + default: 0 +- name: osd_max_write_size + type: size + level: advanced + desc: Maximum size of a RADOS write operation in megabytes + long_desc: This setting prevents clients from doing very large writes to RADOS. If + you set this to a value below what clients expect, they will receive an error + when attempting to write to the cluster. + fmt_desc: The maximum size of a write in megabytes. + default: 90 + min: 4 + with_legacy: true +- name: osd_max_pgls + type: uint + level: advanced + desc: maximum number of results when listing objects in a pool + fmt_desc: The maximum number of placement groups to list. A client + requesting a large number can tie up the Ceph OSD Daemon. + default: 1_K + with_legacy: true +- name: osd_client_message_size_cap + type: size + level: advanced + desc: maximum memory to devote to in-flight client requests + long_desc: If this value is exceeded, the OSD will not read any new client data + off of the network until memory is freed. + fmt_desc: The largest client data message allowed in memory. + default: 500_M + with_legacy: true +- name: osd_client_message_cap + type: uint + level: advanced + desc: maximum number of in-flight client requests + default: 256 + with_legacy: true +- name: osd_crush_update_on_start + type: bool + level: advanced + desc: update OSD CRUSH location on startup + default: true + with_legacy: true +- name: osd_class_update_on_start + type: bool + level: advanced + desc: set OSD device class on startup + default: true + with_legacy: true +- name: osd_crush_initial_weight + type: float + level: advanced + desc: if >= 0, initial CRUSH weight for newly created OSDs + long_desc: If this value is negative, the size of the OSD in TiB is used. + fmt_desc: The initial CRUSH weight for newly added OSDs. The default + value of this option is ``the size of a newly added OSD in TB``. By default, + the initial CRUSH weight for a newly added OSD is set to its device size in + TB. See `Weighting Bucket Items`_ for details. 
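+# Illustrative usage: to bring new OSDs in with zero CRUSH weight (so data is
+# only migrated once they are reweighted manually), one might set, assuming the
+# standard `ceph config` CLI:
+#   ceph config set global osd_crush_initial_weight 0
+# Example only; the default of -1 keeps the size-derived weight described above.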
+  default: -1
+  with_legacy: true
+# Allows the "peered" state for recovery and backfill below min_size
+- name: osd_allow_recovery_below_min_size
+  type: bool
+  level: dev
+  desc: allow replicated pools to recover with < min_size active members
+  default: true
+  services:
+  - osd
+  with_legacy: true
+# cap on # of inc maps we send to peers, clients
+- name: osd_map_share_max_epochs
+  type: int
+  level: advanced
+  default: 40
+  with_legacy: true
+- name: osd_map_cache_size
+  type: int
+  level: advanced
+  default: 50
+  fmt_desc: The number of OSD maps to keep cached.
+  with_legacy: true
+- name: osd_pg_epoch_max_lag_factor
+  type: float
+  level: advanced
+  desc: Max multiple of the map cache that PGs can lag before we throttle map ingest
+  default: 2
+  see_also:
+  - osd_map_cache_size
+- name: osd_inject_bad_map_crc_probability
+  type: float
+  level: dev
+  default: 0
+  with_legacy: true
+- name: osd_inject_failure_on_pg_removal
+  type: bool
+  level: dev
+  default: false
+  with_legacy: true
+# shut down the OSD if status flips more than max_markdown_count times in recent max_markdown_period seconds
+- name: osd_max_markdown_period
+  type: int
+  level: advanced
+  default: 10_min
+  with_legacy: true
+- name: osd_max_markdown_count
+  type: int
+  level: advanced
+  default: 5
+  with_legacy: true
+- name: osd_op_thread_timeout
+  type: int
+  level: advanced
+  default: 15
+  fmt_desc: The Ceph OSD Daemon operation thread timeout in seconds.
+  with_legacy: true
+- name: osd_op_thread_suicide_timeout
+  type: int
+  level: advanced
+  default: 150
+  with_legacy: true
+- name: osd_op_pq_max_tokens_per_priority
+  type: uint
+  level: advanced
+  default: 4_M
+  with_legacy: true
+- name: osd_op_pq_min_cost
+  type: size
+  level: advanced
+  default: 64_K
+  with_legacy: true
+# preserve clone_overlap during recovery/migration
+- name: osd_recover_clone_overlap
+  type: bool
+  level: advanced
+  default: true
+  fmt_desc: Preserves clone overlap during recovery. Should always be set
+    to ``true``.
+  with_legacy: true
+- name: osd_num_cache_shards
+  type: size
+  level: advanced
+  desc: The number of cache shards to use in the object store.
+  default: 32
+  flags:
+  - startup
+- name: osd_aggregated_slow_ops_logging
+  type: bool
+  level: advanced
+  desc: Allow OSD daemon to send an aggregated slow ops to the cluster log
+  fmt_desc: If set to ``true``, the OSD daemon will send slow ops information in
+    an aggregated format to the cluster log; otherwise it sends every slow op to the
+    cluster log.
+  default: true
+  with_legacy: true
+- name: osd_op_num_threads_per_shard
+  type: int
+  level: advanced
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_threads_per_shard_hdd
+  type: int
+  level: advanced
+  default: 1
+  see_also:
+  - osd_op_num_threads_per_shard
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_threads_per_shard_ssd
+  type: int
+  level: advanced
+  default: 2
+  see_also:
+  - osd_op_num_threads_per_shard
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards
+  type: int
+  level: advanced
+  fmt_desc: The number of shards allocated for a given OSD. Each shard has its own processing queue.
+    PGs on the OSD are distributed evenly among the shards. This setting overrides _ssd and _hdd if
+    non-zero.
+  default: 0
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards_hdd
+  type: int
+  level: advanced
+  fmt_desc: the number of shards allocated for a given OSD (for rotational media).
+  default: 5
+  see_also:
+  - osd_op_num_shards
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_op_num_shards_ssd
+  type: int
+  level: advanced
+  fmt_desc: the number of shards allocated for a given OSD (for solid state media).
+  default: 8
+  see_also:
+  - osd_op_num_shards
+  flags:
+  - startup
+  with_legacy: true
+- name: osd_skip_data_digest
+  type: bool
+  level: dev
+  desc: Do not store full-object checksums if the backend (bluestore) does its own
+    checksums. Only usable with all BlueStore OSDs.
+  default: false
+# PrioritizedQueue (prio), Weighted Priority Queue (wpq),
+# mclock_opclass, mclock_client, or debug_random. "mclock_opclass"
+# and "mclock_client" are based on the mClock/dmClock algorithm
+# (Gulati, et al. 2010). "mclock_opclass" prioritizes based on the
+# class the operation belongs to. "mclock_client" does the same but
+# also works to enforce fairness between clients. "debug_random"
+# chooses among all four with equal probability.
+- name: osd_op_queue
+  type: str
+  level: advanced
+  desc: which operation priority queue algorithm to use
+  long_desc: which operation priority queue algorithm to use
+  fmt_desc: This sets the type of queue to be used for prioritizing ops
+    within each OSD. Both queues feature a strict sub-queue which is
+    dequeued before the normal queue. The normal queue is different
+    between implementations. The WeightedPriorityQueue (``wpq``)
+    dequeues operations in relation to their priorities to prevent
+    starvation of any queue. WPQ should help in cases where a few OSDs
+    are more overloaded than others. The mClockQueue
+    (``mclock_scheduler``) prioritizes operations based on which class
+    they belong to (recovery, scrub, snaptrim, client op, osd subop).
+    See `QoS Based on mClock`_. Requires a restart.
+  default: mclock_scheduler
+  see_also:
+  - osd_op_queue_cut_off
+  enum_values:
+  - wpq
+  - mclock_scheduler
+  - debug_random
+  with_legacy: true
+# Min priority to go to strict queue. (low, high)
+- name: osd_op_queue_cut_off
+  type: str
+  level: advanced
+  desc: the threshold between high priority ops and low priority ops
+  long_desc: the threshold between high priority ops that use strict priority ordering
+    and low priority ops that use a fairness algorithm that may or may not incorporate
+    priority
+  fmt_desc: This selects which priority ops will be sent to the strict
+    queue versus the normal queue. The ``low`` setting sends all
+    replication ops and higher to the strict queue, while the ``high``
+    option sends only replication acknowledgment ops and higher to
+    the strict queue. Setting this to ``high`` should help when a few
+    OSDs in the cluster are very busy especially when combined with
+    ``wpq`` in the ``osd_op_queue`` setting. OSDs that are very busy
+    handling replication traffic could starve primary client traffic
+    on these OSDs without these settings. Requires a restart.
+  default: high
+  see_also:
+  - osd_op_queue
+  enum_values:
+  - low
+  - high
+  - debug_random
+  with_legacy: true
+- name: osd_mclock_scheduler_client_res
+  type: float
+  level: advanced
+  desc: IO proportion reserved for each client (default). The default value
+    of 0 specifies the lowest possible reservation. Any value greater than
+    0 and up to 1.0 specifies the minimum IO proportion to reserve for each
+    client in terms of a fraction of the OSD's maximum IOPS capacity.
+  long_desc: Only considered for osd_op_queue = mclock_scheduler
+  fmt_desc: IO proportion reserved for each client (default).
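+# Illustrative usage: the mclock reservation/weight/limit options in this block
+# take effect only with osd_op_queue = mclock_scheduler, and are intended to be
+# hand-tuned only with the custom profile; assuming the standard `ceph config`
+# CLI, e.g.:
+#   ceph config set osd osd_mclock_profile custom
+#   ceph config set osd osd_mclock_scheduler_client_res 0.3
+# Example values only; the built-in profiles manage these parameters themselves.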
+ default: 0 + min: 0 + max: 1.0 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_client_wgt + type: uint + level: advanced + desc: IO share for each client (default) over reservation + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO share for each client (default) over reservation. + default: 1 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_client_lim + type: float + level: advanced + desc: IO limit for each client (default) over reservation. The default + value of 0 specifies no limit enforcement, which means each client can + use the maximum possible IOPS capacity of the OSD. Any value greater + than 0 and up to 1.0 specifies the upper IO limit over reservation + that each client receives in terms of a fraction of the OSD's + maximum IOPS capacity. + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO limit for each client (default) over reservation. + default: 0 + min: 0 + max: 1.0 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_background_recovery_res + type: float + level: advanced + desc: IO proportion reserved for background recovery (default). The + default value of 0 specifies the lowest possible reservation. Any value + greater than 0 and up to 1.0 specifies the minimum IO proportion to + reserve for background recovery operations in terms of a fraction of + the OSD's maximum IOPS capacity. + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO proportion reserved for background recovery (default). + default: 0 + min: 0 + max: 1.0 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_background_recovery_wgt + type: uint + level: advanced + desc: IO share for each background recovery over reservation + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO share for each background recovery over reservation. + default: 1 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_background_recovery_lim + type: float + level: advanced + desc: IO limit for background recovery over reservation. The default + value of 0 specifies no limit enforcement, which means background + recovery operation can use the maximum possible IOPS capacity of the + OSD. Any value greater than 0 and up to 1.0 specifies the upper IO + limit over reservation that background recovery operation receives in + terms of a fraction of the OSD's maximum IOPS capacity. + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO limit for background recovery over reservation. + default: 0 + min: 0 + max: 1.0 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_background_best_effort_res + type: float + level: advanced + desc: IO proportion reserved for background best_effort (default). The + default value of 0 specifies the lowest possible reservation. Any value + greater than 0 and up to 1.0 specifies the minimum IO proportion to + reserve for background best_effort operations in terms of a fraction + of the OSD's maximum IOPS capacity. + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO proportion reserved for background best_effort (default). + default: 0 + min: 0 + max: 1.0 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_background_best_effort_wgt + type: uint + level: advanced + desc: IO share for each background best_effort over reservation + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO share for each background best_effort over reservation. 
+ default: 1 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_background_best_effort_lim + type: float + level: advanced + desc: IO limit for background best_effort over reservation. The default + value of 0 specifies no limit enforcement, which means background + best_effort operation can use the maximum possible IOPS capacity of the + OSD. Any value greater than 0 and up to 1.0 specifies the upper IO + limit over reservation that background best_effort operation receives + in terms of a fraction of the OSD's maximum IOPS capacity. + long_desc: Only considered for osd_op_queue = mclock_scheduler + fmt_desc: IO limit for background best_effort over reservation. + default: 0 + min: 0 + max: 1.0 + see_also: + - osd_op_queue +- name: osd_mclock_scheduler_anticipation_timeout + type: float + level: advanced + desc: mclock anticipation timeout in seconds + long_desc: the amount of time that mclock waits until the unused resource is forfeited + default: 0 +- name: osd_mclock_max_sequential_bandwidth_hdd + type: size + level: basic + desc: The maximum sequential bandwidth in bytes/second of the OSD (for + rotational media) + long_desc: This option specifies the maximum sequential bandwidth to consider + for an OSD whose underlying device type is rotational media. This is + considered by the mclock scheduler to derive the cost factor to be used in + QoS calculations. Only considered for osd_op_queue = mclock_scheduler + fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the + OSD (for rotational media) + default: 150_M + flags: + - runtime +- name: osd_mclock_max_sequential_bandwidth_ssd + type: size + level: basic + desc: The maximum sequential bandwidth in bytes/second of the OSD (for + solid state media) + long_desc: This option specifies the maximum sequential bandwidth to consider + for an OSD whose underlying device type is solid state media. This is + considered by the mclock scheduler to derive the cost factor to be used in + QoS calculations. Only considered for osd_op_queue = mclock_scheduler + fmt_desc: The maximum sequential bandwidth in bytes/second to consider for the + OSD (for solid state media) + default: 1200_M + flags: + - runtime +- name: osd_mclock_max_capacity_iops_hdd + type: float + level: basic + desc: Max random write IOPS capacity (at 4KiB block size) to consider per OSD + (for rotational media) + long_desc: This option specifies the max OSD random write IOPS capacity per + OSD. Contributes in QoS calculations when enabling a dmclock profile. Only + considered for osd_op_queue = mclock_scheduler + fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per + OSD (for rotational media) + default: 315 + flags: + - runtime +- name: osd_mclock_max_capacity_iops_ssd + type: float + level: basic + desc: Max random write IOPS capacity (at 4 KiB block size) to consider per OSD + (for solid state media) + long_desc: This option specifies the max OSD random write IOPS capacity per + OSD. Contributes in QoS calculations when enabling a dmclock profile. 
Only + considered for osd_op_queue = mclock_scheduler + fmt_desc: Max random write IOPS capacity (at 4 KiB block size) to consider per + OSD (for solid state media) + default: 21500 + flags: + - runtime +- name: osd_mclock_force_run_benchmark_on_init + type: bool + level: advanced + desc: Force run the OSD benchmark on OSD initialization/boot-up + long_desc: This option specifies whether the OSD benchmark must be run during + the OSD boot-up sequence even if historical data about the OSD iops capacity + is available in the MON config store. Enable this to refresh the OSD iops + capacity if the underlying device's performance characteristics have changed + significantly. Only considered for osd_op_queue = mclock_scheduler. + fmt_desc: Force run the OSD benchmark on OSD initialization/boot-up + default: false + see_also: + - osd_mclock_max_capacity_iops_hdd + - osd_mclock_max_capacity_iops_ssd + flags: + - startup +- name: osd_mclock_skip_benchmark + type: bool + level: dev + desc: Skip the OSD benchmark on OSD initialization/boot-up + long_desc: This option specifies whether the OSD benchmark must be skipped during + the OSD boot-up sequence. Only considered for osd_op_queue = mclock_scheduler. + fmt_desc: Skip the OSD benchmark on OSD initialization/boot-up + default: false + see_also: + - osd_mclock_max_capacity_iops_hdd + - osd_mclock_max_capacity_iops_ssd + flags: + - runtime +- name: osd_mclock_profile + type: str + level: advanced + desc: Which mclock profile to use + long_desc: This option specifies the mclock profile to enable - one among the set + of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler + fmt_desc: | + This sets the type of mclock profile to use for providing QoS + based on operations belonging to different classes (background + recovery, scrub, snaptrim, client op, osd subop). Once a built-in + profile is enabled, the lower level mclock resource control + parameters [*reservation, weight, limit*] and some Ceph + configuration parameters are set transparently. Note that the + above does not apply for the *custom* profile. + default: balanced + see_also: + - osd_op_queue + enum_values: + - balanced + - high_recovery_ops + - high_client_ops + - custom + flags: + - runtime +- name: osd_mclock_override_recovery_settings + type: bool + level: advanced + desc: Setting this option enables the override of recovery/backfill limits + for the mClock scheduler. + long_desc: This option when set enables the override of the max recovery + active and the max backfills limits with mClock scheduler active. These + options are not modifiable when mClock scheduler is active. Any attempt + to modify these values without setting this option will reset the + recovery or backfill option back to its default value. + fmt_desc: Setting this option will enable the override of the + recovery/backfill limits for the mClock scheduler as defined by the + ``osd_recovery_max_active_hdd``, ``osd_recovery_max_active_ssd`` and + ``osd_max_backfills`` options. + default: false + see_also: + - osd_recovery_max_active_hdd + - osd_recovery_max_active_ssd + - osd_max_backfills + flags: + - runtime +- name: osd_mclock_iops_capacity_threshold_hdd + type: float + level: basic + desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore + the OSD bench results for an OSD (for rotational media) + long_desc: This option specifies the threshold IOPS capacity for an OSD under + which the OSD bench results can be considered for QoS calculations. 
Only + considered for osd_op_queue = mclock_scheduler + fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to + ignore OSD bench results for an OSD (for rotational media) + default: 500 + flags: + - runtime +- name: osd_mclock_iops_capacity_threshold_ssd + type: float + level: basic + desc: The threshold IOPs capacity (at 4KiB block size) beyond which to ignore + the OSD bench results for an OSD (for solid state media) + long_desc: This option specifies the threshold IOPS capacity for an OSD under + which the OSD bench results can be considered for QoS calculations. Only + considered for osd_op_queue = mclock_scheduler + fmt_desc: The threshold IOPS capacity (at 4KiB block size) beyond which to + ignore OSD bench results for an OSD (for solid state media) + default: 80000 + flags: + - runtime +# Set to true for testing. Users should NOT set this. +# If set to true even after reading enough shards to +# decode the object, any error will be reported. +- name: osd_read_ec_check_for_errors + type: bool + level: advanced + default: false + with_legacy: true +- name: osd_recovery_delay_start + type: float + level: advanced + default: 0 + fmt_desc: After peering completes, Ceph will delay for the specified number + of seconds before starting to recover RADOS objects. + with_legacy: true +- name: osd_recovery_max_active + type: uint + level: advanced + desc: Number of simultaneous active recovery operations per OSD (overrides _ssd + and _hdd if non-zero) + fmt_desc: The number of active recovery requests per OSD at one time. More + requests will accelerate recovery, but the requests places an + increased load on the cluster. + note: This value is only used if it is non-zero. Normally it + is ``0``, which means that the ``hdd`` or ``ssd`` values + (below) are used, depending on the type of the primary + device backing the OSD. + default: 0 + see_also: + - osd_recovery_max_active_hdd + - osd_recovery_max_active_ssd + flags: + - runtime + with_legacy: true +- name: osd_recovery_max_active_hdd + type: uint + level: advanced + desc: Number of simultaneous active recovery operations per OSD (for rotational + devices) + fmt_desc: The number of active recovery requests per OSD at one time, if the + primary device is rotational. + default: 3 + see_also: + - osd_recovery_max_active + - osd_recovery_max_active_ssd + flags: + - runtime + with_legacy: true +- name: osd_recovery_max_active_ssd + type: uint + level: advanced + desc: Number of simultaneous active recovery operations per OSD (for non-rotational + solid state devices) + fmt_desc: The number of active recovery requests per OSD at one time, if the + primary device is non-rotational (i.e., an SSD). + default: 10 + see_also: + - osd_recovery_max_active + - osd_recovery_max_active_hdd + flags: + - runtime + with_legacy: true +- name: osd_recovery_max_single_start + type: uint + level: advanced + default: 1 + fmt_desc: The maximum number of recovery operations per OSD that will be + newly started when an OSD is recovering. + with_legacy: true +# max size of push chunk +- name: osd_recovery_max_chunk + type: size + level: advanced + default: 8_M + fmt_desc: the maximum total size of data chunks a recovery op can carry. 
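+# Illustrative usage: with the mclock scheduler active, raising the recovery and
+# backfill limits above requires the override switch described earlier; assuming
+# the standard `ceph config` CLI, e.g.:
+#   ceph config set osd osd_mclock_override_recovery_settings true
+#   ceph config set osd osd_recovery_max_active_hdd 5
+#   ceph config set osd osd_max_backfills 3
+# Example values only; without the override such changes fall back to the defaults.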
+  with_legacy: true
+# max number of omap entries per chunk; 0 to disable limit
+- name: osd_recovery_max_omap_entries_per_chunk
+  type: uint
+  level: advanced
+  default: 8096
+  with_legacy: true
+# max size of a COPYFROM chunk
+- name: osd_copyfrom_max_chunk
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+# push cost per object
+- name: osd_push_per_object_cost
+  type: size
+  level: advanced
+  default: 1000
+  fmt_desc: the overhead for serving a push op
+  with_legacy: true
+# max size of push message
+- name: osd_max_push_cost
+  type: size
+  level: advanced
+  default: 8_M
+  with_legacy: true
+# max objects in single push op
+- name: osd_max_push_objects
+  type: uint
+  level: advanced
+  default: 10
+  with_legacy: true
+# Only use clone_overlap for recovery if there are fewer than
+# osd_recover_clone_overlap_limit entries in the overlap set
+- name: osd_recover_clone_overlap_limit
+  type: uint
+  level: advanced
+  default: 10
+  flags:
+  - runtime
+- name: osd_debug_feed_pullee
+  type: int
+  level: dev
+  desc: Feed a pullee, and force primary to pull a currently missing object from it
+  default: -1
+  with_legacy: true
+- name: osd_backfill_scan_min
+  type: int
+  level: advanced
+  default: 64
+  fmt_desc: The minimum number of objects per backfill scan.
+  with_legacy: true
+- name: osd_backfill_scan_max
+  type: int
+  level: advanced
+  default: 512
+  fmt_desc: The maximum number of objects per backfill scan.
+  with_legacy: true
+- name: osd_extblkdev_plugins
+  type: str
+  level: advanced
+  desc: extended block device plugins to load, provide compression feedback at runtime
+  default: vdo
+  flags:
+  - startup
+# minimum number of peers
+- name: osd_heartbeat_min_peers
+  type: int
+  level: advanced
+  default: 10
+  with_legacy: true
+- name: osd_delete_sleep
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction. This setting
+    overrides _ssd, _hdd, and _hybrid if non-zero.
+  fmt_desc: Time in seconds to sleep before the next removal transaction. This
+    throttles the PG deletion process.
+  default: 0
+  flags:
+  - runtime
+- name: osd_delete_sleep_hdd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction for HDDs
+  default: 5
+  flags:
+  - runtime
+- name: osd_delete_sleep_ssd
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction for SSDs
+  default: 1
+  flags:
+  - runtime
+- name: osd_delete_sleep_hybrid
+  type: float
+  level: advanced
+  desc: Time in seconds to sleep before next removal transaction when OSD data is on HDD
+    and OSD journal or WAL+DB is on SSD
+  default: 1
+  flags:
+  - runtime
+- name: osd_rocksdb_iterator_bounds_enabled
+  desc: Whether omap iterator bounds are applied to rocksdb iterator ReadOptions
+  type: bool
+  level: dev
+  default: true
+  with_legacy: true
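+# Illustrative usage: the effective value of any option in this file can be
+# checked per daemon, assuming the standard `ceph config` CLI, e.g.:
+#   ceph config get osd osd_delete_sleep_hdd
+#   ceph config show osd.0 osd_op_queue
+# `ceph config show` reports what a running daemon is actually using, including
+# runtime overrides.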