From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/blk/BlockDevice.h | 278 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 src/blk/BlockDevice.h (limited to 'src/blk/BlockDevice.h') diff --git a/src/blk/BlockDevice.h b/src/blk/BlockDevice.h new file mode 100644 index 000000000..191eb8ec9 --- /dev/null +++ b/src/blk/BlockDevice.h @@ -0,0 +1,278 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky + * + * Author: Haomai Wang + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_BLK_BLOCKDEVICE_H +#define CEPH_BLK_BLOCKDEVICE_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acconfig.h" +#include "common/ceph_mutex.h" +#include "include/common_fwd.h" + +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) +#include "aio/aio.h" +#endif +#include "include/ceph_assert.h" +#include "include/buffer.h" +#include "include/interval_set.h" +#define SPDK_PREFIX "spdk:" + +#if defined(__linux__) +#if !defined(F_SET_FILE_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) +#endif +// These values match Linux definition +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 1 // No hints about write life time +#define WRITE_LIFE_SHORT 2 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 3 // Data written has a medium life time +#define WRITE_LIFE_LONG 4 // Data written has a long life time +#define WRITE_LIFE_EXTREME 5 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 6 +#else +// On systems don't have WRITE_LIFE_* only use one FD +// And all files are created equal +#define WRITE_LIFE_NOT_SET 0 // No hint information set +#define WRITE_LIFE_NONE 0 // No hints about write life time +#define WRITE_LIFE_SHORT 0 // Data written has a short life time +#define WRITE_LIFE_MEDIUM 0 // Data written has a medium life time +#define WRITE_LIFE_LONG 0 // Data written has a long life time +#define WRITE_LIFE_EXTREME 0 // Data written has an extremely long life time +#define WRITE_LIFE_MAX 1 +#endif + + +/// track in-flight io +struct IOContext { +private: + ceph::mutex lock = ceph::make_mutex("IOContext::lock"); + ceph::condition_variable cond; + int r = 0; + +public: + CephContext* cct; + void *priv; +#ifdef HAVE_SPDK + void *nvme_task_first = nullptr; + void *nvme_task_last = nullptr; + std::atomic_int total_nseg = {0}; +#endif + +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) + std::list pending_aios; ///< not yet submitted + std::list running_aios; ///< submitting or submitted +#endif + std::atomic_int num_pending = {0}; + std::atomic_int num_running = {0}; + bool allow_eio; + + explicit IOContext(CephContext* cct, void *p, bool allow_eio = false) + : cct(cct), priv(p), allow_eio(allow_eio) + {} + + // no copying + IOContext(const IOContext& other) = delete; + IOContext &operator=(const IOContext& other) = delete; + + bool has_pending_aios() { + return num_pending.load(); + } + void release_running_aios(); + void aio_wait(); + uint64_t get_num_ios() const; + + void try_aio_wake() { + assert(num_running >= 1); + + std::lock_guard l(lock); + if (num_running.fetch_sub(1) == 1) { + + // we might have some pending IOs submitted after the check + // as there is no lock protection for aio_submit. + // Hence we might have false conditional trigger. + // aio_wait has to handle that hence do not care here. + cond.notify_all(); + } + } + + void set_return_value(int _r) { + r = _r; + } + + int get_return_value() const { + return r; + } +}; + + +class BlockDevice { +public: + CephContext* cct; + typedef void (*aio_callback_t)(void *handle, void *aio); +private: + ceph::mutex ioc_reap_lock = ceph::make_mutex("BlockDevice::ioc_reap_lock"); + std::vector ioc_reap_queue; + std::atomic_int ioc_reap_count = {0}; + enum class block_device_t { + unknown, +#if defined(HAVE_LIBAIO) || defined(HAVE_POSIXAIO) + aio, +#if defined(HAVE_LIBZBD) + hm_smr, +#endif +#endif +#if defined(HAVE_SPDK) + spdk, +#endif +#if defined(HAVE_BLUESTORE_PMEM) + pmem, +#endif + }; + static block_device_t detect_device_type(const std::string& path); + static block_device_t device_type_from_name(const std::string& blk_dev_name); + static BlockDevice *create_with_type(block_device_t device_type, + CephContext* cct, const std::string& path, aio_callback_t cb, + void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); + +protected: + uint64_t size = 0; + uint64_t block_size = 0; + bool support_discard = false; + bool rotational = true; + bool lock_exclusive = true; + + // HM-SMR specific properties. In HM-SMR drives the LBA space is divided into + // fixed-size zones. Typically, the first few zones are randomly writable; + // they form a conventional region of the drive. The remaining zones must be + // written sequentially and they must be reset before rewritten. For example, + // a 14 TB HGST HSH721414AL drive has 52156 zones each of size is 256 MiB. + // The zones 0-523 are randomly writable and they form the conventional region + // of the drive. The zones 524-52155 are sequential zones. + uint64_t conventional_region_size = 0; + uint64_t zone_size = 0; + +public: + aio_callback_t aio_callback; + void *aio_callback_priv; + BlockDevice(CephContext* cct, aio_callback_t cb, void *cbpriv) + : cct(cct), + aio_callback(cb), + aio_callback_priv(cbpriv) + {} + virtual ~BlockDevice() = default; + + static BlockDevice *create( + CephContext* cct, const std::string& path, aio_callback_t cb, void *cbpriv, aio_callback_t d_cb, void *d_cbpriv); + virtual bool supported_bdev_label() { return true; } + virtual bool is_rotational() { return rotational; } + + // HM-SMR-specific calls + virtual bool is_smr() const { return false; } + virtual uint64_t get_zone_size() const { + ceph_assert(is_smr()); + return zone_size; + } + virtual uint64_t get_conventional_region_size() const { + ceph_assert(is_smr()); + return conventional_region_size; + } + + virtual void aio_submit(IOContext *ioc) = 0; + + void set_no_exclusive_lock() { + lock_exclusive = false; + } + + uint64_t get_size() const { return size; } + uint64_t get_block_size() const { return block_size; } + + /// hook to provide utilization of thinly-provisioned device + virtual bool get_thin_utilization(uint64_t *total, uint64_t *avail) const { + return false; + } + + virtual int collect_metadata(const std::string& prefix, std::map *pm) const = 0; + + virtual int get_devname(std::string *out) const { + return -ENOENT; + } + virtual int get_devices(std::set *ls) const { + std::string s; + if (get_devname(&s) == 0) { + ls->insert(s); + } + return 0; + } + virtual int get_numa_node(int *node) const { + return -EOPNOTSUPP; + } + + virtual int read( + uint64_t off, + uint64_t len, + ceph::buffer::list *pbl, + IOContext *ioc, + bool buffered) = 0; + virtual int read_random( + uint64_t off, + uint64_t len, + char *buf, + bool buffered) = 0; + virtual int write( + uint64_t off, + ceph::buffer::list& bl, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) = 0; + + virtual int aio_read( + uint64_t off, + uint64_t len, + ceph::buffer::list *pbl, + IOContext *ioc) = 0; + virtual int aio_write( + uint64_t off, + ceph::buffer::list& bl, + IOContext *ioc, + bool buffered, + int write_hint = WRITE_LIFE_NOT_SET) = 0; + virtual int flush() = 0; + virtual int discard(uint64_t offset, uint64_t len) { return 0; } + virtual int queue_discard(interval_set &to_release) { return -1; } + virtual void discard_drain() { return; } + + void queue_reap_ioc(IOContext *ioc); + void reap_ioc(); + + // for managing buffered readers/writers + virtual int invalidate_cache(uint64_t off, uint64_t len) = 0; + virtual int open(const std::string& path) = 0; + virtual void close() = 0; + +protected: + bool is_valid_io(uint64_t off, uint64_t len) const; +}; + +#endif //CEPH_BLK_BLOCKDEVICE_H -- cgit v1.2.3