| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/seastar/apps/iotune/iotune.cc | |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/seastar/apps/iotune/iotune.cc')
| -rw-r--r-- | src/seastar/apps/iotune/iotune.cc | 878 |

1 file changed, 878 insertions(+), 0 deletions(-)
diff --git a/src/seastar/apps/iotune/iotune.cc b/src/seastar/apps/iotune/iotune.cc
new file mode 100644
index 000000000..c47f64340
--- /dev/null
+++ b/src/seastar/apps/iotune/iotune.cc
@@ -0,0 +1,878 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2018 ScyllaDB
+ *
+ * The goal of this program is to allow a user to properly configure the Seastar I/O
+ * scheduler.
+ */
+#include <iostream>
+#include <chrono>
+#include <random>
+#include <memory>
+#include <vector>
+#include <cmath>
+#include <sys/vfs.h>
+#include <sys/sysmacros.h>
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <fstream>
+#include <wordexp.h>
+#include <yaml-cpp/yaml.h>
+#include <fmt/printf.h>
+#include <seastar/core/seastar.hh>
+#include <seastar/core/file.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/posix.hh>
+#include <seastar/core/aligned_buffer.hh>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/app-template.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/fsqual.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/log.hh>
+#include <seastar/util/std-compat.hh>
+#include <seastar/util/read_first_line.hh>
+
+using namespace seastar;
+using namespace std::chrono_literals;
+namespace fs = std::filesystem;
+
+logger iotune_logger("iotune");
+
+using iotune_clock = std::chrono::steady_clock;
+static thread_local std::default_random_engine random_generator(std::chrono::duration_cast<std::chrono::nanoseconds>(iotune_clock::now().time_since_epoch()).count());
+
+void check_device_properties(fs::path dev_sys_file) {
+    auto sched_file = dev_sys_file / "queue" / "scheduler";
+    auto sched_string = read_first_line(sched_file);
+    auto beg = sched_string.find('[');
+    size_t len = sched_string.size();
+    if (beg == sstring::npos) {
+        beg = 0;
+    } else {
+        auto end = sched_string.find(']');
+        if (end != sstring::npos) {
+            len = end - beg - 1;
+        }
+        beg++;
+    }
+    auto scheduler = sched_string.substr(beg, len);
+    if ((scheduler != "noop") && (scheduler != "none")) {
+        iotune_logger.warn("Scheduler for {} set to {}. It is recommended to set it to noop before evaluation so as not to skew the results.",
+                sched_file.string(), scheduler);
+    }
+
+    auto nomerges_file = dev_sys_file / "queue" / "nomerges";
+    auto nomerges = read_first_line_as<unsigned>(nomerges_file);
+    if (nomerges != 2u) {
+        iotune_logger.warn("nomerges for {} set to {}. It is recommended to set it to 2 before evaluation so that merges are disabled. Results can be skewed otherwise.",
+                nomerges_file.string(), nomerges);
+    }
+
+    auto write_cache_file = dev_sys_file / "queue" / "write_cache";
+    auto write_cache = read_first_line_as<std::string>(write_cache_file);
+    if (write_cache == "write back") {
+        iotune_logger.warn("write_cache for {} is set to write back. Some disks have a poor implementation of this mode; pay attention to the accuracy of the measurements.",
+                write_cache_file.string());
+    }
+}
+
+struct evaluation_directory {
+    sstring _name;
+    // We know that if we issue more than this, they will be blocked on linux anyway.
+    unsigned _max_iodepth = 0;
+    uint64_t _available_space;
+    uint64_t _min_data_transfer_size = 512;
+    unsigned _disks_per_array = 0;
+
+    void scan_device(unsigned dev_maj, unsigned dev_min) {
+        scan_device(fmt::format("{}:{}", dev_maj, dev_min));
+    }
+
+    void scan_device(std::string dev_str) {
+        scan_device(fs::path("/sys/dev/block") / dev_str);
+    }
+
+    void scan_device(fs::path sys_file) {
+        try {
+            sys_file = fs::canonical(sys_file);
+            bool is_leaf = true;
+            if (fs::exists(sys_file / "slaves")) {
+                for (auto& dev : fs::directory_iterator(sys_file / "slaves")) {
+                    is_leaf = false;
+                    scan_device(read_first_line(dev.path() / "dev"));
+                }
+            }
+
+            // our work is done if this is not a leaf. We'll tune the leaves
+            if (!is_leaf) {
+                return;
+            }
+
+            if (fs::exists(sys_file / "partition")) {
+                scan_device(sys_file.remove_filename());
+            } else {
+                check_device_properties(sys_file);
+                auto queue_dir = sys_file / "queue";
+                auto disk_min_io_size = read_first_line_as<uint64_t>(queue_dir / "minimum_io_size");
+
+                _min_data_transfer_size = std::max(_min_data_transfer_size, disk_min_io_size);
+                _max_iodepth += read_first_line_as<uint64_t>(queue_dir / "nr_requests");
+                _disks_per_array++;
+            }
+        } catch (std::system_error& se) {
+            iotune_logger.error("Error while parsing sysfs. Will continue with guessed values: {}", se.what());
+            _max_iodepth = 128;
+        }
+        _disks_per_array = std::max(_disks_per_array, 1u);
+    }
+public:
+    evaluation_directory(sstring name)
+        : _name(name)
+        , _available_space(fs::space(fs::path(_name)).available)
+    {}
+
+    unsigned max_iodepth() const {
+        return _max_iodepth;
+    }
+
+    fs::path path() const {
+        return fs::path(_name);
+    }
+
+    const sstring& name() const {
+        return _name;
+    }
+
+    unsigned disks_per_array() const {
+        return _disks_per_array;
+    }
+
+    uint64_t minimum_io_size() const {
+        return _min_data_transfer_size;
+    }
+
+    future<> discover_directory() {
+        return seastar::async([this] {
+            auto f = open_directory(_name).get0();
+            auto st = f.stat().get0();
+            f.close().get();
+
+            scan_device(major(st.st_dev), minor(st.st_dev));
+        });
+    }
+
+    uint64_t available_space() const {
+        return _available_space;
+    }
+};
+
+struct io_rates {
+    float bytes_per_sec = 0;
+    float iops = 0;
+    io_rates operator+(const io_rates& a) const {
+        return io_rates{bytes_per_sec + a.bytes_per_sec, iops + a.iops};
+    }
+
+    io_rates& operator+=(const io_rates& a) {
+        bytes_per_sec += a.bytes_per_sec;
+        iops += a.iops;
+        return *this;
+    }
+};
+
+struct row_stats {
+    size_t points;
+    double average;
+    double stdev;
+
+    float stdev_percents() const {
+        return points > 0 ? stdev / average : 0.0;
+    }
+};
+
+template <typename T>
+static row_stats get_row_stats_for(const std::vector<T>& v) {
+    if (v.size() == 0) {
+        return row_stats{0, 0.0, 0.0};
+    }
+
+    double avg = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
+    double stdev = std::sqrt(std::transform_reduce(v.begin(), v.end(), 0.0,
+            std::plus<double>(), [avg] (auto& v) -> double { return (v - avg) * (v - avg); }) / v.size());
+
+    return row_stats{ v.size(), avg, stdev };
+}
+
+class invalid_position : public std::exception {
+public:
+    virtual const char* what() const noexcept {
+        return "file access position invalid";
+    }
+};
+
+struct position_generator {
+    virtual uint64_t get_pos() = 0;
+    virtual bool is_sequential() const = 0;
+    virtual ~position_generator() {}
+};
+
+class sequential_issuer : public position_generator {
+    size_t _buffer_size;
+    uint64_t _position = 0;
+    uint64_t _size_limit;
+public:
+    sequential_issuer(size_t buffer_size, uint64_t size_limit)
+        : _buffer_size(buffer_size)
+        , _size_limit(size_limit)
+    {}
+
+    virtual bool is_sequential() const {
+        return true;
+    }
+
+    virtual uint64_t get_pos() {
+        if (_position >= _size_limit) {
+            // Wrap around if reaching EOF. The write bandwidth is lower,
+            // and we also split the write bandwidth among shards, while we
+            // read only from shard 0, so shard 0's file may not be large
+            // enough to read from.
+            _position = 0;
+        }
+        auto pos = _position;
+        _position += _buffer_size;
+        return pos;
+    }
+};
+
+class random_issuer : public position_generator {
+    size_t _buffer_size;
+    uint64_t _last_position;
+    std::uniform_int_distribution<uint64_t> _pos_distribution;
+public:
+    random_issuer(size_t buffer_size, uint64_t last_position)
+        : _buffer_size(buffer_size)
+        , _last_position(last_position)
+        , _pos_distribution(0, (last_position / buffer_size) - 1)
+    {}
+
+    virtual bool is_sequential() const {
+        return false;
+    }
+
+    virtual uint64_t get_pos() {
+        uint64_t pos = _pos_distribution(random_generator) * _buffer_size;
+        if (pos >= _last_position) {
+            throw invalid_position();
+        }
+        return pos;
+    }
+};
+
+class request_issuer {
+public:
+    virtual future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) = 0;
+    virtual ~request_issuer() {}
+};
+
+
+class write_request_issuer : public request_issuer {
+    file _file;
+public:
+    explicit write_request_issuer(file f) : _file(f) {}
+    future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
+        return _file.dma_write(pos, buf, size);
+    }
+};
+
+class read_request_issuer : public request_issuer {
+    file _file;
+public:
+    explicit read_request_issuer(file f) : _file(f) {}
+    future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
+        return _file.dma_read(pos, buf, size);
+    }
+};
+
+class io_worker {
+    class requests_rate_meter {
+        std::vector<unsigned>& _rates;
+        const unsigned& _requests;
+        unsigned _prev_requests = 0;
+        timer<> _tick;
+
+        static constexpr auto period = 1s;
+
+    public:
+        requests_rate_meter(std::chrono::duration<double> duration, std::vector<unsigned>& rates, const unsigned& requests)
+            : _rates(rates)
+            , _requests(requests)
+            , _tick([this] {
+                _rates.push_back(_requests - _prev_requests);
+                _prev_requests = _requests;
+            })
+        {
+            _rates.reserve(256); // ~2 minutes
+            if (duration > 4 * period) {
+                _tick.arm_periodic(period);
+            }
+        }
+
+        ~requests_rate_meter() {
+            if (_tick.armed()) {
+                _tick.cancel();
+            } else {
+                _rates.push_back(_requests);
+            }
+        }
+    };
+
+    uint64_t _bytes = 0;
+    uint64_t _max_offset = 0;
+    unsigned _requests = 0;
+    size_t _buffer_size;
+    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _start_measuring;
+    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_measuring;
+    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_load;
+    // track separately because in the sequential case we may exhaust the file before _duration
+    std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _last_time_seen;
+
+    requests_rate_meter _rr_meter;
+    std::unique_ptr<position_generator> _pos_impl;
+    std::unique_ptr<request_issuer> _req_impl;
+public:
+    bool is_sequential() const {
+        return _pos_impl->is_sequential();
+    }
+
+    bool should_stop() const {
+        return iotune_clock::now() >= _end_load;
+    }
+
+    io_worker(size_t buffer_size, std::chrono::duration<double> duration, std::unique_ptr<request_issuer> reqs, std::unique_ptr<position_generator> pos, std::vector<unsigned>& rates)
+        : _buffer_size(buffer_size)
+        , _start_measuring(iotune_clock::now() + std::chrono::duration<double>(10ms))
+        , _end_measuring(_start_measuring + duration)
+        , _end_load(_end_measuring + 10ms)
+        , _last_time_seen(_start_measuring)
+        , _rr_meter(duration, rates, _requests)
+        , _pos_impl(std::move(pos))
+        , _req_impl(std::move(reqs))
+    {}
+
+    std::unique_ptr<char[], free_deleter> get_buffer() {
+        return allocate_aligned_buffer<char>(_buffer_size, _buffer_size);
+    }
+
+    future<> issue_request(char* buf) {
+        uint64_t pos = _pos_impl->get_pos();
+        return _req_impl->issue_request(pos, buf, _buffer_size).then([this, pos] (size_t size) {
+            auto now = iotune_clock::now();
+            _max_offset = std::max(_max_offset, pos + size);
+            if ((now > _start_measuring) && (now < _end_measuring)) {
+                _last_time_seen = now;
+                _bytes += size;
+                _requests++;
+            }
+        });
+    }
+
+    uint64_t max_offset() const noexcept { return _max_offset; }
+
+    io_rates get_io_rates() const {
+        io_rates rates;
+        auto t = _last_time_seen - _start_measuring;
+        if (!t.count()) {
+            throw std::runtime_error("No data collected");
+        }
+        rates.bytes_per_sec = _bytes / t.count();
+        rates.iops = _requests / t.count();
+        return rates;
+    }
+};
+
+class test_file {
+public:
+    enum class pattern { sequential, random };
+private:
+    fs::path _dirpath;
+    uint64_t _file_size;
+    file _file;
+
+    std::unique_ptr<position_generator> get_position_generator(size_t buffer_size, pattern access_pattern) {
+        if (access_pattern == pattern::sequential) {
+            return std::make_unique<sequential_issuer>(buffer_size, _file_size);
+        } else {
+            return std::make_unique<random_issuer>(buffer_size, _file_size);
+        }
+    }
+public:
+    test_file(const ::evaluation_directory& dir, uint64_t maximum_size)
+        : _dirpath(dir.path() / fs::path(fmt::format("ioqueue-discovery-{}", this_shard_id())))
+        , _file_size(maximum_size)
+    {}
+
+    future<> create_data_file() {
+        // XFS likes access in many directories better.
+        return make_directory(_dirpath.string()).then([this] {
+            auto testfile = _dirpath / fs::path("testfile");
+            file_open_options options;
+            options.extent_allocation_size_hint = _file_size;
+            return open_file_dma(testfile.string(), open_flags::rw | open_flags::create, std::move(options)).then([this, testfile] (file file) {
+                _file = file;
+                return remove_file(testfile.string()).then([this] {
+                    return remove_file(_dirpath.string());
+                });
+            }).then([this] {
+                return _file.truncate(_file_size);
+            });
+        });
+    }
+
+    future<io_rates> do_workload(std::unique_ptr<io_worker> worker_ptr, unsigned max_os_concurrency, bool update_file_size = false) {
+        if (update_file_size) {
+            _file_size = 0;
+        }
+
+        auto worker = worker_ptr.get();
+        auto concurrency = boost::irange<unsigned, unsigned>(0, max_os_concurrency, 1);
+        return parallel_for_each(std::move(concurrency), [worker] (unsigned idx) {
+            auto bufptr = worker->get_buffer();
+            auto buf = bufptr.get();
+            return do_until([worker] { return worker->should_stop(); }, [buf, worker] {
+                return worker->issue_request(buf);
+            }).finally([alive = std::move(bufptr)] {});
+        }).then_wrapped([this, worker = std::move(worker_ptr), update_file_size] (future<> f) {
+            try {
+                f.get();
+            } catch (invalid_position& ip) {
+                // expected in the sequential case, e.g. when a read ran past the end of the file.
+                if (!worker->is_sequential()) {
+                    throw;
+                }
+            }
+
+            if (update_file_size) {
+                _file_size = worker->max_offset();
+            }
+            return make_ready_future<io_rates>(worker->get_io_rates());
+        });
+    }
+
+    future<io_rates> read_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
+        buffer_size = std::max(buffer_size, _file.disk_read_dma_alignment());
+        auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<read_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
+        return do_workload(std::move(worker), max_os_concurrency);
+    }
+
+    future<io_rates> write_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
+        buffer_size = std::max(buffer_size, _file.disk_write_dma_alignment());
+        auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<write_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
+        bool update_file_size = worker->is_sequential();
+        return do_workload(std::move(worker), max_os_concurrency, update_file_size).then([this] (io_rates r) {
+            return _file.flush().then([r = std::move(r)] () mutable {
+                return make_ready_future<io_rates>(std::move(r));
+            });
+        });
+    }
+
+    future<> stop() {
+        return _file.close();
+    }
+};
+
+class iotune_multi_shard_context {
+    ::evaluation_directory _test_directory;
+
+    unsigned per_shard_io_depth() const {
+        auto iodepth = _test_directory.max_iodepth() / smp::count;
+        if (this_shard_id() < _test_directory.max_iodepth() % smp::count) {
+            iodepth++;
+        }
+        return std::min(iodepth, 128u);
+    }
+    seastar::sharded<test_file> _iotune_test_file;
+
+    std::vector<unsigned> serial_rates;
+    seastar::sharded<std::vector<unsigned>> sharded_rates;
+
+public:
+    future<> stop() {
+        return _iotune_test_file.stop().then([this] { return sharded_rates.stop(); });
+    }
+
+    future<> start() {
+        return _iotune_test_file.start(_test_directory, _test_directory.available_space() / (2 * smp::count)).then([this] {
+            return sharded_rates.start();
+        });
+    }
+
+    future<row_stats> get_serial_rates() {
+        row_stats ret = get_row_stats_for<unsigned>(serial_rates);
+        serial_rates.clear();
+        return make_ready_future<row_stats>(ret);
+    }
+
+    future<row_stats> get_sharded_worst_rates() {
+        return sharded_rates.map_reduce0([] (std::vector<unsigned>& rates) {
+            row_stats ret = get_row_stats_for<unsigned>(rates);
+            rates.clear();
+            return ret;
+        }, row_stats{0, 0.0, 0.0},
+        [] (const row_stats& res, row_stats lres) {
+            return res.stdev < lres.stdev ? lres : res;
+        });
+    }
+
+    future<> create_data_file() {
+        return _iotune_test_file.invoke_on_all([] (test_file& tf) {
+            return tf.create_data_file();
+        });
+    }
+
+    future<io_rates> write_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
+        return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
+            return tf.write_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
+        });
+    }
+
+    future<io_rates> read_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
+        return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
+            return tf.read_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
+        });
+    }
+
+    future<io_rates> write_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
+        return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
+            return tf.write_workload(buffer_size, test_file::pattern::random, per_shard_io_depth(), duration, sharded_rates.local());
+        }, io_rates(), std::plus<io_rates>());
+    }
+
+    future<io_rates> read_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
+        return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
+            return tf.read_workload(buffer_size, test_file::pattern::random, per_shard_io_depth(), duration, sharded_rates.local());
+        }, io_rates(), std::plus<io_rates>());
+    }
+
+private:
+    template <typename Fn>
+    future<uint64_t> saturate(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration, Fn&& workload) {
+        return _iotune_test_file.invoke_on(0, [this, rate_threshold, buffer_size, duration, workload] (test_file& tf) {
+            return (tf.*workload)(buffer_size, test_file::pattern::sequential, 1, duration, serial_rates).then([this, rate_threshold, buffer_size, duration, workload] (io_rates rates) {
+                serial_rates.clear();
+                if (rates.bytes_per_sec < rate_threshold) {
+                    // The throughput with the given buffer size is already "small enough", so
+                    // return its previous value
+                    return make_ready_future<uint64_t>(buffer_size * 2);
+                } else {
+                    return saturate(rate_threshold, buffer_size / 2, duration, workload);
+                }
+            });
+        });
+    }
+
+public:
+    future<uint64_t> saturate_write(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
+        return saturate(rate_threshold, buffer_size, duration, &test_file::write_workload);
+    }
+
+    future<uint64_t> saturate_read(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
+        return saturate(rate_threshold, buffer_size, duration, &test_file::read_workload);
+    }
+
+    iotune_multi_shard_context(::evaluation_directory dir)
+        : _test_directory(dir)
+    {}
+};
+
+struct disk_descriptor {
+    std::string mountpoint;
+    uint64_t read_iops;
+    uint64_t read_bw;
+    uint64_t write_iops;
+    uint64_t write_bw;
+    std::optional<uint64_t> read_sat_len;
+    std::optional<uint64_t> write_sat_len;
+};
+
+void string_to_file(sstring conf_file, sstring buf) {
+    auto f = file_desc::open(conf_file, O_WRONLY | O_CLOEXEC | O_CREAT | O_TRUNC, 0664);
+    auto ret = f.write(buf.data(), buf.size());
+    if (!ret || (*ret != buf.size())) {
+        throw std::runtime_error(fmt::format("Can't write {}: {}", conf_file, *ret));
+    }
+}
+
+void write_configuration_file(sstring conf_file, std::string format, sstring properties_file) {
+    sstring buf;
+    if (format == "seastar") {
+        buf = fmt::format("io-properties-file={}\n", properties_file);
+    } else {
+        buf = fmt::format("SEASTAR_IO=\"--io-properties-file={}\"\n", properties_file);
+    }
+    string_to_file(conf_file, buf);
+}
+
+void write_property_file(sstring conf_file, std::vector<disk_descriptor> disk_descriptors) {
+    YAML::Emitter out;
+    out << YAML::BeginMap;
+    out << YAML::Key << "disks";
+    out << YAML::BeginSeq;
+    for (auto& desc : disk_descriptors) {
+        out << YAML::BeginMap;
+        out << YAML::Key << "mountpoint" << YAML::Value << desc.mountpoint;
+        out << YAML::Key << "read_iops" << YAML::Value << desc.read_iops;
+        out << YAML::Key << "read_bandwidth" << YAML::Value << desc.read_bw;
+        out << YAML::Key << "write_iops" << YAML::Value << desc.write_iops;
+        out << YAML::Key << "write_bandwidth" << YAML::Value << desc.write_bw;
+        if (desc.read_sat_len) {
+            out << YAML::Key << "read_saturation_length" << YAML::Value << *desc.read_sat_len;
+        }
+        if (desc.write_sat_len) {
+            out << YAML::Key << "write_saturation_length" << YAML::Value << *desc.write_sat_len;
+        }
+        out << YAML::EndMap;
+    }
+    out << YAML::EndSeq;
+    out << YAML::EndMap;
+    out << YAML::Newline;
+
+    string_to_file(conf_file, sstring(out.c_str(), out.size()));
+}
+
+// Returns the mountpoint of a path. It works by walking backwards from the canonical path
+// (absolute, with symlinks resolved), until we find a point that crosses a device ID.
+fs::path mountpoint_of(sstring filename) {
+    fs::path mnt_candidate = fs::canonical(fs::path(filename));
+    std::optional<dev_t> candidate_id = {};
+    auto current = mnt_candidate;
+    do {
+        auto f = open_directory(current.string()).get0();
+        auto st = f.stat().get0();
+        if ((candidate_id) && (*candidate_id != st.st_dev)) {
+            return mnt_candidate;
+        }
+        mnt_candidate = current;
+        candidate_id = st.st_dev;
+        current = current.parent_path();
+    } while (mnt_candidate != current);
+
+    return mnt_candidate;
+}
+
+int main(int ac, char** av) {
+    namespace bpo = boost::program_options;
+    bool fs_check = false;
+
+    app_template::config app_cfg;
+    app_cfg.name = "IOTune";
+
+    app_template app(std::move(app_cfg));
+    auto opt_add = app.add_options();
+    opt_add
+        ("evaluation-directory", bpo::value<std::vector<sstring>>()->required(), "directory in which to execute the evaluation")
+        ("properties-file", bpo::value<sstring>(), "path in which to write the YAML file")
+        ("options-file", bpo::value<sstring>(), "path in which to write the legacy conf file")
+        ("duration", bpo::value<unsigned>()->default_value(120), "time, in seconds, for which to run the test")
+        ("format", bpo::value<sstring>()->default_value("seastar"), "Configuration file format (seastar | envfile)")
+        ("fs-check", bpo::bool_switch(&fs_check), "perform FS check only")
+        ("accuracy", bpo::value<unsigned>()->default_value(3), "acceptable deviation of measurements (percent)")
+        ("saturation", bpo::value<sstring>()->default_value(""), "measure saturation lengths (read | write | both) (this is very slow!)")
+    ;
+
+    return app.run(ac, av, [&] {
+        return seastar::async([&] {
+            auto& configuration = app.configuration();
+            auto eval_dirs = configuration["evaluation-directory"].as<std::vector<sstring>>();
+            auto format = configuration["format"].as<sstring>();
+            auto duration = std::chrono::duration<double>(configuration["duration"].as<unsigned>() * 1s);
+            auto accuracy = configuration["accuracy"].as<unsigned>();
+            auto saturation = configuration["saturation"].as<sstring>();
+
+            bool read_saturation, write_saturation;
+            if (saturation == "") {
+                read_saturation = false;
+                write_saturation = false;
+            } else if (saturation == "both") {
+                read_saturation = true;
+                write_saturation = true;
+            } else if (saturation == "read") {
+                read_saturation = true;
+                write_saturation = false;
+            } else if (saturation == "write") {
+                read_saturation = false;
+                write_saturation = true;
+            } else {
+                fmt::print("Bad --saturation value\n");
+                return 1;
+            }
+
+            std::vector<disk_descriptor> disk_descriptors;
+            std::unordered_map<sstring, sstring> mountpoint_map;
+            // We want to evaluate once per mountpoint, but we still want to write in one of the
+            // directories that we were provided - we may not have permissions to write into the
+            // mountpoint itself. If we are passed more than one directory per mountpoint, we don't
+            // really care to which one we write, so this simple hash will do.
+            for (auto& eval_dir : eval_dirs) {
+                mountpoint_map[mountpoint_of(eval_dir).string()] = eval_dir;
+            }
+            for (auto eval: mountpoint_map) {
+                auto mountpoint = eval.first;
+                auto eval_dir = eval.second;
+
+                if (!filesystem_has_good_aio_support(eval_dir, false)) {
+                    iotune_logger.error("Exception when qualifying filesystem at {}", eval_dir);
+                    return 1;
+                }
+
+                auto rec = 10000000000ULL;
+                auto avail = fs_avail(eval_dir).get0();
+                if (avail < rec) {
+                    uint64_t val;
+                    const char* units;
+                    if (avail >= 1000000000) {
+                        val = (avail + 500000000) / 1000000000;
+                        units = "GB";
+                    } else if (avail >= 1000000) {
+                        val = (avail + 500000) / 1000000;
+                        units = "MB";
+                    } else {
+                        val = avail;
+                        units = "bytes";
+                    }
+                    iotune_logger.warn("Available space on filesystem at {}: {} {} is less than recommended: {} GB",
+                            eval_dir, val, units, rec / 1000000000ULL);
+                }
+
+                iotune_logger.info("{} passed sanity checks", eval_dir);
+                if (fs_check) {
+                    continue;
+                }
+
+                // Directory is the same object for all tests.
+                ::evaluation_directory test_directory(eval_dir);
+                test_directory.discover_directory().get();
+                iotune_logger.info("Disk parameters: max_iodepth={} disks_per_array={} minimum_io_size={}",
+                        test_directory.max_iodepth(), test_directory.disks_per_array(), test_directory.minimum_io_size());
+
+                ::iotune_multi_shard_context iotune_tests(test_directory);
+                iotune_tests.start().get();
+                auto stop = defer([&iotune_tests] () noexcept {
+                    try {
+                        iotune_tests.stop().get();
+                    } catch (...) {
+                        fmt::print("Error occurred during iotune context shutdown: {}", std::current_exception());
+                        abort();
+                    }
+                });
+
+                row_stats rates;
+                auto accuracy_msg = [accuracy, &rates] {
+                    auto stdev = rates.stdev_percents() * 100.0;
+                    return (accuracy == 0 || stdev > accuracy) ? fmt::format(" (deviation {}%)", int(round(stdev))) : std::string("");
+                };
+
+                iotune_tests.create_data_file().get();
+
+                fmt::print("Starting Evaluation. This may take a while...\n");
+                fmt::print("Measuring sequential write bandwidth: ");
+                std::cout.flush();
+                io_rates write_bw;
+                size_t sequential_buffer_size = 1 << 20;
+                for (unsigned shard = 0; shard < smp::count; ++shard) {
+                    write_bw += iotune_tests.write_sequential_data(shard, sequential_buffer_size, duration * 0.70 / smp::count).get0();
+                }
+                write_bw.bytes_per_sec /= smp::count;
+                rates = iotune_tests.get_serial_rates().get0();
+                fmt::print("{} MB/s{}\n", uint64_t(write_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());
+
+                std::optional<uint64_t> write_sat;
+
+                if (write_saturation) {
+                    fmt::print("Measuring write saturation length: ");
+                    std::cout.flush();
+                    write_sat = iotune_tests.saturate_write(write_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size/2, duration * 0.70).get0();
+                    fmt::print("{}\n", *write_sat);
+                }
+
+                fmt::print("Measuring sequential read bandwidth: ");
+                std::cout.flush();
+                auto read_bw = iotune_tests.read_sequential_data(0, sequential_buffer_size, duration * 0.1).get0();
+                rates = iotune_tests.get_serial_rates().get0();
+                fmt::print("{} MB/s{}\n", uint64_t(read_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());
+
+                std::optional<uint64_t> read_sat;
+
+                if (read_saturation) {
+                    fmt::print("Measuring read saturation length: ");
+                    std::cout.flush();
+                    read_sat = iotune_tests.saturate_read(read_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size/2, duration * 0.1).get0();
+                    fmt::print("{}\n", *read_sat);
+                }
+
+                fmt::print("Measuring random write IOPS: ");
+                std::cout.flush();
+                auto write_iops = iotune_tests.write_random_data(test_directory.minimum_io_size(), duration * 0.1).get0();
+                rates = iotune_tests.get_sharded_worst_rates().get0();
+                fmt::print("{} IOPS{}\n", uint64_t(write_iops.iops), accuracy_msg());
+
+                fmt::print("Measuring random read IOPS: ");
+                std::cout.flush();
+                auto read_iops = iotune_tests.read_random_data(test_directory.minimum_io_size(), duration * 0.1).get0();
+                rates = iotune_tests.get_sharded_worst_rates().get0();
+                fmt::print("{} IOPS{}\n", uint64_t(read_iops.iops), accuracy_msg());
+
+                struct disk_descriptor desc;
+                desc.mountpoint = mountpoint;
+                desc.read_iops = read_iops.iops;
+                desc.read_bw = read_bw.bytes_per_sec;
+                desc.read_sat_len = read_sat;
+                desc.write_iops = write_iops.iops;
+                desc.write_bw = write_bw.bytes_per_sec;
+                desc.write_sat_len = write_sat;
+                disk_descriptors.push_back(std::move(desc));
+            }
+
+            if (fs_check) {
+                return 0;
+            }
+
+            auto file = "properties file";
+            try {
+                if (configuration.count("properties-file")) {
+                    fmt::print("Writing result to {}\n", configuration["properties-file"].as<sstring>());
+                    write_property_file(configuration["properties-file"].as<sstring>(), disk_descriptors);
+                }
+
+                file = "configuration file";
+                if (configuration.count("options-file")) {
+                    fmt::print("Writing result to {}\n", configuration["options-file"].as<sstring>());
+                    write_configuration_file(configuration["options-file"].as<sstring>(), format, configuration["properties-file"].as<sstring>());
+                }
+            } catch (...) {
+                iotune_logger.error("Exception when writing {}: {}.\nPlease add the above values manually to your seastar command line.", file, std::current_exception());
+                return 1;
+            }
+            return 0;
+        });
+    });
+}
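A few of the techniques in this file can be illustrated standalone. First, the bracket parsing in check_device_properties(): the kernel marks the active scheduler with brackets in /sys/.../queue/scheduler (e.g. `noop deadline [cfq]`). The sketch below mirrors that logic in plain C++17, without Seastar; `extract_scheduler` is a hypothetical helper name, not part of iotune:

```cpp
// Minimal sketch (plain C++17, no Seastar): extract the active scheduler,
// which the kernel marks with brackets in /sys/.../queue/scheduler.
// extract_scheduler is a hypothetical helper, not part of iotune.
#include <cassert>
#include <string>

std::string extract_scheduler(const std::string& sched_string) {
    auto beg = sched_string.find('[');
    size_t len = sched_string.size();
    if (beg == std::string::npos) {
        beg = 0;                       // no brackets: take the whole line
    } else {
        auto end = sched_string.find(']');
        if (end != std::string::npos) {
            len = end - beg - 1;       // length of the bracketed word
        }
        beg++;                         // skip the '[' itself
    }
    return sched_string.substr(beg, len);
}

int main() {
    assert(extract_scheduler("noop deadline [cfq]") == "cfq");
    assert(extract_scheduler("none") == "none");
}
```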
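get_row_stats_for() computes the arithmetic mean and the population standard deviation of the per-second rate samples collected by requests_rate_meter, and stdev_percents() reports their ratio (the coefficient of variation), which accuracy_msg() later scales by 100 for display. A minimal standalone sketch with made-up sample values:

```cpp
// Standalone sketch of the statistics used by get_row_stats_for():
// arithmetic mean and population standard deviation of per-second rates.
// (Plain C++17; the sample numbers are made up for illustration.)
#include <cmath>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main() {
    std::vector<unsigned> rates{100, 104, 96, 100};   // hypothetical IOPS samples
    double avg = std::accumulate(rates.begin(), rates.end(), 0.0) / rates.size();
    double var = std::transform_reduce(rates.begin(), rates.end(), 0.0,
            std::plus<double>(),
            [avg] (double v) { return (v - avg) * (v - avg); }) / rates.size();
    double stdev = std::sqrt(var);
    // stdev_percents() in iotune reports stdev / average, i.e. the
    // coefficient of variation: here avg=100, stdev~2.83, cv~0.028.
    std::printf("avg=%.1f stdev=%.2f cv=%.4f\n", avg, stdev, stdev / avg);
}
```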
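The saturate() routine searches for the "saturation length" by repeatedly halving the request size, running a single-fiber sequential workload on shard 0, and stopping once measured throughput drops below the threshold; it then reports the previous size (`buffer_size * 2`), i.e. the smallest request size that still saturated the disk. A sketch of that search, with a stubbed, hypothetical measurement function standing in for the real timed workload:

```cpp
// Sketch of the halving search performed by saturate(): keep shrinking the
// request size until measured sequential bandwidth falls below the threshold,
// then report the last size that still saturated (buffer_size * 2).
// measure_bw is a hypothetical stand-in for the real timed workload.
#include <cstdint>
#include <cstdio>

static float measure_bw(uint64_t buffer_size) {
    // Toy device model: throughput tails off for requests under 64 KiB.
    return buffer_size >= (64 * 1024) ? 1000.0f
                                      : 1000.0f * buffer_size / (64 * 1024);
}

uint64_t saturation_length(float rate_threshold, uint64_t buffer_size) {
    for (;;) {
        if (measure_bw(buffer_size) < rate_threshold) {
            return buffer_size * 2;   // previous size was the shortest that saturated
        }
        buffer_size /= 2;
    }
}

int main() {
    // With the model above, 64 KiB (65536) is the smallest saturating size.
    std::printf("%lu\n", (unsigned long)saturation_length(999.0f, 1 << 19));
}
```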
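write_property_file() emits one entry per evaluated mountpoint, with the saturation keys present only when --saturation was requested. The emitted file has roughly the following shape (the numbers below are illustrative placeholders, not real measurements):

```yaml
disks:
  - mountpoint: /mnt/data
    read_iops: 90000
    read_bandwidth: 1073741824
    write_iops: 75000
    write_bandwidth: 536870912
    read_saturation_length: 131072
    write_saturation_length: 65536
```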
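Finally, mountpoint_of() locates the mountpoint by walking from the canonical path toward the root and stopping just before the device ID (st_dev) changes. A simplified blocking sketch using plain stat(2) instead of Seastar's asynchronous file API; `mountpoint_of_sync` is a hypothetical name for this illustration:

```cpp
// Simplified, blocking illustration of the mountpoint_of() walk using
// plain stat(2) instead of Seastar's file API: climb toward the root and
// stop at the last path whose device ID matches the starting file's.
#include <cstdio>
#include <filesystem>
#include <optional>
#include <sys/stat.h>

namespace fs = std::filesystem;

fs::path mountpoint_of_sync(const fs::path& p) {
    fs::path mnt_candidate = fs::canonical(p);
    std::optional<dev_t> candidate_id;
    auto current = mnt_candidate;
    do {
        struct stat st;
        if (::stat(current.c_str(), &st) != 0) {
            break;                       // unreadable: return best candidate so far
        }
        if (candidate_id && *candidate_id != st.st_dev) {
            return mnt_candidate;        // crossed onto a different device
        }
        mnt_candidate = current;
        candidate_id = st.st_dev;
        current = current.parent_path(); // "/" is its own parent, ending the loop
    } while (mnt_candidate != current);
    return mnt_candidate;
}

int main() {
    std::printf("%s\n", mountpoint_of_sync("/var/tmp").c_str());
}
```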