summaryrefslogtreecommitdiffstats
path: root/src/seastar/apps/iotune/iotune.cc
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/seastar/apps/iotune/iotune.cc
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/seastar/apps/iotune/iotune.cc')
-rw-r--r--src/seastar/apps/iotune/iotune.cc878
1 files changed, 878 insertions, 0 deletions
diff --git a/src/seastar/apps/iotune/iotune.cc b/src/seastar/apps/iotune/iotune.cc
new file mode 100644
index 000000000..c47f64340
--- /dev/null
+++ b/src/seastar/apps/iotune/iotune.cc
@@ -0,0 +1,878 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2018 ScyllaDB
+ *
+ * The goal of this program is to allow a user to properly configure the Seastar I/O
+ * scheduler.
+ */
+#include <iostream>
+#include <chrono>
+#include <random>
+#include <memory>
+#include <vector>
+#include <cmath>
+#include <sys/vfs.h>
+#include <sys/sysmacros.h>
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <boost/iterator/counting_iterator.hpp>
+#include <fstream>
+#include <wordexp.h>
+#include <yaml-cpp/yaml.h>
+#include <fmt/printf.h>
+#include <seastar/core/seastar.hh>
+#include <seastar/core/file.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/posix.hh>
+#include <seastar/core/aligned_buffer.hh>
+#include <seastar/core/sharded.hh>
+#include <seastar/core/app-template.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/fsqual.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/log.hh>
+#include <seastar/util/std-compat.hh>
+#include <seastar/util/read_first_line.hh>
+
+using namespace seastar;
+using namespace std::chrono_literals;
+namespace fs = std::filesystem;
+
+logger iotune_logger("iotune");
+
+using iotune_clock = std::chrono::steady_clock;
+static thread_local std::default_random_engine random_generator(std::chrono::duration_cast<std::chrono::nanoseconds>(iotune_clock::now().time_since_epoch()).count());
+
+void check_device_properties(fs::path dev_sys_file) {
+ auto sched_file = dev_sys_file / "queue" / "scheduler";
+ auto sched_string = read_first_line(sched_file);
+ auto beg = sched_string.find('[');
+ size_t len = sched_string.size();
+ if (beg == sstring::npos) {
+ beg = 0;
+ } else {
+ auto end = sched_string.find(']');
+ if (end != sstring::npos) {
+ len = end - beg - 1;
+ }
+ beg++;
+ }
+ auto scheduler = sched_string.substr(beg, len);
+ if ((scheduler != "noop") && (scheduler != "none")) {
+ iotune_logger.warn("Scheduler for {} set to {}. It is recommend to set it to noop before evaluation so as not to skew the results.",
+ sched_file.string(), scheduler);
+ }
+
+ auto nomerges_file = dev_sys_file / "queue" / "nomerges";
+ auto nomerges = read_first_line_as<unsigned>(nomerges_file);
+ if (nomerges != 2u) {
+ iotune_logger.warn("nomerges for {} set to {}. It is recommend to set it to 2 before evaluation so that merges are disabled. Results can be skewed otherwise.",
+ nomerges_file.string(), nomerges);
+ }
+
+ auto write_cache_file = dev_sys_file / "queue" / "write_cache";
+ auto write_cache = read_first_line_as<std::string>(write_cache_file);
+ if (write_cache == "write back") {
+ iotune_logger.warn("write_cache for {} is set to write back. Some disks have poor implementation of this mode, pay attention to the measurements accuracy.",
+ write_cache_file.string());
+ }
+}
+
+struct evaluation_directory {
+ sstring _name;
+ // We know that if we issue more than this, they will be blocked on linux anyway.
+ unsigned _max_iodepth = 0;
+ uint64_t _available_space;
+ uint64_t _min_data_transfer_size = 512;
+ unsigned _disks_per_array = 0;
+
+ void scan_device(unsigned dev_maj, unsigned dev_min) {
+ scan_device(fmt::format("{}:{}", dev_maj, dev_min));
+ }
+
+ void scan_device(std::string dev_str) {
+ scan_device(fs::path("/sys/dev/block") / dev_str);
+ }
+
+ void scan_device(fs::path sys_file) {
+ try {
+ sys_file = fs::canonical(sys_file);
+ bool is_leaf = true;
+ if (fs::exists(sys_file / "slaves")) {
+ for (auto& dev : fs::directory_iterator(sys_file / "slaves")) {
+ is_leaf = false;
+ scan_device(read_first_line(dev.path() / "dev"));
+ }
+ }
+
+ // our work is done if not leaf. We'll tune the leaves
+ if (!is_leaf) {
+ return;
+ }
+
+ if (fs::exists(sys_file / "partition")) {
+ scan_device(sys_file.remove_filename());
+ } else {
+ check_device_properties(sys_file);
+ auto queue_dir = sys_file / "queue";
+ auto disk_min_io_size = read_first_line_as<uint64_t>(queue_dir / "minimum_io_size");
+
+ _min_data_transfer_size = std::max(_min_data_transfer_size, disk_min_io_size);
+ _max_iodepth += read_first_line_as<uint64_t>(queue_dir / "nr_requests");
+ _disks_per_array++;
+ }
+ } catch (std::system_error& se) {
+ iotune_logger.error("Error while parsing sysfs. Will continue with guessed values: {}", se.what());
+ _max_iodepth = 128;
+ }
+ _disks_per_array = std::max(_disks_per_array, 1u);
+ }
+public:
+ evaluation_directory(sstring name)
+ : _name(name)
+ , _available_space(fs::space(fs::path(_name)).available)
+ {}
+
+ unsigned max_iodepth() const {
+ return _max_iodepth;
+ }
+
+ fs::path path() const {
+ return fs::path(_name);
+ }
+
+ const sstring& name() const {
+ return _name;
+ }
+
+ unsigned disks_per_array() const {
+ return _disks_per_array;
+ }
+
+ uint64_t minimum_io_size() const {
+ return _min_data_transfer_size;
+ }
+
+ future<> discover_directory() {
+ return seastar::async([this] {
+ auto f = open_directory(_name).get0();
+ auto st = f.stat().get0();
+ f.close().get();
+
+ scan_device(major(st.st_dev), minor(st.st_dev));
+ });
+ }
+
+ uint64_t available_space() const {
+ return _available_space;
+ }
+};
+
+struct io_rates {
+ float bytes_per_sec = 0;
+ float iops = 0;
+ io_rates operator+(const io_rates& a) const {
+ return io_rates{bytes_per_sec + a.bytes_per_sec, iops + a.iops};
+ }
+
+ io_rates& operator+=(const io_rates& a) {
+ bytes_per_sec += a.bytes_per_sec;
+ iops += a.iops;
+ return *this;
+ }
+};
+
+struct row_stats {
+ size_t points;
+ double average;
+ double stdev;
+
+ float stdev_percents() const {
+ return points > 0 ? stdev / average : 0.0;
+ }
+};
+
+template <typename T>
+static row_stats get_row_stats_for(const std::vector<T>& v) {
+ if (v.size() == 0) {
+ return row_stats{0, 0.0, 0.0};
+ }
+
+ double avg = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
+ double stdev = std::sqrt(std::transform_reduce(v.begin(), v.end(), 0.0,
+ std::plus<double>(), [avg] (auto& v) -> double { return (v - avg) * (v - avg); }) / v.size());
+
+ return row_stats{ v.size(), avg, stdev };
+}
+
+class invalid_position : public std::exception {
+public:
+ virtual const char* what() const noexcept {
+ return "file access position invalid";
+ }
+};
+
+struct position_generator {
+ virtual uint64_t get_pos() = 0;
+ virtual bool is_sequential() const = 0;
+ virtual ~position_generator() {}
+};
+
+class sequential_issuer : public position_generator {
+ size_t _buffer_size;
+ uint64_t _position = 0;
+ uint64_t _size_limit;
+public:
+ sequential_issuer(size_t buffer_size, uint64_t size_limit)
+ : _buffer_size(buffer_size)
+ , _size_limit(size_limit)
+ {}
+
+ virtual bool is_sequential() const {
+ return true;
+ }
+
+ virtual uint64_t get_pos() {
+ if (_position >= _size_limit) {
+ // Wrap around if reaching EOF. The write bandwidth is lower,
+ // and we also split the write bandwidth among shards, while we
+ // read only from shard 0, so shard 0's file may not be large
+ // enough to read from.
+ _position = 0;
+ }
+ auto pos = _position;
+ _position += _buffer_size;
+ return pos;
+ }
+};
+
+class random_issuer : public position_generator {
+ size_t _buffer_size;
+ uint64_t _last_position;
+ std::uniform_int_distribution<uint64_t> _pos_distribution;
+public:
+ random_issuer(size_t buffer_size, uint64_t last_position)
+ : _buffer_size(buffer_size)
+ , _last_position(last_position)
+ , _pos_distribution(0, (last_position / buffer_size) - 1)
+ {}
+
+ virtual bool is_sequential() const {
+ return false;
+ }
+
+ virtual uint64_t get_pos() {
+ uint64_t pos = _pos_distribution(random_generator) * _buffer_size;
+ if (pos >= _last_position) {
+ throw invalid_position();
+ }
+ return pos;
+ }
+};
+
+class request_issuer {
+public:
+ virtual future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) = 0;
+ virtual ~request_issuer() {}
+};
+
+
+class write_request_issuer : public request_issuer {
+ file _file;
+public:
+ explicit write_request_issuer(file f) : _file(f) {}
+ future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
+ return _file.dma_write(pos, buf, size);
+ }
+};
+
+class read_request_issuer : public request_issuer {
+ file _file;
+public:
+ explicit read_request_issuer(file f) : _file(f) {}
+ future<size_t> issue_request(uint64_t pos, char* buf, uint64_t size) override {
+ return _file.dma_read(pos, buf, size);
+ }
+};
+
+class io_worker {
+ class requests_rate_meter {
+ std::vector<unsigned>& _rates;
+ const unsigned& _requests;
+ unsigned _prev_requests = 0;
+ timer<> _tick;
+
+ static constexpr auto period = 1s;
+
+ public:
+ requests_rate_meter(std::chrono::duration<double> duration, std::vector<unsigned>& rates, const unsigned& requests)
+ : _rates(rates)
+ , _requests(requests)
+ , _tick([this] {
+ _rates.push_back(_requests - _prev_requests);
+ _prev_requests = _requests;
+ })
+ {
+ _rates.reserve(256); // ~2 minutes
+ if (duration > 4 * period) {
+ _tick.arm_periodic(period);
+ }
+ }
+
+ ~requests_rate_meter() {
+ if (_tick.armed()) {
+ _tick.cancel();
+ } else {
+ _rates.push_back(_requests);
+ }
+ }
+ };
+
+ uint64_t _bytes = 0;
+ uint64_t _max_offset = 0;
+ unsigned _requests = 0;
+ size_t _buffer_size;
+ std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _start_measuring;
+ std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_measuring;
+ std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _end_load;
+ // track separately because in the sequential case we may exhaust the file before _duration
+ std::chrono::time_point<iotune_clock, std::chrono::duration<double>> _last_time_seen;
+
+ requests_rate_meter _rr_meter;
+ std::unique_ptr<position_generator> _pos_impl;
+ std::unique_ptr<request_issuer> _req_impl;
+public:
+ bool is_sequential() const {
+ return _pos_impl->is_sequential();
+ }
+
+ bool should_stop() const {
+ return iotune_clock::now() >= _end_load;
+ }
+
+ io_worker(size_t buffer_size, std::chrono::duration<double> duration, std::unique_ptr<request_issuer> reqs, std::unique_ptr<position_generator> pos, std::vector<unsigned>& rates)
+ : _buffer_size(buffer_size)
+ , _start_measuring(iotune_clock::now() + std::chrono::duration<double>(10ms))
+ , _end_measuring(_start_measuring + duration)
+ , _end_load(_end_measuring + 10ms)
+ , _last_time_seen(_start_measuring)
+ , _rr_meter(duration, rates, _requests)
+ , _pos_impl(std::move(pos))
+ , _req_impl(std::move(reqs))
+ {}
+
+ std::unique_ptr<char[], free_deleter> get_buffer() {
+ return allocate_aligned_buffer<char>(_buffer_size, _buffer_size);
+ }
+
+ future<> issue_request(char* buf) {
+ uint64_t pos = _pos_impl->get_pos();
+ return _req_impl->issue_request(pos, buf, _buffer_size).then([this, pos] (size_t size) {
+ auto now = iotune_clock::now();
+ _max_offset = std::max(_max_offset, pos + size);
+ if ((now > _start_measuring) && (now < _end_measuring)) {
+ _last_time_seen = now;
+ _bytes += size;
+ _requests++;
+ }
+ });
+ }
+
+ uint64_t max_offset() const noexcept { return _max_offset; }
+
+ io_rates get_io_rates() const {
+ io_rates rates;
+ auto t = _last_time_seen - _start_measuring;
+ if (!t.count()) {
+ throw std::runtime_error("No data collected");
+ }
+ rates.bytes_per_sec = _bytes / t.count();
+ rates.iops = _requests / t.count();
+ return rates;
+ }
+};
+
+class test_file {
+public:
+ enum class pattern { sequential, random };
+private:
+ fs::path _dirpath;
+ uint64_t _file_size;
+ file _file;
+
+ std::unique_ptr<position_generator> get_position_generator(size_t buffer_size, pattern access_pattern) {
+ if (access_pattern == pattern::sequential) {
+ return std::make_unique<sequential_issuer>(buffer_size, _file_size);
+ } else {
+ return std::make_unique<random_issuer>(buffer_size, _file_size);
+ }
+ }
+public:
+ test_file(const ::evaluation_directory& dir, uint64_t maximum_size)
+ : _dirpath(dir.path() / fs::path(fmt::format("ioqueue-discovery-{}", this_shard_id())))
+ , _file_size(maximum_size)
+ {}
+
+ future<> create_data_file() {
+ // XFS likes access in many directories better.
+ return make_directory(_dirpath.string()).then([this] {
+ auto testfile = _dirpath / fs::path("testfile");
+ file_open_options options;
+ options.extent_allocation_size_hint = _file_size;
+ return open_file_dma(testfile.string(), open_flags::rw | open_flags::create, std::move(options)).then([this, testfile] (file file) {
+ _file = file;
+ return remove_file(testfile.string()).then([this] {
+ return remove_file(_dirpath.string());
+ });
+ }).then([this] {
+ return _file.truncate(_file_size);
+ });
+ });
+ }
+
+ future<io_rates> do_workload(std::unique_ptr<io_worker> worker_ptr, unsigned max_os_concurrency, bool update_file_size = false) {
+ if (update_file_size) {
+ _file_size = 0;
+ }
+
+ auto worker = worker_ptr.get();
+ auto concurrency = boost::irange<unsigned, unsigned>(0, max_os_concurrency, 1);
+ return parallel_for_each(std::move(concurrency), [worker] (unsigned idx) {
+ auto bufptr = worker->get_buffer();
+ auto buf = bufptr.get();
+ return do_until([worker] { return worker->should_stop(); }, [buf, worker] {
+ return worker->issue_request(buf);
+ }).finally([alive = std::move(bufptr)] {});
+ }).then_wrapped([this, worker = std::move(worker_ptr), update_file_size] (future<> f) {
+ try {
+ f.get();
+ } catch (invalid_position& ip) {
+ // expected if sequential. Example: reading and the file ended.
+ if (!worker->is_sequential()) {
+ throw;
+ }
+ }
+
+ if (update_file_size) {
+ _file_size = worker->max_offset();
+ }
+ return make_ready_future<io_rates>(worker->get_io_rates());
+ });
+ }
+
+ future<io_rates> read_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
+ buffer_size = std::max(buffer_size, _file.disk_read_dma_alignment());
+ auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<read_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
+ return do_workload(std::move(worker), max_os_concurrency);
+ }
+
+ future<io_rates> write_workload(size_t buffer_size, pattern access_pattern, unsigned max_os_concurrency, std::chrono::duration<double> duration, std::vector<unsigned>& rates) {
+ buffer_size = std::max(buffer_size, _file.disk_write_dma_alignment());
+ auto worker = std::make_unique<io_worker>(buffer_size, duration, std::make_unique<write_request_issuer>(_file), get_position_generator(buffer_size, access_pattern), rates);
+ bool update_file_size = worker->is_sequential();
+ return do_workload(std::move(worker), max_os_concurrency, update_file_size).then([this] (io_rates r) {
+ return _file.flush().then([r = std::move(r)] () mutable {
+ return make_ready_future<io_rates>(std::move(r));
+ });
+ });
+ }
+
+ future<> stop() {
+ return _file.close();
+ }
+};
+
+class iotune_multi_shard_context {
+ ::evaluation_directory _test_directory;
+
+ unsigned per_shard_io_depth() const {
+ auto iodepth = _test_directory.max_iodepth() / smp::count;
+ if (this_shard_id() < _test_directory.max_iodepth() % smp::count) {
+ iodepth++;
+ }
+ return std::min(iodepth, 128u);
+ }
+ seastar::sharded<test_file> _iotune_test_file;
+
+ std::vector<unsigned> serial_rates;
+ seastar::sharded<std::vector<unsigned>> sharded_rates;
+
+public:
+ future<> stop() {
+ return _iotune_test_file.stop().then([this] { return sharded_rates.stop(); });
+ }
+
+ future<> start() {
+ return _iotune_test_file.start(_test_directory, _test_directory.available_space() / (2 * smp::count)).then([this] {
+ return sharded_rates.start();
+ });
+ }
+
+ future<row_stats> get_serial_rates() {
+ row_stats ret = get_row_stats_for<unsigned>(serial_rates);
+ serial_rates.clear();
+ return make_ready_future<row_stats>(ret);
+ }
+
+ future<row_stats> get_sharded_worst_rates() {
+ return sharded_rates.map_reduce0([] (std::vector<unsigned>& rates) {
+ row_stats ret = get_row_stats_for<unsigned>(rates);
+ rates.clear();
+ return ret;
+ }, row_stats{0, 0.0, 0.0},
+ [] (const row_stats& res, row_stats lres) {
+ return res.stdev < lres.stdev ? lres : res;
+ });
+ }
+
+ future<> create_data_file() {
+ return _iotune_test_file.invoke_on_all([] (test_file& tf) {
+ return tf.create_data_file();
+ });
+ }
+
+ future<io_rates> write_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
+ return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
+ return tf.write_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
+ });
+ }
+
+ future<io_rates> read_sequential_data(unsigned shard, size_t buffer_size, std::chrono::duration<double> duration) {
+ return _iotune_test_file.invoke_on(shard, [this, buffer_size, duration] (test_file& tf) {
+ return tf.read_workload(buffer_size, test_file::pattern::sequential, 4 * _test_directory.disks_per_array(), duration, serial_rates);
+ });
+ }
+
+ future<io_rates> write_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
+ return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
+ return tf.write_workload(buffer_size, test_file::pattern::random, per_shard_io_depth(), duration, sharded_rates.local());
+ }, io_rates(), std::plus<io_rates>());
+ }
+
+ future<io_rates> read_random_data(size_t buffer_size, std::chrono::duration<double> duration) {
+ return _iotune_test_file.map_reduce0([buffer_size, this, duration] (test_file& tf) {
+ return tf.read_workload(buffer_size, test_file::pattern::random, per_shard_io_depth(), duration, sharded_rates.local());
+ }, io_rates(), std::plus<io_rates>());
+ }
+
+private:
+ template <typename Fn>
+ future<uint64_t> saturate(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration, Fn&& workload) {
+ return _iotune_test_file.invoke_on(0, [this, rate_threshold, buffer_size, duration, workload] (test_file& tf) {
+ return (tf.*workload)(buffer_size, test_file::pattern::sequential, 1, duration, serial_rates).then([this, rate_threshold, buffer_size, duration, workload] (io_rates rates) {
+ serial_rates.clear();
+ if (rates.bytes_per_sec < rate_threshold) {
+ // The throughput with the given buffer-size is already "small enough", so
+ // return back its previous value
+ return make_ready_future<uint64_t>(buffer_size * 2);
+ } else {
+ return saturate(rate_threshold, buffer_size / 2, duration, workload);
+ }
+ });
+ });
+ }
+
+public:
+ future<uint64_t> saturate_write(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
+ return saturate(rate_threshold, buffer_size, duration, &test_file::write_workload);
+ }
+
+ future<uint64_t> saturate_read(float rate_threshold, size_t buffer_size, std::chrono::duration<double> duration) {
+ return saturate(rate_threshold, buffer_size, duration, &test_file::read_workload);
+ }
+
+ iotune_multi_shard_context(::evaluation_directory dir)
+ : _test_directory(dir)
+ {}
+};
+
+struct disk_descriptor {
+ std::string mountpoint;
+ uint64_t read_iops;
+ uint64_t read_bw;
+ uint64_t write_iops;
+ uint64_t write_bw;
+ std::optional<uint64_t> read_sat_len;
+ std::optional<uint64_t> write_sat_len;
+};
+
+void string_to_file(sstring conf_file, sstring buf) {
+ auto f = file_desc::open(conf_file, O_WRONLY | O_CLOEXEC | O_CREAT | O_TRUNC, 0664);
+ auto ret = f.write(buf.data(), buf.size());
+ if (!ret || (*ret != buf.size())) {
+ throw std::runtime_error(fmt::format("Can't write {}: {}", conf_file, *ret));
+ }
+}
+
+void write_configuration_file(sstring conf_file, std::string format, sstring properties_file) {
+ sstring buf;
+ if (format == "seastar") {
+ buf = fmt::format("io-properties-file={}\n", properties_file);
+ } else {
+ buf = fmt::format("SEASTAR_IO=\"--io-properties-file={}\"\n", properties_file);
+ }
+ string_to_file(conf_file, buf);
+}
+
+void write_property_file(sstring conf_file, std::vector<disk_descriptor> disk_descriptors) {
+ YAML::Emitter out;
+ out << YAML::BeginMap;
+ out << YAML::Key << "disks";
+ out << YAML::BeginSeq;
+ for (auto& desc : disk_descriptors) {
+ out << YAML::BeginMap;
+ out << YAML::Key << "mountpoint" << YAML::Value << desc.mountpoint;
+ out << YAML::Key << "read_iops" << YAML::Value << desc.read_iops;
+ out << YAML::Key << "read_bandwidth" << YAML::Value << desc.read_bw;
+ out << YAML::Key << "write_iops" << YAML::Value << desc.write_iops;
+ out << YAML::Key << "write_bandwidth" << YAML::Value << desc.write_bw;
+ if (desc.read_sat_len) {
+ out << YAML::Key << "read_saturation_length" << YAML::Value << *desc.read_sat_len;
+ }
+ if (desc.write_sat_len) {
+ out << YAML::Key << "write_saturation_length" << YAML::Value << *desc.write_sat_len;
+ }
+ out << YAML::EndMap;
+ }
+ out << YAML::EndSeq;
+ out << YAML::EndMap;
+ out << YAML::Newline;
+
+ string_to_file(conf_file, sstring(out.c_str(), out.size()));
+}
+
+// Returns the mountpoint of a path. It works by walking backwards from the canonical path
+// (absolute, with symlinks resolved), until we find a point that crosses a device ID.
+fs::path mountpoint_of(sstring filename) {
+ fs::path mnt_candidate = fs::canonical(fs::path(filename));
+ std::optional<dev_t> candidate_id = {};
+ auto current = mnt_candidate;
+ do {
+ auto f = open_directory(current.string()).get0();
+ auto st = f.stat().get0();
+ if ((candidate_id) && (*candidate_id != st.st_dev)) {
+ return mnt_candidate;
+ }
+ mnt_candidate = current;
+ candidate_id = st.st_dev;
+ current = current.parent_path();
+ } while (mnt_candidate != current);
+
+ return mnt_candidate;
+}
+
+int main(int ac, char** av) {
+ namespace bpo = boost::program_options;
+ bool fs_check = false;
+
+ app_template::config app_cfg;
+ app_cfg.name = "IOTune";
+
+ app_template app(std::move(app_cfg));
+ auto opt_add = app.add_options();
+ opt_add
+ ("evaluation-directory", bpo::value<std::vector<sstring>>()->required(), "directory where to execute the evaluation")
+ ("properties-file", bpo::value<sstring>(), "path in which to write the YAML file")
+ ("options-file", bpo::value<sstring>(), "path in which to write the legacy conf file")
+ ("duration", bpo::value<unsigned>()->default_value(120), "time, in seconds, for which to run the test")
+ ("format", bpo::value<sstring>()->default_value("seastar"), "Configuration file format (seastar | envfile)")
+ ("fs-check", bpo::bool_switch(&fs_check), "perform FS check only")
+ ("accuracy", bpo::value<unsigned>()->default_value(3), "acceptable deviation of measurements (percents)")
+ ("saturation", bpo::value<sstring>()->default_value(""), "measure saturation lengths (read | write | both) (this is very slow!)")
+ ;
+
+ return app.run(ac, av, [&] {
+ return seastar::async([&] {
+ auto& configuration = app.configuration();
+ auto eval_dirs = configuration["evaluation-directory"].as<std::vector<sstring>>();
+ auto format = configuration["format"].as<sstring>();
+ auto duration = std::chrono::duration<double>(configuration["duration"].as<unsigned>() * 1s);
+ auto accuracy = configuration["accuracy"].as<unsigned>();
+ auto saturation = configuration["saturation"].as<sstring>();
+
+ bool read_saturation, write_saturation;
+ if (saturation == "") {
+ read_saturation = false;
+ write_saturation = false;
+ } else if (saturation == "both") {
+ read_saturation = true;
+ write_saturation = true;
+ } else if (saturation == "read") {
+ read_saturation = true;
+ write_saturation = false;
+ } else if (saturation == "write") {
+ read_saturation = false;
+ write_saturation = true;
+ } else {
+ fmt::print("Bad --saturation value\n");
+ return 1;
+ }
+
+ std::vector<disk_descriptor> disk_descriptors;
+ std::unordered_map<sstring, sstring> mountpoint_map;
+ // We want to evaluate once per mountpoint, but we still want to write in one of the
+ // directories that we were provided - we may not have permissions to write into the
+ // mountpoint itself. If we are passed more than one directory per mountpoint, we don't
+ // really care to which one we write, so this simple hash will do.
+ for (auto& eval_dir : eval_dirs) {
+ mountpoint_map[mountpoint_of(eval_dir).string()] = eval_dir;
+ }
+ for (auto eval: mountpoint_map) {
+ auto mountpoint = eval.first;
+ auto eval_dir = eval.second;
+
+ if (!filesystem_has_good_aio_support(eval_dir, false)) {
+ iotune_logger.error("Exception when qualifying filesystem at {}", eval_dir);
+ return 1;
+ }
+
+ auto rec = 10000000000ULL;
+ auto avail = fs_avail(eval_dir).get0();
+ if (avail < rec) {
+ uint64_t val;
+ const char* units;
+ if (avail >= 1000000000) {
+ val = (avail + 500000000) / 1000000000;
+ units = "GB";
+ } else if (avail >= 1000000) {
+ val = (avail + 500000) / 1000000;
+ units = "MB";
+ } else {
+ val = avail;
+ units = "bytes";
+ }
+ iotune_logger.warn("Available space on filesystem at {}: {} {}: is less than recommended: {} GB",
+ eval_dir, val, units, rec / 1000000000ULL);
+ }
+
+ iotune_logger.info("{} passed sanity checks", eval_dir);
+ if (fs_check) {
+ continue;
+ }
+
+ // Directory is the same object for all tests.
+ ::evaluation_directory test_directory(eval_dir);
+ test_directory.discover_directory().get();
+ iotune_logger.info("Disk parameters: max_iodepth={} disks_per_array={} minimum_io_size={}",
+ test_directory.max_iodepth(), test_directory.disks_per_array(), test_directory.minimum_io_size());
+
+ ::iotune_multi_shard_context iotune_tests(test_directory);
+ iotune_tests.start().get();
+ auto stop = defer([&iotune_tests] () noexcept {
+ try {
+ iotune_tests.stop().get();
+ } catch (...) {
+ fmt::print("Error occurred during iotune context shutdown: {}", std::current_exception());
+ abort();
+ }
+ });
+
+ row_stats rates;
+ auto accuracy_msg = [accuracy, &rates] {
+ auto stdev = rates.stdev_percents() * 100.0;
+ return (accuracy == 0 || stdev > accuracy) ? fmt::format(" (deviation {}%)", int(round(stdev))) : std::string("");
+ };
+
+ iotune_tests.create_data_file().get();
+
+ fmt::print("Starting Evaluation. This may take a while...\n");
+ fmt::print("Measuring sequential write bandwidth: ");
+ std::cout.flush();
+ io_rates write_bw;
+ size_t sequential_buffer_size = 1 << 20;
+ for (unsigned shard = 0; shard < smp::count; ++shard) {
+ write_bw += iotune_tests.write_sequential_data(shard, sequential_buffer_size, duration * 0.70 / smp::count).get0();
+ }
+ write_bw.bytes_per_sec /= smp::count;
+ rates = iotune_tests.get_serial_rates().get0();
+ fmt::print("{} MB/s{}\n", uint64_t(write_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());
+
+ std::optional<uint64_t> write_sat;
+
+ if (write_saturation) {
+ fmt::print("Measuring write saturation length: ");
+ std::cout.flush();
+ write_sat = iotune_tests.saturate_write(write_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size/2, duration * 0.70).get0();
+ fmt::print("{}\n", *write_sat);
+ }
+
+ fmt::print("Measuring sequential read bandwidth: ");
+ std::cout.flush();
+ auto read_bw = iotune_tests.read_sequential_data(0, sequential_buffer_size, duration * 0.1).get0();
+ rates = iotune_tests.get_serial_rates().get0();
+ fmt::print("{} MB/s{}\n", uint64_t(read_bw.bytes_per_sec / (1024 * 1024)), accuracy_msg());
+
+ std::optional<uint64_t> read_sat;
+
+ if (read_saturation) {
+ fmt::print("Measuring read saturation length: ");
+ std::cout.flush();
+ read_sat = iotune_tests.saturate_read(read_bw.bytes_per_sec * (1.0 - rates.stdev_percents()), sequential_buffer_size/2, duration * 0.1).get0();
+ fmt::print("{}\n", *read_sat);
+ }
+
+ fmt::print("Measuring random write IOPS: ");
+ std::cout.flush();
+ auto write_iops = iotune_tests.write_random_data(test_directory.minimum_io_size(), duration * 0.1).get0();
+ rates = iotune_tests.get_sharded_worst_rates().get0();
+ fmt::print("{} IOPS{}\n", uint64_t(write_iops.iops), accuracy_msg());
+
+ fmt::print("Measuring random read IOPS: ");
+ std::cout.flush();
+ auto read_iops = iotune_tests.read_random_data(test_directory.minimum_io_size(), duration * 0.1).get0();
+ rates = iotune_tests.get_sharded_worst_rates().get0();
+ fmt::print("{} IOPS{}\n", uint64_t(read_iops.iops), accuracy_msg());
+
+ struct disk_descriptor desc;
+ desc.mountpoint = mountpoint;
+ desc.read_iops = read_iops.iops;
+ desc.read_bw = read_bw.bytes_per_sec;
+ desc.read_sat_len = read_sat;
+ desc.write_iops = write_iops.iops;
+ desc.write_bw = write_bw.bytes_per_sec;
+ desc.write_sat_len = write_sat;
+ disk_descriptors.push_back(std::move(desc));
+ }
+
+ if (fs_check) {
+ return 0;
+ }
+
+ auto file = "properties file";
+ try {
+ if (configuration.count("properties-file")) {
+ fmt::print("Writing result to {}\n", configuration["properties-file"].as<sstring>());
+ write_property_file(configuration["properties-file"].as<sstring>(), disk_descriptors);
+ }
+
+ file = "configuration file";
+ if (configuration.count("options-file")) {
+ fmt::print("Writing result to {}\n", configuration["options-file"].as<sstring>());
+ write_configuration_file(configuration["options-file"].as<sstring>(), format, configuration["properties-file"].as<sstring>());
+ }
+ } catch (...) {
+ iotune_logger.error("Exception when writing {}: {}.\nPlease add the above values manually to your seastar command line.", file, std::current_exception());
+ return 1;
+ }
+ return 0;
+ });
+ });
+}