summaryrefslogtreecommitdiffstats
path: root/src/libutil/cxx
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
commit133a45c109da5310add55824db21af5239951f93 (patch)
treeba6ac4c0a950a0dda56451944315d66409923918 /src/libutil/cxx
parentInitial commit. (diff)
downloadrspamd-upstream.tar.xz
rspamd-upstream.zip
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/libutil/cxx')
-rw-r--r--src/libutil/cxx/error.hxx161
-rw-r--r--src/libutil/cxx/file_util.cxx457
-rw-r--r--src/libutil/cxx/file_util.hxx312
-rw-r--r--src/libutil/cxx/hash_util.hxx109
-rw-r--r--src/libutil/cxx/local_shared_ptr.hxx440
-rw-r--r--src/libutil/cxx/utf8_util.cxx421
-rw-r--r--src/libutil/cxx/utf8_util.h85
-rw-r--r--src/libutil/cxx/util.hxx238
-rw-r--r--src/libutil/cxx/util_tests.cxx82
9 files changed, 2305 insertions, 0 deletions
diff --git a/src/libutil/cxx/error.hxx b/src/libutil/cxx/error.hxx
new file mode 100644
index 0000000..4689d42
--- /dev/null
+++ b/src/libutil/cxx/error.hxx
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2024 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_ERROR_HXX
+#define RSPAMD_ERROR_HXX
+#pragma once
+
+#include "config.h"
+#include <string>
+#include <string_view>
+#include <cstdint>
+#include <optional>
+
+/***
+ * This unit is used to represent Rspamd C++ errors in a way to interoperate
+ * with C code if needed and avoid allocations for static strings
+ */
+namespace rspamd::util {
+
+/**
+ * Category attached to every `error` value; stored in a single byte
+ * (the underlying type is std::uint8_t). The `error` constructors
+ * use INFORMAL when no category is passed explicitly.
+ */
+enum class error_category : std::uint8_t {
+ INFORMAL,
+ IMPORTANT,
+ CRITICAL
+};
+
+/**
+ * An error value made of a message, an integer code and a category.
+ *
+ * The message is exposed as a `std::string_view`. It either refers to a
+ * caller-provided static string (no allocation) or to `static_storage`,
+ * a string owned by this object. Whenever `static_storage` holds a value,
+ * `error_message` must point into it; the copy/move operations below
+ * re-bind the view after every transfer to keep that invariant.
+ */
+struct error {
+public:
+    /**
+     * Construct from a static string, this string must live long enough to outlive this object
+     * @param msg message text; it is referenced, not copied
+     * @param code numeric error code
+     * @param category error category
+     */
+    error(const char *msg, int code, error_category category = error_category::INFORMAL)
+        : error_message(msg), error_code(code), category(category)
+    {
+    }
+    /**
+     * Construct error from a temporary string taking ownership of it
+     * @param msg message text, moved into the own storage
+     * @param code numeric error code
+     * @param category error category
+     */
+    error(std::string &&msg, int code, error_category category = error_category::INFORMAL)
+        : error_code(code), category(category), static_storage(std::move(msg))
+    {
+        error_message = *static_storage;
+    }
+    /**
+     * Construct error from another string copying it into own storage
+     * @param msg message text, copied
+     * @param code numeric error code
+     * @param category error category
+     */
+    error(const std::string &msg, int code, error_category category = error_category::INFORMAL)
+        : error_code(code), category(category), static_storage(msg)
+    {
+        error_message = *static_storage;
+    }
+
+    /**
+     * Copy constructor: an owned message is duplicated, a static one is shared
+     */
+    error(const error &other)
+        : error_code(other.error_code), category(other.category)
+    {
+        if (other.static_storage) {
+            static_storage = other.static_storage;
+            error_message = *static_storage;
+        }
+        else {
+            error_message = other.error_message;
+        }
+    }
+
+    /**
+     * Move constructor. The scalar members are given defined values first
+     * because the move assignment below swaps them with `other`
+     * (swapping uninitialised members would read indeterminate values).
+     */
+    error(error &&other) noexcept
+        : error_code(0), category(error_category::INFORMAL)
+    {
+        *this = std::move(other);
+    }
+
+    /**
+     * Move assignment implemented as a full swap of both objects.
+     * After the swap each side re-binds its view to its own storage (if any),
+     * so neither object keeps a view into a string owned by the other one.
+     */
+    error &operator=(error &&other) noexcept
+    {
+        if (this == &other) {
+            return *this;
+        }
+
+        std::swap(static_storage, other.static_storage);
+        std::swap(error_message, other.error_message);
+
+        /* Swapping strings may relocate short (inline) buffers: refresh views */
+        if (static_storage.has_value()) {
+            error_message = *static_storage;
+        }
+        if (other.static_storage.has_value()) {
+            other.error_message = *other.static_storage;
+        }
+
+        std::swap(other.error_code, error_code);
+        std::swap(other.category, category);
+
+        return *this;
+    }
+
+    /**
+     * Convert into GError using the "rspamd" quark
+     * @return newly allocated GError
+     */
+    auto into_g_error() const -> GError *
+    {
+        return into_g_error(g_quark_from_static_string("rspamd"));
+    }
+
+    /**
+     * Convenience alias for the `into_g_error`
+     * @param err target; it is set only if it is non-null and currently holds nullptr
+     */
+    auto into_g_error_set(GError **err) const -> void
+    {
+        if (err && *err == nullptr) {
+            *err = into_g_error();
+        }
+    }
+
+    /**
+     * Convert into GError using the specified quark
+     * @param quark error domain
+     * @return newly allocated GError
+     */
+    auto into_g_error(GQuark quark) const -> GError *
+    {
+        return g_error_new(quark, error_code, "%s",
+                           error_message.data());
+    }
+
+    /**
+     * Convenience alias for the `into_g_error`
+     * @param quark error domain
+     * @param err target; it is set only if it is non-null and currently holds nullptr
+     */
+    auto into_g_error_set(GQuark quark, GError **err) const -> void
+    {
+        if (err && *err == nullptr) {
+            *err = into_g_error(quark);
+        }
+    }
+
+public:
+    std::string_view error_message;
+    int error_code;
+    error_category category;
+
+private:
+    /* Owned copy of the message; empty when the message is a static string */
+    std::optional<std::string> static_storage;
+};
+
+}// namespace rspamd::util
+
+#endif//RSPAMD_ERROR_HXX
diff --git a/src/libutil/cxx/file_util.cxx b/src/libutil/cxx/file_util.cxx
new file mode 100644
index 0000000..2f031f0
--- /dev/null
+++ b/src/libutil/cxx/file_util.cxx
@@ -0,0 +1,457 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "file_util.hxx"
+#include <fmt/core.h>
+#include "libutil/util.h"
+#include "libutil/unix-std.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+
+#include "doctest/doctest.h"
+
+namespace rspamd::util {
+
+/**
+ * Open an existing file and fill its stat information.
+ * O_CLOEXEC is added to the flags where the platform defines it.
+ * @param fname path to open; nullptr yields an EINVAL error
+ * @param flags flags passed to open(2)
+ * @return the opened file or an error carrying the errno of the failed call
+ */
+auto raii_file::open(const char *fname, int flags) -> tl::expected<raii_file, error>
+{
+    if (fname == nullptr) {
+        return tl::make_unexpected(error{"cannot open file; filename is nullptr", EINVAL, error_category::CRITICAL});
+    }
+
+    int oflags = flags;
+#ifdef O_CLOEXEC
+    oflags |= O_CLOEXEC;
+#endif
+
+    auto fd = ::open(fname, oflags);
+
+    if (fd == -1) {
+        /* Save errno at once: building the message may overwrite it */
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot open file {}: {}", fname, ::strerror(saved_errno)), saved_errno});
+    }
+
+    auto ret = raii_file{fname, fd, false};
+
+    if (fstat(ret.fd, &ret.st) == -1) {
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", fname, ::strerror(saved_errno)), saved_errno});
+    }
+
+    return ret;
+}
+
+/**
+ * Open a file creating it if needed (O_CREAT is always added) and fill its stat information.
+ * O_CLOEXEC is added to the flags where the platform defines it.
+ * @param fname path to create; nullptr yields an EINVAL error
+ * @param flags flags passed to open(2)
+ * @param perms permission bits passed to open(2)
+ * @return the opened file or an error carrying the errno of the failed call
+ */
+auto raii_file::create(const char *fname, int flags, int perms) -> tl::expected<raii_file, error>
+{
+    if (fname == nullptr) {
+        return tl::make_unexpected(error{"cannot create file; filename is nullptr", EINVAL, error_category::CRITICAL});
+    }
+
+    int oflags = flags | O_CREAT;
+#ifdef O_CLOEXEC
+    oflags |= O_CLOEXEC;
+#endif
+
+    auto fd = ::open(fname, oflags, perms);
+
+    if (fd == -1) {
+        /* Save errno at once: building the message may overwrite it */
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot create file {}: {}", fname, ::strerror(saved_errno)), saved_errno});
+    }
+
+    auto ret = raii_file{fname, fd, false};
+
+    if (fstat(ret.fd, &ret.st) == -1) {
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", fname, ::strerror(saved_errno)), saved_errno});
+    }
+
+    return ret;
+}
+
+/**
+ * Exclusively create a temporary file: the returned object is marked as
+ * temporary, so its destructor unlinks the file before closing it.
+ * O_CREAT | O_EXCL are always applied; O_CLOEXEC is added where available.
+ * @param fname path to create; nullptr yields an EINVAL error
+ * @param flags flags passed to open(2)
+ * @param perms permission bits passed to open(2)
+ * @return the created file or an error carrying the errno of the failed call
+ */
+auto raii_file::create_temp(const char *fname, int flags, int perms) -> tl::expected<raii_file, error>
+{
+    if (fname == nullptr) {
+        return tl::make_unexpected(error{"cannot open file; filename is nullptr", EINVAL, error_category::CRITICAL});
+    }
+
+    /* Creation flags must not depend on O_CLOEXEC being available */
+    int oflags = flags | O_CREAT | O_EXCL;
+#ifdef O_CLOEXEC
+    oflags |= O_CLOEXEC;
+#endif
+
+    auto fd = ::open(fname, oflags, perms);
+
+    if (fd == -1) {
+        /* Save errno at once: building the message may overwrite it */
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot create file {}: {}", fname, ::strerror(saved_errno)), saved_errno});
+    }
+
+    auto ret = raii_file{fname, fd, true};
+
+    if (fstat(ret.fd, &ret.st) == -1) {
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", fname, ::strerror(saved_errno)), saved_errno});
+    }
+
+    return ret;
+}
+
+/**
+ * Create a temporary file from a template via g_mkstemp_full. The returned
+ * object is marked as temporary, so its destructor unlinks the file.
+ * @param pattern file name template; it is copied, the caller's string is not modified
+ * @param flags extra flags for the underlying open call
+ * @param perms permission bits of the created file
+ * @return the created file or an error carrying the errno of the failed call
+ */
+auto raii_file::mkstemp(const char *pattern, int flags, int perms) -> tl::expected<raii_file, error>
+{
+    if (pattern == nullptr) {
+        return tl::make_unexpected(error{"cannot open file; pattern is nullptr", EINVAL, error_category::CRITICAL});
+    }
+
+    /* Creation flags must not depend on O_CLOEXEC being available */
+    int oflags = flags | O_CREAT | O_EXCL;
+#ifdef O_CLOEXEC
+    oflags |= O_CLOEXEC;
+#endif
+
+    /* g_mkstemp_full rewrites the template in place, so work on a copy */
+    std::string mutable_pattern = pattern;
+
+    auto fd = g_mkstemp_full(mutable_pattern.data(), oflags, perms);
+
+    if (fd == -1) {
+        /* Save errno at once: building the message may overwrite it */
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot create file {}: {}", pattern, ::strerror(saved_errno)), saved_errno});
+    }
+
+    auto ret = raii_file{mutable_pattern.c_str(), fd, true};
+
+    if (fstat(ret.fd, &ret.st) == -1) {
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}",
+                                                     mutable_pattern, ::strerror(saved_errno)),
+                                         saved_errno});
+    }
+
+    return ret;
+}
+
+/* Releases the descriptor; a temporary file is unlinked before it is closed */
+raii_file::~raii_file() noexcept
+{
+    if (fd == -1) {
+        /* Nothing is owned (e.g. the object has been moved from) */
+        return;
+    }
+
+    if (temp) {
+        (void) unlink(fname.c_str());
+    }
+
+    close(fd);
+}
+
+/* Re-reads the stat structure from the open descriptor; true when fstat succeeded */
+auto raii_file::update_stat() noexcept -> bool
+{
+    const auto rc = fstat(fd, &st);
+
+    return rc != -1;
+}
+
+/*
+ * Wraps an already opened descriptor. The name is stored in the form
+ * produced by rspamd_normalize_path_inplace.
+ */
+raii_file::raii_file(const char *fname, int fd, bool temp)
+    : fd(fd), temp(temp), fname(fname)
+{
+    /* Normalize path */
+    std::size_t normalized_len;
+    auto &stored = this->fname;
+
+    rspamd_normalize_path_inplace(stored.data(), stored.size(), &normalized_len);
+    stored.resize(normalized_len);
+}
+
+
+/* Drops the lock if a descriptor is still owned by this object */
+raii_locked_file::~raii_locked_file() noexcept
+{
+    if (fd == -1) {
+        return;
+    }
+
+    (void) rspamd_file_unlock(fd, FALSE);
+}
+
+/**
+ * Lock an already opened file and wrap it into a locked file object.
+ * @param unlocked file to lock; it is consumed only if locking succeeds
+ * @return the locked file or an error carrying the errno of the failed lock
+ */
+auto raii_locked_file::lock_raii_file(raii_file &&unlocked) -> tl::expected<raii_locked_file, error>
+{
+    if (!rspamd_file_lock(unlocked.get_fd(), TRUE)) {
+        /* Save errno at once: building the message may overwrite it */
+        const auto saved_errno = errno;
+        return tl::make_unexpected(
+            error{fmt::format("cannot lock file {}: {}", unlocked.get_name(), ::strerror(saved_errno)), saved_errno});
+    }
+
+    return raii_locked_file{std::move(unlocked)};
+}
+
+/* Releases the lock and hands the underlying file over to the caller */
+auto raii_locked_file::unlock() -> raii_file
+{
+    const bool owns_descriptor = (fd != -1);
+
+    if (owns_descriptor) {
+        (void) rspamd_file_unlock(fd, FALSE);
+    }
+
+    /* Move out the base part only */
+    raii_file &base = *this;
+
+    return raii_file{std::move(base)};
+}
+
+raii_mmaped_file::raii_mmaped_file(raii_file &&file, void *map, std::size_t sz)
+ : file(std::move(file)), map(map), map_size(sz)
+{
+}
+
+/**
+ * Map an opened file with MAP_SHARED starting at `offset` up to the end of file.
+ * @param file file to map; ownership is transferred to the result on success
+ * @param flags protection flags passed to mmap (e.g. PROT_READ)
+ * @param offset offset within the file, must lie in [0, file size]
+ * @return the mapped file or an error (EINVAL for a bad offset, errno otherwise)
+ */
+auto raii_mmaped_file::mmap_shared(raii_file &&file,
+                                   int flags, std::int64_t offset) -> tl::expected<raii_mmaped_file, error>
+{
+    /* Update stat on file first, so the offset is checked against the up-to-date size */
+    if (!file.update_stat()) {
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}",
+                                                     file.get_name(), ::strerror(saved_errno)),
+                                         saved_errno});
+    }
+
+    if (offset < 0 || file.get_stat().st_size < offset) {
+        return tl::make_unexpected(error{
+            fmt::format("cannot mmap file {} due to incorrect offset; offset={}, size={}",
+                        file.get_name(), offset, file.get_size()),
+            EINVAL});
+    }
+
+    /* Computed once, so the mapped length and the stored length always agree */
+    const auto map_len = (std::size_t)(file.get_size() - offset);
+    void *map = mmap(nullptr, map_len, flags, MAP_SHARED, file.get_fd(), offset);
+
+    if (map == MAP_FAILED) {
+        /* Save errno at once: building the message may overwrite it */
+        const auto saved_errno = errno;
+        return tl::make_unexpected(error{fmt::format("cannot mmap file {}: {}",
+                                                     file.get_name(), ::strerror(saved_errno)),
+                                         saved_errno});
+    }
+
+    return raii_mmaped_file{std::move(file), map, map_len};
+}
+
+/* Convenience overload: opens `fname` with `open_flags` and maps the resulting file */
+auto raii_mmaped_file::mmap_shared(const char *fname, int open_flags,
+                                   int mmap_flags, std::int64_t offset) -> tl::expected<raii_mmaped_file, error>
+{
+    auto opened = raii_file::open(fname, open_flags);
+
+    if (opened.has_value()) {
+        return raii_mmaped_file::mmap_shared(std::move(*opened), mmap_flags, offset);
+    }
+
+    /* Propagate the open error unchanged */
+    return tl::make_unexpected(opened.error());
+}
+
+/* Unmaps the region unless the mapping pointer has been reset to nullptr */
+raii_mmaped_file::~raii_mmaped_file()
+{
+    if (map == nullptr) {
+        return;
+    }
+
+    munmap(map, map_size);
+}
+
+/*
+ * Move constructor: takes over the file and the mapping and leaves `other`
+ * without a mapping, so its destructor does not unmap the region.
+ * Members are initialised directly instead of being swapped, because this
+ * object's `map_size` has no defined value before construction.
+ */
+raii_mmaped_file::raii_mmaped_file(raii_mmaped_file &&other) noexcept
+    : file(std::move(other.file)), map(other.map), map_size(other.map_size)
+{
+    other.map = nullptr;
+    other.map_size = 0;
+}
+
+/*
+ * Creates and locks the intermediate file "<fname>.<suffix>"; the sink
+ * remembers both the intermediate and the final name.
+ */
+auto raii_file_sink::create(const char *fname, int flags, int perms,
+                            const char *suffix) -> tl::expected<raii_file_sink, error>
+{
+    if (fname == nullptr || suffix == nullptr) {
+        return tl::make_unexpected(error{"cannot create file; filename is nullptr", EINVAL, error_category::CRITICAL});
+    }
+
+    auto tmp_fname = fmt::format("{}.{}", fname, suffix);
+    auto locked = raii_locked_file::create(tmp_fname.c_str(), flags, perms);
+
+    if (locked.has_value()) {
+        return raii_file_sink{std::move(*locked), fname, std::move(tmp_fname)};
+    }
+
+    return tl::make_unexpected(locked.error());
+}
+
+/*
+ * Renames the intermediate file to the final name.
+ * Returns false if it was already done or if rename fails.
+ */
+auto raii_file_sink::write_output() -> bool
+{
+    /* We cannot write output twice */
+    if (success) {
+        return false;
+    }
+
+    const bool renamed = rename(tmp_fname.c_str(), output_fname.c_str()) != -1;
+
+    if (renamed) {
+        success = true;
+    }
+
+    return renamed;
+}
+
+/* Removes the intermediate file unless it has been renamed to the final name */
+raii_file_sink::~raii_file_sink()
+{
+    if (success) {
+        return;
+    }
+
+    /* Unlink sink */
+    unlink(tmp_fname.c_str());
+}
+
+raii_file_sink::raii_file_sink(raii_locked_file &&_file, const char *_output, std::string &&_tmp_fname)
+ : file(std::move(_file)), output_fname(_output), tmp_fname(std::move(_tmp_fname)), success(false)
+{
+}
+
+raii_file_sink::raii_file_sink(raii_file_sink &&other) noexcept
+ : file(std::move(other.file)),
+ output_fname(std::move(other.output_fname)),
+ tmp_fname(std::move(other.tmp_fname)),
+ success(other.success)
+{
+}
+
+namespace tests {
+/*
+ * Test helper: reads the file content from its beginning.
+ * The buffer is sized from f.get_size() and trimmed to what read() returned.
+ */
+template<class T>
+static auto test_read_file(const T &f)
+{
+    auto fd = f.get_fd();
+    (void) ::lseek(fd, 0, SEEK_SET);
+    /* std::string(count, ch): the count comes first, then the fill character */
+    std::string buf((std::size_t) f.get_size(), '\0');
+    auto nread = ::read(fd, buf.data(), buf.size());
+    /* Keep only the bytes actually read (nothing on error) */
+    buf.resize(nread > 0 ? (std::size_t) nread : 0);
+    return buf;
+}
+/* Test helper: rewinds the descriptor and writes `buf`, returning the result of write() */
+template<class T>
+static auto test_write_file(const T &f, const std::string_view &buf)
+{
+    const auto descriptor = f.get_fd();
+
+    (void) ::lseek(descriptor, 0, SEEK_SET);
+    const auto written = ::write(descriptor, buf.data(), buf.size());
+
+    return written;
+}
+/*
+ * Test helper: builds "<TMPDIR or /tmp>/<32 random hex chars>[.<extension>]".
+ * Only the name is produced, no file is created.
+ */
+auto random_fname(std::string_view extension)
+{
+    const char *base_dir = getenv("TMPDIR");
+    if (base_dir == nullptr) {
+        base_dir = G_DIR_SEPARATOR_S "tmp";
+    }
+
+    char hexbuf[32];
+    rspamd_random_hex(hexbuf, sizeof(hexbuf));
+
+    std::string out_fname{base_dir};
+    out_fname += G_DIR_SEPARATOR_S;
+    /* The hex buffer is not NUL-terminated here, so append with explicit length */
+    out_fname.append(hexbuf, sizeof(hexbuf));
+
+    if (!extension.empty()) {
+        out_fname.append(".");
+        out_fname.append(extension);
+    }
+
+    return out_fname;
+}
+TEST_SUITE("loked files utils")
+{
+
+ // Checks that a file made by create_temp exists while its owner is alive
+ // and is removed once the owner goes out of scope; the second round
+ // verifies the same name can be created again afterwards.
+ TEST_CASE("create and delete file")
+ {
+ auto fname = random_fname("tmp");
+ {
+ auto raii_locked_file = raii_locked_file::create_temp(fname.c_str(), O_RDONLY, 00600);
+ CHECK(raii_locked_file.has_value());
+ CHECK(raii_locked_file.value().get_extension() == "tmp");
+ CHECK(::access(fname.c_str(), R_OK) == 0);
+ }
+ // File must be deleted after this call
+ auto ret = ::access(fname.c_str(), R_OK);
+ // Save errno before any CHECK can change it
+ auto serrno = errno;
+ CHECK(ret == -1);
+ CHECK(serrno == ENOENT);
+ // Create one more time
+ {
+ auto raii_locked_file = raii_locked_file::create_temp(fname.c_str(), O_RDONLY, 00600);
+ CHECK(raii_locked_file.has_value());
+ CHECK(::access(fname.c_str(), R_OK) == 0);
+ }
+ ret = ::access(fname.c_str(), R_OK);
+ serrno = errno;
+ CHECK(ret == -1);
+ CHECK(serrno == ENOENT);
+ }
+
+ // Checks that a second locked open of the same path fails while the first
+ // locked handle is alive, and that the failed attempt does not remove the file.
+ TEST_CASE("check lock")
+ {
+ auto fname = random_fname("");
+ {
+ auto raii_locked_file = raii_locked_file::create_temp(fname.c_str(), O_RDONLY, 00600);
+ CHECK(raii_locked_file.has_value());
+ CHECK(raii_locked_file.value().get_extension() == "");
+ CHECK(::access(fname.c_str(), R_OK) == 0);
+ // The first handle is still in scope, so this attempt is expected to fail
+ auto raii_locked_file2 = raii_locked_file::open(fname.c_str(), O_RDONLY);
+ CHECK(!raii_locked_file2.has_value());
+ CHECK(::access(fname.c_str(), R_OK) == 0);
+ }
+ // File must be deleted after this call
+ auto ret = ::access(fname.c_str(), R_OK);
+ // Save errno before any CHECK can change it
+ auto serrno = errno;
+ CHECK(ret == -1);
+ CHECK(serrno == ENOENT);
+ }
+
+ // Test helper: returns TMPDIR (or "/tmp"-like default) passed through
+ // rspamd_normalize_path_inplace and terminated with a directory separator.
+ auto get_tmpdir()->std::string
+ {
+     const char *base_dir = getenv("TMPDIR");
+     std::string normalized = (base_dir != nullptr) ? base_dir : G_DIR_SEPARATOR_S "tmp";
+
+     std::size_t normalized_len;
+     rspamd_normalize_path_inplace(normalized.data(), normalized.size(), &normalized_len);
+     normalized.resize(normalized_len);
+
+     const bool has_trailing_sep = normalized.ends_with(G_DIR_SEPARATOR);
+     if (!has_trailing_sep) {
+         normalized += G_DIR_SEPARATOR;
+     }
+
+     return normalized;
+ }
+
+ // Checks a locked file created from a mkstemp pattern: it lives in the
+ // expected directory, cannot be opened with a lock a second time while
+ // the first handle is alive, and is removed when the handle is destroyed.
+ TEST_CASE("tempfile")
+ {
+ std::string tmpname;
+ const std::string tmpdir{get_tmpdir()};
+ {
+ auto raii_locked_file = raii_locked_file::mkstemp(std::string(tmpdir + G_DIR_SEPARATOR_S + "doctest-XXXXXXXX").c_str(),
+ O_RDONLY, 00600);
+ CHECK(raii_locked_file.has_value());
+ CHECK(raii_locked_file.value().get_dir() == tmpdir);
+ CHECK(access(raii_locked_file.value().get_name().data(), R_OK) == 0);
+ auto raii_locked_file2 = raii_locked_file::open(raii_locked_file.value().get_name().data(), O_RDONLY);
+ CHECK(!raii_locked_file2.has_value());
+ CHECK(access(raii_locked_file.value().get_name().data(), R_OK) == 0);
+ // Remember the generated name: it is needed after the handle is gone
+ tmpname = raii_locked_file.value().get_name();
+ }
+ // File must be deleted after this call
+ auto ret = ::access(tmpname.c_str(), R_OK);
+ // Save errno before any CHECK can change it
+ auto serrno = errno;
+ CHECK(ret == -1);
+ CHECK(serrno == ENOENT);
+ }
+
+ // Checks shared mappings: a write through the first (read-write) mapping
+ // must be visible through a second, independently opened read-only mapping
+ // of the same file, and the temporary file must disappear afterwards.
+ TEST_CASE("mmap")
+ {
+ std::string tmpname;
+ const std::string tmpdir{get_tmpdir()};
+ {
+ auto raii_file = raii_file::mkstemp(std::string(tmpdir + G_DIR_SEPARATOR_S + "doctest-XXXXXXXX").c_str(),
+ O_RDWR | O_CREAT | O_EXCL, 00600);
+ CHECK(raii_file.has_value());
+ CHECK(raii_file->get_dir() == tmpdir);
+ CHECK(access(raii_file->get_name().data(), R_OK) == 0);
+ tmpname = std::string{raii_file->get_name()};
+ char payload[] = {'1', '2', '3'};
+ CHECK(write(raii_file->get_fd(), payload, sizeof(payload)) == sizeof(payload));
+ auto mmapped_file1 = raii_mmaped_file::mmap_shared(std::move(raii_file.value()), PROT_READ | PROT_WRITE);
+ CHECK(mmapped_file1.has_value());
+ // The descriptor now belongs to the mapping object
+ CHECK(!raii_file->is_valid());
+ CHECK(mmapped_file1->get_size() == sizeof(payload));
+ CHECK(memcmp(mmapped_file1->get_map(), payload, sizeof(payload)) == 0);
+ // Change the first byte through the mapping: content becomes "223"
+ *(char *) mmapped_file1->get_map() = '2';
+ auto mmapped_file2 = raii_mmaped_file::mmap_shared(tmpname.c_str(), O_RDONLY, PROT_READ);
+ CHECK(mmapped_file2.has_value());
+ CHECK(mmapped_file2->get_size() == sizeof(payload));
+ // The second mapping differs from the original payload but equals the first mapping
+ CHECK(memcmp(mmapped_file2->get_map(), payload, sizeof(payload)) != 0);
+ CHECK(memcmp(mmapped_file2->get_map(), mmapped_file1->get_map(), sizeof(payload)) == 0);
+ }
+ // File must be deleted after this call
+ auto ret = ::access(tmpname.c_str(), R_OK);
+ // Save errno before any CHECK can change it
+ auto serrno = errno;
+ CHECK(ret == -1);
+ CHECK(serrno == ENOENT);
+ }
+
+}// TEST_SUITE
+
+}// namespace tests
+
+}// namespace rspamd::util
diff --git a/src/libutil/cxx/file_util.hxx b/src/libutil/cxx/file_util.hxx
new file mode 100644
index 0000000..4528905
--- /dev/null
+++ b/src/libutil/cxx/file_util.hxx
@@ -0,0 +1,312 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_FILE_UTIL_HXX
+#define RSPAMD_FILE_UTIL_HXX
+#pragma once
+
+#include "config.h"
+#include "contrib/expected/expected.hpp"
+#include "libutil/cxx/error.hxx"
+#include <string>
+#include <sys/stat.h>
+
+namespace rspamd::util {
+/**
+ * A simple RAII object to contain a move only file descriptor
+ * A file is unlocked and closed when not needed
+ */
+struct raii_file {
+public:
+    /** Closes the descriptor; a file marked as temporary is unlinked first */
+    virtual ~raii_file() noexcept;
+
+    /** Open an existing file; see the out-of-line definition for the flags applied */
+    static auto open(const char *fname, int flags) -> tl::expected<raii_file, error>;
+    static auto open(const std::string &fname, int flags) -> tl::expected<raii_file, error>
+    {
+        return open(fname.c_str(), flags);
+    }
+    /** Open a file creating it if needed */
+    static auto create(const char *fname, int flags, int perms) -> tl::expected<raii_file, error>;
+    static auto create(const std::string &fname, int flags, int perms) -> tl::expected<raii_file, error>
+    {
+        return create(fname.c_str(), flags, perms);
+    }
+
+    /** Create a file that is removed when the owning object is destroyed */
+    static auto create_temp(const char *fname, int flags, int perms) -> tl::expected<raii_file, error>;
+    /** Same as `create_temp`, but the name is generated from a template */
+    static auto mkstemp(const char *pattern, int flags, int perms) -> tl::expected<raii_file, error>;
+
+    /** @return the owned descriptor or -1 if nothing is owned */
+    auto get_fd() const -> int
+    {
+        return fd;
+    }
+
+    /** @return the stat data captured by the last successful fstat */
+    auto get_stat() const -> const struct stat &
+    {
+        return st;
+    }
+
+    /** @return `st_size` of the cached stat data */
+    auto get_size() const -> std::size_t
+    {
+        return st.st_size;
+    }
+
+    /** @return the stored file name; the view is valid while this object is alive and unchanged */
+    auto get_name() const -> std::string_view
+    {
+        return std::string_view{fname};
+    }
+
+    /**
+     * @return the part of the name up to and including the last directory
+     * separator (a run of adjacent separators is reduced to the first one);
+     * the whole name is returned if it has no separator at all
+     */
+    auto get_dir() const -> std::string_view
+    {
+        auto sep_pos = fname.rfind(G_DIR_SEPARATOR);
+
+        if (sep_pos == std::string::npos) {
+            return std::string_view{fname};
+        }
+
+        /* Skip back over repeated separators such as "dir//file" */
+        while (sep_pos >= 1 && fname[sep_pos - 1] == G_DIR_SEPARATOR) {
+            sep_pos--;
+        }
+
+        return std::string_view{fname.c_str(), sep_pos + 1};
+    }
+
+    /**
+     * @return everything after the first '.' found in the last path
+     * component (so "a.tar.gz" gives "tar.gz"), or an empty view if
+     * that component has no dot
+     */
+    auto get_extension() const -> std::string_view
+    {
+        auto sep_pos = fname.rfind(G_DIR_SEPARATOR);
+
+        if (sep_pos == std::string::npos) {
+            sep_pos = 0;
+        }
+
+        auto filename = std::string_view{fname.c_str() + sep_pos};
+        auto dot_pos = filename.find('.');
+
+        if (dot_pos == std::string::npos) {
+            return std::string_view{};
+        }
+        else {
+            return std::string_view{filename.data() + dot_pos + 1, filename.size() - dot_pos - 1};
+        }
+    }
+
+    /**
+     * Move assignment is a swap: the previous content of this object ends
+     * up in `other` and is released when `other` is destroyed
+     */
+    raii_file &operator=(raii_file &&other) noexcept
+    {
+        std::swap(fd, other.fd);
+        std::swap(temp, other.temp);
+        std::swap(fname, other.fname);
+        std::swap(st, other.st);
+
+        return *this;
+    }
+
+    /**
+     * Move constructor: starts from the default member values below
+     * (no descriptor, not temporary) and swaps them with `other`,
+     * which therefore becomes invalid
+     */
+    raii_file(raii_file &&other) noexcept
+    {
+        *this = std::move(other);
+    }
+
+    /**
+     * Prevent file from being deleted
+     * @return
+     */
+    auto make_immortal() noexcept
+    {
+        temp = false;
+    }
+
+    /**
+     * Performs fstat on an opened file to refresh internal stat
+     * @return true if fstat has succeeded
+     */
+    auto update_stat() noexcept -> bool;
+
+    /** @return true if a descriptor is owned */
+    auto is_valid() const noexcept -> bool
+    {
+        return fd != -1;
+    }
+
+    /* Do not allow copy/default ctor */
+    const raii_file &operator=(const raii_file &other) = delete;
+    raii_file() = delete;
+    raii_file(const raii_file &other) = delete;
+
+protected:
+    /*
+     * Every member has a defined default: the move constructor swaps
+     * these values into the moved-from object
+     */
+    int fd = -1;
+    bool temp = false;
+    std::string fname;
+    struct stat st {};
+
+    explicit raii_file(const char *fname, int fd, bool temp);
+};
+/**
+ * A simple RAII object to contain a file descriptor with an flock wrap
+ * A file is unlocked and closed when not needed
+ */
+struct raii_locked_file final : public raii_file {
+public:
+ /* Releases the lock (if a descriptor is owned); the base destructor runs afterwards */
+ ~raii_locked_file() noexcept override;
+
+ /*
+  * The four factories below mirror the raii_file ones: each performs the
+  * base operation and, only if it has succeeded, passes the file to
+  * lock_raii_file; an error from either step is returned to the caller.
+  */
+ static auto open(const char *fname, int flags) -> tl::expected<raii_locked_file, error>
+ {
+ auto locked = raii_file::open(fname, flags).and_then([]<class T>(T &&file) {
+ return lock_raii_file(std::forward<T>(file));
+ });
+
+ return locked;
+ }
+ static auto create(const char *fname, int flags, int perms) -> tl::expected<raii_locked_file, error>
+ {
+ auto locked = raii_file::create(fname, flags, perms).and_then([]<class T>(T &&file) {
+ return lock_raii_file(std::forward<T>(file));
+ });
+
+ return locked;
+ }
+ static auto create_temp(const char *fname, int flags, int perms) -> tl::expected<raii_locked_file, error>
+ {
+ auto locked = raii_file::create_temp(fname, flags, perms).and_then([]<class T>(T &&file) {
+ return lock_raii_file(std::forward<T>(file));
+ });
+
+ return locked;
+ }
+ static auto mkstemp(const char *pattern, int flags, int perms) -> tl::expected<raii_locked_file, error>
+ {
+ auto locked = raii_file::mkstemp(pattern, flags, perms).and_then([]<class T>(T &&file) {
+ return lock_raii_file(std::forward<T>(file));
+ });
+
+ return locked;
+ }
+
+ /* Move assignment swaps all members with `other` (same as the base class) */
+ raii_locked_file &operator=(raii_locked_file &&other) noexcept
+ {
+ std::swap(fd, other.fd);
+ std::swap(temp, other.temp);
+ std::swap(fname, other.fname);
+ std::swap(st, other.st);
+
+ return *this;
+ }
+
+ /**
+ * Unlock a locked file and return back unlocked file transferring ownership.
+ * A locked file cannot be used after this method.
+ */
+ auto unlock() -> raii_file;
+
+ /* Move constructor: delegates to the base class move constructor */
+ raii_locked_file(raii_locked_file &&other) noexcept
+ : raii_file(static_cast<raii_file &&>(std::move(other)))
+ {
+ }
+ /* Do not allow copy/default ctor */
+ const raii_locked_file &operator=(const raii_locked_file &other) = delete;
+ raii_locked_file() = delete;
+ raii_locked_file(const raii_locked_file &other) = delete;
+
+private:
+ /* Locks `unlocked` and wraps it; used by all the factories above */
+ static auto lock_raii_file(raii_file &&unlocked) -> tl::expected<raii_locked_file, error>;
+ /* Adopts a file; it performs no locking itself */
+ raii_locked_file(raii_file &&other) noexcept
+ : raii_file(std::move(other))
+ {
+ }
+ explicit raii_locked_file(const char *fname, int fd, bool temp)
+ : raii_file(fname, fd, temp)
+ {
+ }
+};
+
+/**
+ * A mmap wrapper on top of a locked file
+ */
+struct raii_mmaped_file final {
+    /** Unmaps the region unless the mapping has been given away by `steal_map` */
+    ~raii_mmaped_file();
+    /**
+     * Map an opened file (MAP_SHARED) from `offset` to its end
+     * @param file file to map, consumed on success
+     * @param flags protection flags for mmap
+     * @param offset start offset within the file
+     */
+    static auto mmap_shared(raii_file &&file, int flags, std::int64_t offset = 0) -> tl::expected<raii_mmaped_file, error>;
+    /** Open `fname` with `open_flags` and map it as described above */
+    static auto mmap_shared(const char *fname, int open_flags, int mmap_flags, std::int64_t offset = 0) -> tl::expected<raii_mmaped_file, error>;
+    // Returns a pointer to the underlying map (nullptr after `steal_map`)
+    auto get_map() const -> void *
+    {
+        return map;
+    }
+    // Returns the file that backs the mapping
+    auto get_file() const -> const raii_file &
+    {
+        return file;
+    }
+    // Passes the ownership of the mmaped memory to the caller:
+    // after this call the destructor no longer unmaps the region
+    auto steal_map() -> std::tuple<void *, std::size_t>
+    {
+        auto ret = std::make_tuple(this->map, map_size);
+        this->map = nullptr;
+        return ret;
+    }
+
+    // Returns the size of the underlying file as recorded in its stat data.
+    // NOTE(review): this is the file size, not the mapped length; the two
+    // differ when the mapping was created with a non-zero offset
+    auto get_size() const -> std::size_t
+    {
+        return file.get_stat().st_size;
+    }
+
+    // Move assignment swaps the mappings; the previous mapping of this
+    // object is released when `other` is destroyed
+    raii_mmaped_file &operator=(raii_mmaped_file &&other) noexcept
+    {
+        std::swap(map, other.map);
+        std::swap(map_size, other.map_size);
+        file = std::move(other.file);
+
+        return *this;
+    }
+
+    raii_mmaped_file(raii_mmaped_file &&other) noexcept;
+
+    /* Do not allow copy/default ctor */
+    const raii_mmaped_file &operator=(const raii_mmaped_file &other) = delete;
+    raii_mmaped_file() = delete;
+    raii_mmaped_file(const raii_mmaped_file &other) = delete;
+
+private:
+    /* Is intended to be used with map_shared */
+    explicit raii_mmaped_file(raii_file &&_file, void *_map, std::size_t sz);
+    raii_file file;
+    void *map = nullptr;
+    /* Always has a defined value, so it is safe to read or swap in move operations */
+    std::size_t map_size = 0;
+};
+
+/**
+ * A helper to have a file to write that will be renamed to the
+ * target file if successful or deleted in the case of failure
+ */
+struct raii_file_sink final {
+ /*
+  * Creates the sink: data is written to an intermediate file named
+  * "<fname>.<suffix>" which is created and locked by this call
+  */
+ static auto create(const char *fname, int flags, int perms, const char *suffix = "new")
+ -> tl::expected<raii_file_sink, error>;
+ /* Renames the intermediate file to the target name; returns false on failure or if already done */
+ auto write_output() -> bool;
+ /* Removes the intermediate file unless write_output has succeeded */
+ ~raii_file_sink();
+ /* Descriptor of the intermediate file, to be used for writing */
+ auto get_fd() const -> int
+ {
+ return file.get_fd();
+ }
+
+ raii_file_sink(raii_file_sink &&other) noexcept;
+ /* Do not allow copy/default ctor */
+ const raii_file_sink &operator=(const raii_file_sink &other) = delete;
+ raii_file_sink() = delete;
+ raii_file_sink(const raii_file_sink &other) = delete;
+
+private:
+ explicit raii_file_sink(raii_locked_file &&_file, const char *_output, std::string &&_tmp_fname);
+ /* Locked intermediate file */
+ raii_locked_file file;
+ /* Final (target) file name */
+ std::string output_fname;
+ /* Intermediate file name */
+ std::string tmp_fname;
+ /* Set once the intermediate file has been renamed to the target */
+ bool success;
+};
+
+}// namespace rspamd::util
+
+#endif//RSPAMD_FILE_UTIL_HXX
diff --git a/src/libutil/cxx/hash_util.hxx b/src/libutil/cxx/hash_util.hxx
new file mode 100644
index 0000000..05f3d97
--- /dev/null
+++ b/src/libutil/cxx/hash_util.hxx
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_HASH_UTIL_HXX
+#define RSPAMD_HASH_UTIL_HXX
+
+#pragma once
+
+#include <string_view>
+#include <string>
+#include "contrib/ankerl/unordered_dense.h"
+
+
+namespace rspamd {
+/*
+ * Transparent smart pointers hashing
+ */
+/*
+ * Equality functor that compares the pointed-to values, accepting any mix
+ * of smart pointer and plain reference. Pointers are dereferenced without
+ * a null check.
+ */
+template<typename T>
+struct smart_ptr_equal {
+    /* Marks the functor as transparent to allow lookup by `const T &` */
+    using is_transparent = void;
+
+    /* std::shared_ptr flavours */
+    auto operator()(const std::shared_ptr<T> &lhs, const std::shared_ptr<T> &rhs) const
+    {
+        return *lhs == *rhs;
+    }
+    auto operator()(const std::shared_ptr<T> &lhs, const T &rhs) const
+    {
+        return *lhs == rhs;
+    }
+    auto operator()(const T &lhs, const std::shared_ptr<T> &rhs) const
+    {
+        return lhs == *rhs;
+    }
+
+    /* std::unique_ptr flavours */
+    auto operator()(const std::unique_ptr<T> &lhs, const std::unique_ptr<T> &rhs) const
+    {
+        return *lhs == *rhs;
+    }
+    auto operator()(const std::unique_ptr<T> &lhs, const T &rhs) const
+    {
+        return *lhs == rhs;
+    }
+    auto operator()(const T &lhs, const std::unique_ptr<T> &rhs) const
+    {
+        return lhs == *rhs;
+    }
+};
+
+/*
+ * Hash functor that hashes the pointed-to value with std::hash<T>, so a
+ * smart pointer and a plain reference to an equal value hash the same.
+ */
+template<typename T>
+struct smart_ptr_hash {
+    /* Marks the functor as transparent to allow lookup by `const T &` */
+    using is_transparent = void;
+    using is_avalanching = void;
+
+    auto operator()(const T &value) const
+    {
+        return std::hash<T>{}(value);
+    }
+    auto operator()(const std::shared_ptr<T> &ptr) const
+    {
+        return std::hash<T>{}(*ptr);
+    }
+    auto operator()(const std::unique_ptr<T> &ptr) const
+    {
+        return std::hash<T>{}(*ptr);
+    }
+};
+
+/* Enable lookup by string view */
+/* Transparent equality for strings: std::string and std::string_view can be mixed */
+struct smart_str_equal {
+    using is_transparent = void;
+
+    auto operator()(const std::string &lhs, const std::string &rhs) const
+    {
+        return std::string_view{lhs} == std::string_view{rhs};
+    }
+    auto operator()(const std::string_view &lhs, const std::string &rhs) const
+    {
+        return lhs == std::string_view{rhs};
+    }
+    auto operator()(const std::string &lhs, const std::string_view &rhs) const
+    {
+        return std::string_view{lhs} == rhs;
+    }
+};
+
+/* Transparent hash for strings, delegating to the ankerl::unordered_dense hashers */
+struct smart_str_hash {
+    using is_transparent = void;
+    using is_avalanching = void;
+
+    auto operator()(const std::string &str) const
+    {
+        using hasher = ankerl::unordered_dense::hash<std::string>;
+
+        return hasher{}(str);
+    }
+    auto operator()(const std::string_view &str) const
+    {
+        using hasher = ankerl::unordered_dense::hash<std::string_view>;
+
+        return hasher{}(str);
+    }
+};
+
+}// namespace rspamd
+
+#endif//RSPAMD_HASH_UTIL_HXX
diff --git a/src/libutil/cxx/local_shared_ptr.hxx b/src/libutil/cxx/local_shared_ptr.hxx
new file mode 100644
index 0000000..78ed5ba
--- /dev/null
+++ b/src/libutil/cxx/local_shared_ptr.hxx
@@ -0,0 +1,440 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LOCAL_SHARED_PTR_HXX
+#define RSPAMD_LOCAL_SHARED_PTR_HXX
+
+#pragma once
+
+#include <memory>
+#include <algorithm> // for std::swap
+#include <cstddef>   // for std::size_t
+#include <functional>// for std::less
+#include <new>        // for placement new
+#include <stdexcept>  // for std::logic_error
+#include <type_traits>// for std::enable_if, std::is_convertible, std::decay_t
+#include <utility>    // for std::move, std::forward
+
+/*
+ * Smart pointers with no atomic refcounts to speed up Rspamd which is
+ * apparently single threaded
+ */
+namespace rspamd {
+
+namespace detail {
+
+/*
+ * Base of the control blocks: a pair of plain (non-atomic) counters for
+ * shared and weak owners. A new block starts with one shared owner and no
+ * weak owners.
+ */
+class ref_cnt {
+public:
+	using refcount_t = int;
+
+	virtual ~ref_cnt() = default;
+	/* Implemented by the concrete blocks to release the managed object */
+	virtual void dispose() = 0;
+
+	/* Mutators: each one returns the counter value after the update */
+	constexpr auto add_shared() -> refcount_t
+	{
+		ref_shared += 1;
+		return ref_shared;
+	}
+	constexpr auto release_shared() -> refcount_t
+	{
+		ref_shared -= 1;
+		return ref_shared;
+	}
+	constexpr auto add_weak() -> refcount_t
+	{
+		ref_weak += 1;
+		return ref_weak;
+	}
+	constexpr auto release_weak() -> refcount_t
+	{
+		ref_weak -= 1;
+		return ref_weak;
+	}
+
+	/* Observers */
+	constexpr auto shared_count() const -> refcount_t
+	{
+		return ref_shared;
+	}
+	constexpr auto weak_count() const -> refcount_t
+	{
+		return ref_weak;
+	}
+
+private:
+	refcount_t ref_weak = 0;
+	refcount_t ref_shared = 1;
+};
+
+/*
+ * Control block that keeps the managed object inside its own storage
+ * (used by local_make_shared): the object is constructed in place by the
+ * constructor and destroyed in place by dispose().
+ */
+template<class T>
+class obj_and_refcnt : public ref_cnt {
+public:
+	/* Forwards the arguments to T's constructor, building T inside `storage` */
+	template<typename... Args>
+	explicit obj_and_refcnt(Args &&...args)
+	{
+		new (&storage) T(std::forward<Args>(args)...);
+	}
+	~obj_and_refcnt() override = default;
+
+	/* Pointer to the embedded object, or nullptr once it has been disposed */
+	auto get(void) -> T *
+	{
+		return initialized ? reinterpret_cast<T *>(&storage) : nullptr;
+	}
+
+private:
+	using storage_type = std::aligned_storage_t<sizeof(T), alignof(T)>;
+	storage_type storage;
+	bool initialized = true;
+
+	/* Runs T's destructor at most once; the raw storage lives as long as the block */
+	void dispose() override
+	{
+		if (!initialized) {
+			return;
+		}
+
+		reinterpret_cast<T *>(&storage)->~T();
+		initialized = false;
+	}
+};
+
+/*
+ * Control block for an externally allocated object: stores the raw pointer
+ * together with the deleter that dispose() invokes on it.
+ */
+template<class T, class D = typename std::default_delete<T>>
+class ptr_and_refcnt : public ref_cnt {
+public:
+	explicit ptr_and_refcnt(T *_ptr, D &&d = std::default_delete<T>())
+		: ptr(_ptr),
+		  deleter(std::move(d))
+	{
+	}
+	~ptr_and_refcnt() override = default;
+
+private:
+	T *ptr;
+	D deleter;
+
+	/* Hands the pointer to the deleter, then forgets it */
+	void dispose() override
+	{
+		deleter(ptr);
+		ptr = nullptr;
+	}
+};
+
+}// namespace detail
+
+template<class T>
+class local_weak_ptr;
+
+/*
+ * Single-threaded analogue of std::shared_ptr: the same ownership model, but
+ * the counters in the control block (detail::ref_cnt) are plain integers.
+ *
+ * Notes:
+ *  - operator== compares the stored pointers, while operator< compares the
+ *    pointed-to values and therefore requires both pointers to be non-null;
+ *  - there is no aliasing constructor and no rebind.
+ */
+template<class T>
+class local_shared_ptr {
+public:
+	typedef T element_type;
+	typedef local_weak_ptr<T> weak_type;
+
+	// Simplified comparing to libc++, no rebind here
+	// constructors:
+	constexpr local_shared_ptr() noexcept
+		: px(nullptr), cnt(nullptr)
+	{
+	}
+
+	// Takes ownership of p, which is released through std::default_delete<Y>
+	template<class Y, typename std::enable_if<
+						  std::is_convertible<Y *, element_type *>::value, bool>::type = true>
+	explicit local_shared_ptr(Y *p)
+		: px(p), cnt(new detail::ptr_and_refcnt(p))
+	{
+	}
+
+	// custom deleter
+	// The deleter is always stored by value: D is a forwarding reference, so for an
+	// lvalue argument it is deduced as `X &`; using it verbatim as the control block's
+	// deleter type would make the block keep a reference to the caller's object.
+	template<class Y, class D, typename std::enable_if<std::is_convertible<Y *, element_type *>::value, bool>::type = true>
+	explicit local_shared_ptr(Y *p, D &&d)
+		: px(p), cnt(new detail::ptr_and_refcnt<Y, std::decay_t<D>>(p, std::decay_t<D>(std::forward<D>(d))))
+	{
+	}
+
+	local_shared_ptr(const local_shared_ptr &r) noexcept
+		: px(r.px), cnt(r.cnt)
+	{
+		if (cnt) {
+			cnt->add_shared();
+		}
+	}
+	local_shared_ptr(local_shared_ptr &&r) noexcept
+		: px(r.px), cnt(r.cnt)
+	{
+		r.px = nullptr;
+		r.cnt = nullptr;
+	}
+	// Promotes a weak pointer. An expired (or empty) weak pointer yields an empty
+	// shared pointer: the managed object is already gone, so sharing its control
+	// block again would hand out a dangling pointer.
+	template<class Y>
+	explicit local_shared_ptr(const local_weak_ptr<Y> &r)
+		: px(nullptr), cnt(nullptr)
+	{
+		if (!r.expired()) {
+			// lock() adds the shared reference; steal it from the temporary
+			auto locked = r.lock();
+			px = locked.px;
+			cnt = locked.cnt;
+			locked.px = nullptr;
+			locked.cnt = nullptr;
+		}
+	}
+	local_shared_ptr(std::nullptr_t)
+		: local_shared_ptr()
+	{
+	}
+
+	// Drops one shared reference; the last owner destroys the object and, when
+	// no weak owners remain, the control block as well
+	~local_shared_ptr()
+	{
+		if (cnt) {
+			if (cnt->release_shared() <= 0) {
+				cnt->dispose();
+
+				if (cnt->weak_count() == 0) {
+					delete cnt;
+				}
+			}
+		}
+	}
+
+	// assignment (copy-and-swap: the old value is released by the temporary):
+	local_shared_ptr &operator=(const local_shared_ptr &r) noexcept
+	{
+		local_shared_ptr(r).swap(*this);
+		return *this;
+	}
+	local_shared_ptr &operator=(local_shared_ptr &&r) noexcept
+	{
+		local_shared_ptr(std::move(r)).swap(*this);
+		return *this;
+	}
+
+	// Mutators
+	void swap(local_shared_ptr &r) noexcept
+	{
+		std::swap(this->cnt, r.cnt);
+		std::swap(this->px, r.px);
+	}
+	void reset() noexcept
+	{
+		local_shared_ptr().swap(*this);
+	}
+
+	// Observers:
+	T *get() const noexcept
+	{
+		return px;
+	}
+
+	T &operator*() const noexcept
+	{
+		return *px;
+	}
+	T *operator->() const noexcept
+	{
+		return px;
+	}
+	// Number of shared owners, 0 for an empty pointer
+	long use_count() const noexcept
+	{
+		if (cnt) {
+			return cnt->shared_count();
+		}
+
+		return 0;
+	}
+	bool unique() const noexcept
+	{
+		return use_count() == 1;
+	}
+
+	explicit operator bool() const noexcept
+	{
+		return px != nullptr;
+	}
+
+	// Identity comparison: true when both point to the same object
+	template<class Y, typename std::enable_if<
+						  std::is_convertible<Y *, element_type *>::value, bool>::type = true>
+	auto operator==(const local_shared_ptr<Y> &other) const -> bool
+	{
+		return px == other.px;
+	}
+
+	// Value ordering: compares the pointees, both pointers must be non-null
+	template<class Y, typename std::enable_if<
+						  std::is_convertible<Y *, element_type *>::value, bool>::type = true>
+	auto operator<(const local_shared_ptr<Y> &other) const -> auto
+	{
+		return *px < *other.px;
+	}
+
+private:
+	T *px;// contained pointer
+	detail::ref_cnt *cnt;
+
+	template<class _T, class... Args>
+	friend local_shared_ptr<_T> local_make_shared(Args &&...args);
+	friend class local_weak_ptr<T>;
+	// Other specialisations need px/cnt for the cross-type comparisons and
+	// for the promotion from local_weak_ptr<Y>
+	template<class Y>
+	friend class local_shared_ptr;
+};
+
+/*
+ * Creates a T together with its control block in a single allocation
+ * (detail::obj_and_refcnt) and returns the owning pointer.
+ */
+template<class T, class... Args>
+local_shared_ptr<T> local_make_shared(Args &&...args)
+{
+	local_shared_ptr<T> result;
+	auto *block = new detail::obj_and_refcnt<T>(std::forward<Args>(args)...);
+
+	/* The fresh block already counts one shared owner, which becomes `result` */
+	result.cnt = block;
+	result.px = block->get();
+
+	return result;
+}
+
+/*
+ * Single-threaded analogue of std::weak_ptr: observes an object owned by
+ * local_shared_ptr without keeping it alive. Only the control block is kept
+ * alive by weak owners.
+ */
+template<class T>
+class local_weak_ptr {
+public:
+	typedef T element_type;
+
+	// constructors
+	constexpr local_weak_ptr() noexcept
+		: px(nullptr), cnt(nullptr)
+	{
+	}
+	template<class Y, typename std::enable_if<
+						  std::is_convertible<Y *, element_type *>::value, bool>::type = true>
+	local_weak_ptr(local_shared_ptr<Y> const &r) noexcept
+		: px(r.px), cnt(r.cnt)
+	{
+		if (cnt) {
+			cnt->add_weak();
+		}
+	}
+
+	local_weak_ptr(local_weak_ptr const &r) noexcept
+		: px(r.px), cnt(r.cnt)
+	{
+		if (cnt) {
+			cnt->add_weak();
+		}
+	}
+	local_weak_ptr(local_weak_ptr &&r) noexcept
+		: px(r.px), cnt(r.cnt)
+	{
+		r.px = nullptr;
+		r.cnt = nullptr;
+	}
+
+	// Drops one weak reference; the control block is freed by whoever goes
+	// last, i.e. here only when no shared owners are left either
+	~local_weak_ptr()
+	{
+		if (cnt) {
+			if (cnt->release_weak() <= 0 && cnt->shared_count() == 0) {
+				delete cnt;
+			}
+		}
+	}
+
+	// assignment (copy-and-swap)
+	local_weak_ptr &operator=(local_weak_ptr const &r) noexcept
+	{
+		local_weak_ptr(r).swap(*this);
+		return *this;
+	}
+	local_weak_ptr &operator=(local_shared_ptr<T> const &r) noexcept
+	{
+		local_weak_ptr(r).swap(*this);
+		return *this;
+	}
+
+	template<class Y, typename std::enable_if<
+						  std::is_convertible<Y *, element_type *>::value, bool>::type = true>
+	local_weak_ptr &operator=(local_weak_ptr<Y> const &r) noexcept
+	{
+		local_weak_ptr(r).swap(*this);
+		return *this;
+	}
+	local_weak_ptr &operator=(local_weak_ptr &&r) noexcept
+	{
+		local_weak_ptr(std::move(r)).swap(*this);
+		return *this;
+	}
+
+	// modifiers
+	void swap(local_weak_ptr &r) noexcept
+	{
+		std::swap(this->cnt, r.cnt);
+		std::swap(this->px, r.px);
+	}
+	void reset() noexcept
+	{
+		local_weak_ptr().swap(*this);
+	}
+
+	// observers
+	// Number of shared owners of the observed object, 0 for an empty pointer
+	long use_count() const noexcept
+	{
+		if (cnt) {
+			return cnt->shared_count();
+		}
+		return 0;
+	}
+	// True when there is nothing to lock: empty pointer or object already destroyed
+	bool expired() const noexcept
+	{
+		if (cnt) {
+			return cnt->shared_count() == 0;
+		}
+
+		return true;
+	}
+
+	// Returns an owning pointer to the observed object, or an empty pointer
+	// when the object has already been destroyed. The shared counter must not
+	// be raised from zero: the object is gone at that point and `px` dangles.
+	local_shared_ptr<T> lock() const noexcept
+	{
+		local_shared_ptr<T> tmp;
+
+		if (cnt && cnt->shared_count() > 0) {
+			cnt->add_shared();
+			tmp.cnt = cnt;
+			tmp.px = px;
+		}
+
+		return tmp;
+	}
+
+private:
+	element_type *px;
+	detail::ref_cnt *cnt;
+};
+
+
+}// namespace rspamd
+
+/* Hashing stuff */
+namespace std {
+/*
+ * Hashes a shared pointer by the value it points to (not by address).
+ * Throws std::logic_error for an empty pointer.
+ */
+template<class T>
+struct hash<rspamd::local_shared_ptr<T>> {
+	inline auto operator()(const rspamd::local_shared_ptr<T> &p) const -> auto
+	{
+		if (!p) {
+			throw std::logic_error("no hash for dangling pointer");
+		}
+		return hash<T>()(*p.get());
+	}
+};
+/*
+ * Hashes a weak pointer by the value of the observed object.
+ * Throws std::logic_error when the pointer is empty or expired.
+ * A weak pointer has neither operator bool nor get(), so the object is
+ * reached through a temporary lock.
+ */
+template<class T>
+struct hash<rspamd::local_weak_ptr<T>> {
+	inline auto operator()(const rspamd::local_weak_ptr<T> &p) const -> auto
+	{
+		if (p.expired()) {
+			throw std::logic_error("no hash for dangling pointer");
+		}
+
+		/* Keeps the object alive while it is being hashed */
+		const auto locked = p.lock();
+		return hash<T>()(*locked.get());
+	}
+};
+
+/*
+ * NOTE(review): these are overloads (not specialisations) added to namespace std,
+ * which the standard does not permit; kept here so that existing callers keep working.
+ */
+template<class T>
+inline void swap(rspamd::local_shared_ptr<T> &x, rspamd::local_shared_ptr<T> &y) noexcept
+{
+	x.swap(y);
+}
+
+template<class T>
+inline void swap(rspamd::local_weak_ptr<T> &x, rspamd::local_weak_ptr<T> &y) noexcept
+{
+	x.swap(y);
+}
+
+}// namespace std
+
+#endif//RSPAMD_LOCAL_SHARED_PTR_HXX
diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx
new file mode 100644
index 0000000..5fc83ca
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.cxx
@@ -0,0 +1,421 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define U_CHARSET_IS_UTF8 1
+#include <unicode/utypes.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/normalizer2.h>
+#include <unicode/schriter.h>
+#include <unicode/coll.h>
+#include <unicode/translit.h>
+#include <utility>
+#include <tuple>
+#include <string>
+#include <limits>
+#include <memory>
+
+#include "utf8_util.h"
+#include "str_util.h"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+/*
+ * Trims leading and trailing code points that are Unicode white space
+ * (u_isUWhiteSpace) or match IS_ZERO_WIDTH_SPACE. The buffer is not modified:
+ * the function returns a pointer to the first kept byte inside `str` and
+ * stores the length of the kept part in `*len`.
+ */
+const char *
+rspamd_string_unicode_trim_inplace(const char *str, size_t *len)
+{
+	const char *const input_end = str + *len;
+	auto consumed = 0;
+
+	/* Forward pass: accept a code point only after it proved to be trimmable */
+	while (consumed < *len) {
+		UChar32 cp;
+		auto probe = consumed;
+
+		U8_NEXT(str, probe, *len, cp);
+
+		if (!(u_isUWhiteSpace(cp) || IS_ZERO_WIDTH_SPACE(cp))) {
+			break;
+		}
+
+		consumed = probe;
+	}
+
+	const char *const trimmed = str + consumed;
+	(*len) -= consumed;
+	int remaining = static_cast<int>(input_end - trimmed);
+
+	if (remaining > 0) {
+		/* Backward pass over the remainder, same acceptance rule */
+		do {
+			UChar32 cp;
+			auto probe = remaining;
+
+			U8_PREV(trimmed, 0, probe, cp);
+
+			if (!(u_isUWhiteSpace(cp) || IS_ZERO_WIDTH_SPACE(cp))) {
+				break;
+			}
+
+			remaining = probe;
+		} while (remaining > 0);
+
+		*len = remaining;
+	}
+
+	return trimmed;
+}
+
+/*
+ * Normalises a UTF-8 buffer to NFKC in place and strips zero-width characters.
+ *
+ * @param start buffer to normalise, rewritten in place
+ * @param len in: number of bytes in the buffer; out: number of bytes written back
+ * @return bitwise OR of rspamd_utf8_normalise_result flags:
+ *  - RSPAMD_UNICODE_NORM_UNNORMAL: input was not in NFKC form or contains U+FFFD
+ *  - RSPAMD_UNICODE_NORM_ZERO_SPACES: at least one zero-width character was removed
+ *  - RSPAMD_UNICODE_NORM_ERROR: libicu failure or the result does not fit into the
+ *    original buffer (the output is truncated in that case)
+ */
+enum rspamd_utf8_normalise_result
+rspamd_normalise_unicode_inplace(char *start, size_t *len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+	const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err);
+	static icu::UnicodeSet zw_spaces{};
+
+	if (!zw_spaces.isFrozen()) {
+		/* Add zw spaces to the set (done once, the frozen flag marks completion) */
+		zw_spaces.add(0x200B);
+		/* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */
+		zw_spaces.add(0x200C);
+		/* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */
+		//zw_spaces.add(0x200D);
+		/* ZERO WIDTH NO-BREAK SPACE (BOM); was 0xFEF, which is not a zero-width character */
+		zw_spaces.add(0xFEFF);
+		/* Soft hyphen */
+		zw_spaces.add(0x00AD);
+		zw_spaces.freeze();
+	}
+
+	int ret = RSPAMD_UNICODE_NORM_NORMAL;
+
+	g_assert(U_SUCCESS(uc_err));
+
+	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len));
+	auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err);
+
+	if (!U_SUCCESS(uc_err)) {
+		return RSPAMD_UNICODE_NORM_ERROR;
+	}
+
+	/*
+	 * Filter zero width spaces and push resulting string back.
+	 * Writes into `start` (capacity is the input length) and returns the
+	 * number of bytes written; flags are accumulated in `ret`.
+	 */
+	const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t {
+		icu::StringCharacterIterator it{input};
+		size_t i = 0;
+
+		while (it.hasNext()) {
+			/* libicu is very 'special' if it comes to 'safe' macro */
+			if (i >= *len) {
+				ret |= RSPAMD_UNICODE_NORM_ERROR;
+				break;
+			}
+
+			auto uc = it.next32PostInc();
+
+			if (zw_spaces.contains(uc)) {
+				ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES;
+			}
+			else {
+				UBool err = 0;
+
+				/* A replacement character means that the input was not valid */
+				if (uc == 0xFFFD) {
+					ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+				}
+				U8_APPEND((uint8_t *) start, i, *len, uc, err);
+
+				if (err) {
+					ret |= RSPAMD_UNICODE_NORM_ERROR;
+					break;
+				}
+			}
+		}
+
+		return i;
+	};
+
+	if (is_normal != UNORM_YES) {
+		/* Need to normalise */
+		ret |= RSPAMD_UNICODE_NORM_UNNORMAL;
+
+		auto normalised = nfkc_norm->normalize(uc_string, uc_err);
+
+		if (!U_SUCCESS(uc_err)) {
+			return RSPAMD_UNICODE_NORM_ERROR;
+		}
+
+		*len = filter_zw_spaces_and_push_back(normalised);
+	}
+	else {
+		/* Already normal, but zero-width characters still have to be removed */
+		*len = filter_zw_spaces_and_push_back(uc_string);
+	}
+
+	return static_cast<enum rspamd_utf8_normalise_result>(ret);
+}
+
+/*
+ * Transliterates a UTF-8 string to lower-case Latin/ASCII text using a libicu
+ * rule-based transliterator that is created on the first call.
+ *
+ * @param start input string (UTF-8)
+ * @param len length of the input in bytes
+ * @param target_len out: length of the result in bytes, without the trailing zero
+ * @return newly allocated zero-terminated string (g_malloc), to be released with g_free
+ */
+gchar *
+rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len)
+{
+	UErrorCode uc_err = U_ZERO_ERROR;
+
+	static std::unique_ptr<icu::Transliterator> transliterator;
+
+	if (!transliterator) {
+		UParseError parse_err;
+		static const auto rules = icu::UnicodeString{":: Any-Latin;"
+													 ":: [:Nonspacing Mark:] Remove;"
+													 ":: [:Punctuation:] Remove;"
+													 ":: [:Symbol:] Remove;"
+													 ":: [:Format:] Remove;"
+													 ":: Latin-ASCII;"
+													 ":: Lower();"
+													 ":: NULL;"
+													 "[:Space Separator:] > ' '"};
+		transliterator = std::unique_ptr<icu::Transliterator>(
+			icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err));
+
+		if (U_FAILURE(uc_err) || !transliterator) {
+			g_error("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d",
+					u_errorName(uc_err), parse_err.line, parse_err.offset);
+			abort();
+		}
+	}
+
+	auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len));
+	transliterator->transliterate(uc_string);
+
+	/*
+	 * The result is expected to be ASCII, but anything the rules could not map
+	 * stays as is. Reserve the UTF-8 worst case (3 bytes per UTF-16 code unit),
+	 * so such leftovers are converted completely instead of being cut off.
+	 */
+	const int32_t utf16_len = uc_string.length();
+	const int32_t max_capacity = std::numeric_limits<int32_t>::max() - 1;
+	const int32_t capacity = (utf16_len > max_capacity / 3) ? max_capacity : utf16_len * 3;
+
+	gchar *dest = (gchar *) g_malloc(static_cast<gsize>(capacity) + 1);
+	icu::CheckedArrayByteSink sink(dest, capacity);
+	uc_string.toUTF8(sink);
+
+	/* Never exceeds `capacity`, so the terminator always fits */
+	*target_len = sink.NumberOfBytesWritten();
+	dest[*target_len] = '\0';
+
+	return dest;
+}
+
+/*
+ * Owns the libicu collator (English locale, primary strength) that backs the
+ * file-level `collate_storage` instance. Creation failure is fatal.
+ */
+struct rspamd_icu_collate_storage {
+	icu::Collator *collator = nullptr;
+	rspamd_icu_collate_storage()
+	{
+		UErrorCode uc_err = U_ZERO_ERROR;
+		collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err);
+
+		if (U_FAILURE(uc_err) || collator == nullptr) {
+			g_error("fatal error: cannot init libicu collation engine: %s",
+					u_errorName(uc_err));
+			abort();
+		}
+		/* Ignore all difference except functional */
+		collator->setStrength(icu::Collator::PRIMARY);
+	}
+
+	/* The collator is owned through a raw pointer: a copy would delete it twice */
+	rspamd_icu_collate_storage(const rspamd_icu_collate_storage &) = delete;
+	rspamd_icu_collate_storage &operator=(const rspamd_icu_collate_storage &) = delete;
+
+	~rspamd_icu_collate_storage()
+	{
+		delete collator;
+	}
+};
+
+static rspamd_icu_collate_storage collate_storage;
+
+/*
+ * Compares two UTF-8 strings with the libicu collator (primary strength).
+ *
+ * @param s1 first string
+ * @param n1 length of the first string in bytes
+ * @param s2 second string
+ * @param n2 length of the second string in bytes
+ * @return -1, 0 or 1 when the collator is used. If a length does not fit into
+ * `int`, or the collator reports a failure, falls back to an ASCII comparison:
+ * the g_ascii_strncasecmp result for equal lengths, otherwise -1/1 by length.
+ */
+int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2)
+{
+	const auto ascii_fallback = [&]() -> int {
+		if (n1 == n2) {
+			return g_ascii_strncasecmp(s1, s2, n1);
+		}
+
+		/* Do not return n1 - n2: the difference of two gsize values does not fit into int */
+		return n1 < n2 ? -1 : 1;
+	};
+
+	if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) {
+		/*
+		 * It's hard to say what to do here... But libicu wants int, so we fall
+		 * back to g_ascii_strcasecmp which can deal with size_t
+		 */
+		return ascii_fallback();
+	}
+
+	UErrorCode success = U_ZERO_ERROR;
+	auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2},
+													 success);
+
+	if (U_FAILURE(success)) {
+		/* The collator result is meaningless on failure, do not report it as an ordering */
+		return ascii_fallback();
+	}
+
+	switch (res) {
+	case UCOL_EQUAL:
+		return 0;
+	case UCOL_GREATER:
+		return 1;
+	case UCOL_LESS:
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Compares the first `n` bytes of two UTF-8 strings: the equal-lengths case
+ * of rspamd_utf8_strcmp_sizes.
+ */
+int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n)
+{
+	const gsize n1 = n, n2 = n;
+
+	return rspamd_utf8_strcmp_sizes(s1, n1, s2, n2);
+}
+
+TEST_SUITE("utf8 utils")
+{
+	/* Each case: {input, expected output, expected rspamd_utf8_normalise_result flags} */
+	TEST_CASE("utf8 normalise")
+	{
+		std::tuple<const char *, const char *, int> cases[] = {
+			{"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL},
+			{"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL},
+			/* Zero width spaces */
+			{"\xE2\x80\x8B"
+			 "те"
+			 "\xE2\x80\x8B"
+			 "ст",
+			 "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			/* Special case of diacritic */
+			{"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL},
+			// String containing a non-joiner character
+			{"س\u200Cت", "ست", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			// String containing a soft hyphen
+			{"in\u00ADter\u00ADest\u00ADing", "interesting", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			// String with ligature: U+FB01 (fi) is written as an escape so that it survives
+			// editors and tools that normalise the source text
+			{"\uFB01sh", "fish", RSPAMD_UNICODE_NORM_UNNORMAL},
+			// String with accented characters and zero-width spaces
+			{"café\u200Blatté\u200C", "cafélatté", RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			/* Same with zw spaces */
+			{"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ",
+			 RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			/* Buffer overflow case */
+			{"u\xC2\xC2\xC2\xC2\xC2\xC2"
+			 "abcdef"
+			 "abcdef",
+			 "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
+			 RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ERROR},
+			// String with a mix of special characters, ligatures, and zero-width spaces
+			{"\uFB01sh\u200Bcafé\u200C\u200Dlatté\u200D\u00AD", "fishcafé\u200Dlatté\u200D", RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES},
+			// Empty string
+			{"", "", RSPAMD_UNICODE_NORM_NORMAL},
+		};
+
+		for (const auto &c: cases) {
+			/* Work on a mutable copy: the function rewrites its buffer */
+			std::string cpy{std::get<0>(c)};
+			auto ns = cpy.size();
+			auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns);
+			cpy.resize(ns);
+			CHECK(cpy == std::string(std::get<1>(c)));
+			CHECK(res == std::get<2>(c));
+		}
+	}
+
+	/* Each case: {input, expected result after trimming both ends} */
+	TEST_CASE("utf8 trim")
+	{
+		std::pair<const char *, const char *> cases[] = {
+			{" \u200B"
+			 "abc ",
+			 "abc"},
+			{" ", ""},
+			{" a", "a"},
+			{"a ", "a"},
+			{"a a", "a a"},
+			{"abc", "abc"},
+			{"a ", "a"},
+			{" abc ", "abc"},
+			{" abc ", "abc"},
+			{" \xE2\x80\x8B"
+			 "a\xE2\x80\x8B"
+			 "bc ",
+			 "a\xE2\x80\x8B"
+			 "bc"},
+			{" \xE2\x80\x8B"
+			 "abc\xE2\x80\x8B ",
+			 "abc"},
+			{" \xE2\x80\x8B"
+			 "abc \xE2\x80\x8B ",
+			 "abc"},
+		};
+
+		for (const auto &[input, expected]: cases) {
+			std::string buffer{input};
+			auto trimmed_len = buffer.size();
+			auto *trimmed_start = rspamd_string_unicode_trim_inplace(buffer.data(), &trimmed_len);
+
+			/* The function returns a view into `buffer`: start pointer plus new length */
+			CHECK(std::string{trimmed_start, trimmed_len} == std::string{expected});
+		}
+	}
+
+
+	/* Each case: {lhs, rhs, bytes to compare (-1: length of the shorter operand), expected result} */
+	TEST_CASE("utf8 strcmp")
+	{
+		std::tuple<const char *, const char *, int, int> cases[] = {
+			{"abc", "abc", -1, 0},
+			{"", "", -1, 0},
+			{"aBc", "AbC", -1, 0},
+			{"abc", "ab", 2, 0},
+			{"теСт", "ТесТ", -1, 0},
+			{"теСт", "Тезт", 4, 0},
+			{"теСт", "Тезт", -1, 1},
+			{"abc", "ABD", -1, -1},
+			{"\0a\0", "\0a\1", 2, 0},
+			{"\0a\0", "\0b\1", 3, -1},
+		};
+
+		for (auto [s1, s2, n, expected]: cases) {
+			if (n == -1) {
+				n = MIN(strlen(s1), strlen(s2));
+			}
+
+			std::string subcase_name{"test case: "};
+			subcase_name += s1;
+			subcase_name += " <=> ";
+			subcase_name += s2;
+
+			SUBCASE(subcase_name.c_str())
+			{
+				auto ret = rspamd_utf8_strcmp(s1, s2, n);
+				CHECK(ret == expected);
+			}
+		}
+	}
+
+	/* Each case: {UTF-8 input, expected transliteration} */
+	TEST_CASE("transliterate")
+	{
+		using namespace std::literals;
+		std::tuple<std::string_view, const char *> cases[] = {
+			{"abc"sv, "abc"},
+			{""sv, ""},
+			{"тест"sv, "test"},
+			// Diacritic to ascii
+			{"Ύ"sv, "y"},
+			// Chinese to pinyin
+			{"你好"sv, "ni hao"},
+			// Japanese to romaji
+			{"こんにちは"sv, "konnichiha"},
+			// Devanagari to latin
+			{"नमस्ते"sv, "namaste"},
+			// Arabic to latin
+			{"مرحبا"sv, "mrhba"},
+			// Remove of punctuation
+			{"a.b.c"sv, "abc"},
+			// Lowercase
+			{"ABC"sv, "abc"},
+			// Remove zero-width spaces
+			{"\xE2\x80\x8B"
+			 "abc\xE2\x80\x8B"
+			 "def"sv,
+			 "abcdef"},
+		};
+
+		for (const auto &c: cases) {
+			auto [s1, s2] = c;
+			SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str())
+			{
+				gsize tlen;
+				auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen);
+				CHECK(tlen == strlen(s2));
+				CHECK(strcmp(s2, ret) == 0);
+				/* The result is allocated with g_malloc and owned by the caller */
+				g_free(ret);
+			}
+		}
+	}
+} \ No newline at end of file
diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h
new file mode 100644
index 0000000..044beae
--- /dev/null
+++ b/src/libutil/cxx/utf8_util.h
@@ -0,0 +1,85 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_UTF8_UTIL_H
+#define RSPAMD_UTF8_UTIL_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
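+/**
+ * Trims unicode spaces (including zero-width ones) from both ends of a string.
+ * The string itself is not modified.
+ * @param str start of the string
+ * @param len length of the string on input, length of the trimmed string on output
+ * @return pointer to the start of the trimmed string inside `str`
+ */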
+const char *rspamd_string_unicode_trim_inplace(const char *str, size_t *len);
+
+/* Bit flags returned by rspamd_normalise_unicode_inplace, several of them can be set at once */
+enum rspamd_utf8_normalise_result {
+	/* No flags: the input was already normalised and nothing was removed */
+	RSPAMD_UNICODE_NORM_NORMAL = 0,
+	/* The input was not in the normalised form or contains the replacement character (U+FFFD) */
+	RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
+	/* At least one zero-width character has been removed */
+	RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
+	/* libicu failure, or the result did not fit into the original buffer */
+	RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
+	/* NOTE(review): not set by rspamd_normalise_unicode_inplace in utf8_util.cxx; confirm whether it is still used */
+	RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
+};
+
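+/**
+ * Gets a string in UTF8, normalises it to NFKC form in place and removes zero-width characters
+ * @param start start of the string, the buffer is rewritten in place
+ * @param len length of the string on input, length of the normalised string on output
+ * @return combination of rspamd_utf8_normalise_result flags (RSPAMD_UNICODE_NORM_NORMAL if nothing has been changed)
+ */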
+enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len);
+
+/**
+ * Transliterate a string to ASCII
+ * @param start
+ * @param len
+ * @param target_len
+ * @return a new string that should be freed with g_free
+ */
+gchar *rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len);
+
+/**
+ * Compare two strings using libicu collator
+ * @param s1
+ * @param s2
+ * @param n
+ * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2.
+ */
+int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n);
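+/**
+ * Similar to rspamd_utf8_strcmp but accepts two sizes
+ * @param s1 first string
+ * @param n1 length of the first string in bytes
+ * @param s2 second string
+ * @param n2 length of the second string in bytes
+ * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2.
+ */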
+int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//RSPAMD_UTF8_UTIL_H
diff --git a/src/libutil/cxx/util.hxx b/src/libutil/cxx/util.hxx
new file mode 100644
index 0000000..32ec0b5
--- /dev/null
+++ b/src/libutil/cxx/util.hxx
@@ -0,0 +1,238 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_UTIL_HXX
+#define RSPAMD_UTIL_HXX
+
+#pragma once
+
+#include <memory>
+#include <array>
+#include <string_view>
+#include <optional>
+#include <tuple>
+#include <algorithm>
+
+/*
+ * Common C++ utilities
+ */
+
+namespace rspamd {
+/*
+ * Creates std::array from a list of values with automatic size calculation.
+ * The element type is the decayed common type of all arguments; every argument
+ * is forwarded with its own value category and converted to that type, so
+ * lvalue arguments are copied (never moved from) and arguments of different
+ * but compatible types (e.g. int and long) can be mixed.
+ */
+template<typename... Ts>
+constexpr auto array_of(Ts &&...t) -> std::array<typename std::decay_t<typename std::common_type_t<Ts...>>, sizeof...(Ts)>
+{
+	using T = typename std::decay_t<typename std::common_type_t<Ts...>>;
+	return {{static_cast<T>(std::forward<Ts>(t))...}};
+}
+
+/**
+ * Looks a key up in a map-like container without copying the value
+ * @tparam C Map type
+ * @tparam K Key type
+ * @tparam V Value type
+ * @param c Map to search
+ * @param k Key to search
+ * @return Constant reference (wrapped) to the stored value if found or std::nullopt otherwise
+ */
+template<class C, class K, class V = typename C::mapped_type, typename std::enable_if_t<std::is_constructible_v<typename C::key_type, K> && std::is_constructible_v<typename C::mapped_type, V>, bool> = false>
+constexpr auto find_map(const C &c, const K &k) -> std::optional<std::reference_wrapper<const V>>
+{
+	if (const auto pos = c.find(k); pos != c.end()) {
+		/* The reference points into the container, so it is valid only while the element lives */
+		return std::cref<V>(pos->second);
+	}
+
+	return std::nullopt;
+}
+
+
+/**
+ * Builds a string view over the characters in [begin, end)
+ * @tparam It iterator over contiguous characters
+ * @param begin first character
+ * @param end one past the last character
+ * @return view of the range; a reversed range (end before begin) gives an empty view
+ */
+template<typename It>
+inline constexpr auto make_string_view_from_it(It begin, It end)
+{
+	using result_type = std::string_view;
+	using diff_type = typename result_type::difference_type;
+
+	/* A negative distance is clamped to zero */
+	const auto span = std::max(std::distance(begin, end), diff_type{0});
+	/* Never dereference the iterator of an empty range */
+	const auto *first = (begin != end) ? &*begin : nullptr;
+
+	return result_type{first, static_cast<typename result_type::size_type>(span)};
+}
+
+/**
+ * Iterate over lines in a string, newline characters are dropped.
+ * Lines are separated by '\n'; carriage returns directly before the separator
+ * (or at the very end of the input) are stripped as well. Empty lines are
+ * passed to the functor as empty views; a trailing '\n' does not produce an
+ * extra empty line.
+ * @tparam S string type, must be convertible to std::string_view
+ * @tparam F functor type, called as functor(std::string_view)
+ * @param input string to split into lines
+ * @param functor functor to call for each line
+ * @return nothing
+ */
+template<class S, class F, typename std::enable_if_t<std::is_invocable_v<F, std::string_view> && std::is_constructible_v<std::string_view, S>, bool> = true>
+inline auto string_foreach_line(const S &input, const F &functor)
+{
+	auto it = input.begin();
+	const auto end = input.end();
+
+	while (it != end) {
+		/* [it, eol) is the current line, eol is either at '\n' or at the end of the input */
+		const auto eol = std::find(it, end, '\n');
+		auto line_end = eol;
+
+		/* Strip '\r' of "\r\n" endings; never steps before the start of the line */
+		while (line_end != it && *(line_end - 1) == '\r') {
+			--line_end;
+		}
+
+		const auto line_len = static_cast<std::string_view::size_type>(line_end - it);
+		/* `it` is dereferenced only for non-empty lines */
+		functor(line_len > 0 ? std::string_view{&*it, line_len} : std::string_view{});
+
+		/* Continue after the separator; without one this was the last line */
+		it = (eol != end) ? eol + 1 : end;
+	}
+}
+
+/**
+ * Iterate over elements in a string. Every character of `delim` acts as a
+ * separator on its own (the delimiter is a character set, not a substring).
+ * @tparam S string type
+ * @tparam D delimiter type
+ * @tparam F functor type
+ * @param input string to iterate
+ * @param delim set of delimiter characters
+ * @param functor functor to call for each element
+ * @param ignore_empty ignore empty elements
+ * @return nothing
+ */
+template<class S, class D, class F,
+		 typename std::enable_if_t<std::is_invocable_v<F, std::string_view> && std::is_constructible_v<std::string_view, S> && std::is_constructible_v<std::string_view, D>, bool> = true>
+inline auto string_foreach_delim(const S &input, const D &delim, const F &functor, const bool ignore_empty = true) -> void
+{
+	const std::string_view haystack{input};
+	const std::string_view separators{delim};
+
+	for (size_t start = 0; start < haystack.size();) {
+		const auto stop = haystack.find_first_of(separators, start);
+		const bool skip = ignore_empty && (stop == start);
+
+		if (!skip) {
+			/* With stop == npos the count is clamped by substr: the rest of the input */
+			functor(haystack.substr(start, stop - start));
+		}
+
+		if (stop == std::string_view::npos) {
+			break;
+		}
+
+		start = stop + 1;
+	}
+}
+
+/**
+ * Split string on a character: the first element is everything before the
+ * first occurrence of `chr`, the second one starts after the whole run of
+ * `chr` characters that follows. The views point into `input`.
+ * @tparam S string type
+ * @param input string to split
+ * @param chr character to split on
+ * @return pair of strings; {input, empty} if `chr` is not found
+ */
+template<class S, typename std::enable_if_t<std::is_constructible_v<std::string_view, S>, bool> = true>
+inline auto string_split_on(const S &input, std::string_view::value_type chr) -> std::pair<std::string_view, std::string_view>
+{
+	/* Work on a view: index based, so nothing is ever dereferenced past the end */
+	const std::string_view sv{input};
+	const auto sep = sv.find(chr);
+
+	if (sep == std::string_view::npos) {
+		return {sv, std::string_view{}};
+	}
+
+	/* Skip all consecutive separators; npos means that the input ends with them */
+	const auto rest = sv.find_first_not_of(chr, sep);
+
+	return {sv.substr(0, sep),
+			sv.substr(rest == std::string_view::npos ? sv.size() : rest)};
+}
+
+/**
+ * Enumerate for range loop: yields (index, element) tuples, the index starts at 0
+ * @tparam T iterable type
+ * @tparam TIter iterator type
+ * @param iterable iterable object (an rvalue is moved into the returned wrapper)
+ * @return wrapper object usable in a range-based for loop
+ */
+template<typename T,
+		 typename TIter = decltype(std::begin(std::declval<T>())),
+		 typename = decltype(std::end(std::declval<T>()))>
+constexpr auto enumerate(T &&iterable)
+{
+	/* Couples the underlying iterator with a running index */
+	struct iterator {
+		size_t i;
+		TIter iter;
+		auto operator*() const
+		{
+			/* Tuple of references: to the index kept here and to the current element */
+			return std::tie(i, *iter);
+		}
+		void operator++()
+		{
+			++iter;
+			++i;
+		}
+		bool operator!=(const iterator &other) const
+		{
+			/* Only the position is compared, the index of the end iterator is a dummy */
+			return iter != other.iter;
+		}
+	};
+	struct iterable_wrapper {
+		T iterable;
+		auto begin()
+		{
+			return iterator{size_t{0}, std::begin(iterable)};
+		}
+		auto end()
+		{
+			return iterator{size_t{0}, std::end(iterable)};
+		}
+	};
+
+	return iterable_wrapper{std::forward<T>(iterable)};
+}
+
+/**
+ * Allocator that wipes memory with rspamd_explicit_memzero before it is
+ * returned to std::allocator, so released buffers do not keep their content
+ * @tparam T element type
+ */
+template<class T>
+class secure_mem_allocator : public std::allocator<T> {
+public:
+	using value_type = typename std::allocator<T>::value_type;
+	using size_type = typename std::allocator<T>::size_type;
+	template<class U>
+	struct rebind {
+		typedef secure_mem_allocator<U> other;
+	};
+	secure_mem_allocator() noexcept = default;
+	secure_mem_allocator(const secure_mem_allocator &_) noexcept
+		: std::allocator<T>(_)
+	{
+	}
+	template<class U>
+	explicit secure_mem_allocator(const secure_mem_allocator<U> &) noexcept
+	{
+	}
+
+	/**
+	 * Wipes and releases a block obtained from allocate()
+	 * @param p start of the block
+	 * @param num number of elements (not bytes) in the block
+	 */
+	void deallocate(value_type *p, size_type num) noexcept
+	{
+		/* `num` counts elements, the wipe needs the size in bytes */
+		rspamd_explicit_memzero((void *) p, num * sizeof(value_type));
+		std::allocator<T>::deallocate(p, num);
+	}
+};
+
+
+}// namespace rspamd
+
+#endif//RSPAMD_UTIL_HXX
diff --git a/src/libutil/cxx/util_tests.cxx b/src/libutil/cxx/util_tests.cxx
new file mode 100644
index 0000000..6c3c177
--- /dev/null
+++ b/src/libutil/cxx/util_tests.cxx
@@ -0,0 +1,82 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util.hxx"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+#include <vector>
+
+using namespace rspamd;
+using namespace std::literals::string_view_literals;
+
+TEST_SUITE("cxx utils")
+{
+	/* Each case: {input, separator, expected {head, tail}} */
+	TEST_CASE("string_split_on")
+	{
+		std::tuple<std::string_view, char, std::pair<std::string_view, std::string_view>> cases[] = {
+			{"test test"sv, ' ', std::pair{"test"sv, "test"sv}},
+			{"test test"sv, ' ', std::pair{"test"sv, "test"sv}},
+			{"test test "sv, ' ', std::pair{"test"sv, "test "sv}},
+			{"testtest "sv, ' ', std::pair{"testtest"sv, ""sv}},
+			{" testtest "sv, ' ', std::pair{""sv, "testtest "sv}},
+			{"testtest"sv, ' ', std::pair{"testtest"sv, ""sv}},
+			{""sv, ' ', std::pair{""sv, ""sv}},
+		};
+
+		for (const auto &[input, separator, expected]: cases) {
+			const auto [head, tail] = string_split_on(input, separator);
+			CHECK(head == expected.first);
+			CHECK(tail == expected.second);
+		}
+	}
+
+	/* Each case: {input, delimiters, {expected with empty elements skipped, expected with empty elements kept}} */
+	TEST_CASE("string_foreach_delim")
+	{
+		std::tuple<std::string_view, std::string_view, std::pair<std::vector<std::string_view>, std::vector<std::string_view>>> cases[] = {
+			{"test"sv, ","sv, {{"test"}, {"test"}}},
+			{"test,test"sv, ","sv, {{"test", "test"}, {"test", "test"}}},
+			{"test, test"sv, ", "sv, {{"test", "test"}, {"test", "", "test"}}},
+			{"test, test,,"sv, ", "sv, {{"test", "test"}, {"test", "", "test", ""}}},
+		};
+
+		auto compare_vec = []<class T>(const std::vector<T> &v1, const std::vector<T> &v2) {
+			/* Stop here on a size mismatch: the loop below indexes both vectors */
+			REQUIRE(v1.size() == v2.size());
+			for (size_t i = 0; i < v1.size(); ++i) {
+				CHECK(v1[i] == v2[i]);
+			}
+		};
+
+		for (const auto &c: cases) {
+			auto res = std::vector<std::string_view>();
+			string_foreach_delim(std::get<0>(c), std::get<1>(c), [&](const auto &v) {
+				res.push_back(v);
+			});
+
+			compare_vec(res, std::get<2>(c).first);
+
+			res.clear();
+			// Perform the same test but with no skip empty
+			string_foreach_delim(
+				std::get<0>(c), std::get<1>(c), [&](const auto &v) {
+					res.push_back(v);
+				},
+				false);
+			compare_vec(res, std::get<2>(c).second);
+		}
+	}
+} \ No newline at end of file