diff options
Diffstat (limited to '')
-rw-r--r-- | src/libutil/cxx/error.hxx | 161 | ||||
-rw-r--r-- | src/libutil/cxx/file_util.cxx | 457 | ||||
-rw-r--r-- | src/libutil/cxx/file_util.hxx | 312 | ||||
-rw-r--r-- | src/libutil/cxx/hash_util.hxx | 109 | ||||
-rw-r--r-- | src/libutil/cxx/local_shared_ptr.hxx | 440 | ||||
-rw-r--r-- | src/libutil/cxx/utf8_util.cxx | 421 | ||||
-rw-r--r-- | src/libutil/cxx/utf8_util.h | 85 | ||||
-rw-r--r-- | src/libutil/cxx/util.hxx | 238 | ||||
-rw-r--r-- | src/libutil/cxx/util_tests.cxx | 82 |
9 files changed, 2305 insertions, 0 deletions
diff --git a/src/libutil/cxx/error.hxx b/src/libutil/cxx/error.hxx new file mode 100644 index 0000000..4689d42 --- /dev/null +++ b/src/libutil/cxx/error.hxx @@ -0,0 +1,161 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_ERROR_HXX +#define RSPAMD_ERROR_HXX +#pragma once + +#include "config.h" +#include <string> +#include <string_view> +#include <cstdint> +#include <optional> + +/*** + * This unit is used to represent Rspamd C++ errors in a way to interoperate + * with C code if needed and avoid allocations for static strings + */ +namespace rspamd::util { + +enum class error_category : std::uint8_t { + INFORMAL, + IMPORTANT, + CRITICAL +}; + +struct error { +public: + /** + * Construct from a static string, this string must live long enough to outlive this object + * @param msg + * @param code + * @param category + */ + error(const char *msg, int code, error_category category = error_category::INFORMAL) + : error_message(msg), error_code(code), category(category) + { + } + /** + * Construct error from a temporary string taking membership + * @param msg + * @param code + * @param category + */ + error(std::string &&msg, int code, error_category category = error_category::INFORMAL) + : error_code(code), category(category) + { + static_storage = std::move(msg); + error_message = static_storage.value(); + } + /** + * Construct error from another string copying it into own storage + * @param msg + * @param code + * @param category + */ + error(const std::string &msg, int code, error_category category = error_category::INFORMAL) + : error_code(code), category(category) + { + static_storage = msg; + error_message = static_storage.value(); + } + + error(const error &other) + : error_code(other.error_code), category(other.category) + { + if (other.static_storage) { + static_storage = other.static_storage; + error_message = static_storage.value(); + } + else { + error_message = other.error_message; + } + } + + error(error &&other) noexcept + { + *this = std::move(other); + } + + error &operator=(error &&other) noexcept + { + if (other.static_storage.has_value()) { + std::swap(static_storage, other.static_storage); + error_message = static_storage.value(); + } + else { + std::swap(error_message, other.error_message); + } + std::swap(other.error_code, error_code); + std::swap(other.category, category); + + return *this; + } + + /** + * Convert into GError + * @return + */ + auto into_g_error() const -> GError * + { + return g_error_new(g_quark_from_static_string("rspamd"), error_code, "%s", + error_message.data()); + } + + /** + * Convenience alias for the `into_g_error` + * @param err + */ + auto into_g_error_set(GError **err) const -> void + { + if (err && *err == nullptr) { + *err = into_g_error(); + } + } + + /** + * Convert into GError + * @return + */ + auto into_g_error(GQuark quark) const -> GError * + { + return g_error_new(quark, error_code, "%s", + error_message.data()); + } + + /** + * Convenience alias for the `into_g_error` + * @param err + */ + auto into_g_error_set(GQuark quark, GError **err) const -> void + { + if (err && *err == nullptr) { + *err = into_g_error(quark); + } + } + +public: + std::string_view error_message; + int error_code; + error_category category; + +private: + std::optional<std::string> static_storage; +}; + +}// namespace rspamd::util + +#endif//RSPAMD_ERROR_HXX diff --git a/src/libutil/cxx/file_util.cxx b/src/libutil/cxx/file_util.cxx new file mode 100644 index 0000000..2f031f0 --- /dev/null +++ b/src/libutil/cxx/file_util.cxx @@ -0,0 +1,457 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "file_util.hxx" +#include <fmt/core.h> +#include "libutil/util.h" +#include "libutil/unix-std.h" + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL + +#include "doctest/doctest.h" + +namespace rspamd::util { + +auto raii_file::open(const char *fname, int flags) -> tl::expected<raii_file, error> +{ + int oflags = flags; +#ifdef O_CLOEXEC + oflags |= O_CLOEXEC; +#endif + + if (fname == nullptr) { + return tl::make_unexpected(error{"cannot open file; filename is nullptr", EINVAL, error_category::CRITICAL}); + } + + auto fd = ::open(fname, oflags); + + if (fd == -1) { + return tl::make_unexpected(error{fmt::format("cannot open file {}: {}", fname, ::strerror(errno)), errno}); + } + + auto ret = raii_file{fname, fd, false}; + + if (fstat(ret.fd, &ret.st) == -1) { + return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", fname, ::strerror(errno)), errno}); + } + + return ret; +} + +auto raii_file::create(const char *fname, int flags, int perms) -> tl::expected<raii_file, error> +{ + int oflags = flags | O_CREAT; +#ifdef O_CLOEXEC + oflags |= O_CLOEXEC; +#endif + + if (fname == nullptr) { + return tl::make_unexpected(error{"cannot create file; filename is nullptr", EINVAL, error_category::CRITICAL}); + } + + auto fd = ::open(fname, oflags, perms); + + if (fd == -1) { + return tl::make_unexpected(error{fmt::format("cannot create file {}: {}", fname, ::strerror(errno)), errno}); + } + + auto ret = raii_file{fname, fd, false}; + + if (fstat(ret.fd, &ret.st) == -1) { + return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", fname, ::strerror(errno)), errno}); + } + + return ret; +} + +auto raii_file::create_temp(const char *fname, int flags, int perms) -> tl::expected<raii_file, error> +{ + int oflags = flags; +#ifdef O_CLOEXEC + oflags |= O_CLOEXEC | O_CREAT | O_EXCL; +#endif + if (fname == nullptr) { + return tl::make_unexpected(error{"cannot open file; filename is nullptr", EINVAL, error_category::CRITICAL}); + } + + auto fd = ::open(fname, oflags, perms); + + if (fd == -1) { + return tl::make_unexpected(error{fmt::format("cannot create file {}: {}", fname, ::strerror(errno)), errno}); + } + + auto ret = raii_file{fname, fd, true}; + + if (fstat(ret.fd, &ret.st) == -1) { + return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", fname, ::strerror(errno)), errno}); + } + + return ret; +} + +auto raii_file::mkstemp(const char *pattern, int flags, int perms) -> tl::expected<raii_file, error> +{ + int oflags = flags; +#ifdef O_CLOEXEC + oflags |= O_CLOEXEC | O_CREAT | O_EXCL; +#endif + if (pattern == nullptr) { + return tl::make_unexpected(error{"cannot open file; pattern is nullptr", EINVAL, error_category::CRITICAL}); + } + + std::string mutable_pattern = pattern; + + auto fd = g_mkstemp_full(mutable_pattern.data(), oflags, perms); + + if (fd == -1) { + return tl::make_unexpected(error{fmt::format("cannot create file {}: {}", pattern, ::strerror(errno)), errno}); + } + + auto ret = raii_file{mutable_pattern.c_str(), fd, true}; + + if (fstat(ret.fd, &ret.st) == -1) { + return tl::make_unexpected(error{fmt::format("cannot stat file {}: {}", + mutable_pattern, ::strerror(errno)), + errno}); + } + + return ret; +} + +raii_file::~raii_file() noexcept +{ + if (fd != -1) { + if (temp) { + (void) unlink(fname.c_str()); + } + close(fd); + } +} + +auto raii_file::update_stat() noexcept -> bool +{ + return fstat(fd, &st) != -1; +} + +raii_file::raii_file(const char *fname, int fd, bool temp) + : fd(fd), temp(temp) +{ + std::size_t nsz; + + /* Normalize path */ + this->fname = fname; + rspamd_normalize_path_inplace(this->fname.data(), this->fname.size(), &nsz); + this->fname.resize(nsz); +} + + +raii_locked_file::~raii_locked_file() noexcept +{ + if (fd != -1) { + (void) rspamd_file_unlock(fd, FALSE); + } +} + +auto raii_locked_file::lock_raii_file(raii_file &&unlocked) -> tl::expected<raii_locked_file, error> +{ + if (!rspamd_file_lock(unlocked.get_fd(), TRUE)) { + return tl::make_unexpected( + error{fmt::format("cannot lock file {}: {}", unlocked.get_name(), ::strerror(errno)), errno}); + } + + return raii_locked_file{std::move(unlocked)}; +} + +auto raii_locked_file::unlock() -> raii_file +{ + if (fd != -1) { + (void) rspamd_file_unlock(fd, FALSE); + } + + return raii_file{static_cast<raii_file &&>(std::move(*this))}; +} + +raii_mmaped_file::raii_mmaped_file(raii_file &&file, void *map, std::size_t sz) + : file(std::move(file)), map(map), map_size(sz) +{ +} + +auto raii_mmaped_file::mmap_shared(raii_file &&file, + int flags, std::int64_t offset) -> tl::expected<raii_mmaped_file, error> +{ + void *map; + + if (file.get_stat().st_size < offset || offset < 0) { + return tl::make_unexpected(error{ + fmt::format("cannot mmap file {} due to incorrect offset; offset={}, size={}", + file.get_name(), offset, file.get_size()), + EINVAL}); + } + /* Update stat on file to ensure it is up-to-date */ + file.update_stat(); + map = mmap(nullptr, (std::size_t)(file.get_size() - offset), flags, MAP_SHARED, file.get_fd(), offset); + + if (map == MAP_FAILED) { + return tl::make_unexpected(error{fmt::format("cannot mmap file {}: {}", + file.get_name(), ::strerror(errno)), + errno}); + } + + return raii_mmaped_file{std::move(file), map, (std::size_t)(file.get_size() - offset)}; +} + +auto raii_mmaped_file::mmap_shared(const char *fname, int open_flags, + int mmap_flags, std::int64_t offset) -> tl::expected<raii_mmaped_file, error> +{ + auto file = raii_file::open(fname, open_flags); + + if (!file.has_value()) { + return tl::make_unexpected(file.error()); + } + + return raii_mmaped_file::mmap_shared(std::move(file.value()), mmap_flags, offset); +} + +raii_mmaped_file::~raii_mmaped_file() +{ + if (map != nullptr) { + munmap(map, map_size); + } +} + +raii_mmaped_file::raii_mmaped_file(raii_mmaped_file &&other) noexcept + : file(std::move(other.file)) +{ + std::swap(map, other.map); + std::swap(map_size, other.map_size); +} + +auto raii_file_sink::create(const char *fname, int flags, int perms, + const char *suffix) -> tl::expected<raii_file_sink, error> +{ + if (!fname || !suffix) { + return tl::make_unexpected(error{"cannot create file; filename is nullptr", EINVAL, error_category::CRITICAL}); + } + + auto tmp_fname = fmt::format("{}.{}", fname, suffix); + auto file = raii_locked_file::create(tmp_fname.c_str(), flags, perms); + + if (!file.has_value()) { + return tl::make_unexpected(file.error()); + } + + return raii_file_sink{std::move(file.value()), fname, std::move(tmp_fname)}; +} + +auto raii_file_sink::write_output() -> bool +{ + if (success) { + /* We cannot write output twice */ + return false; + } + + if (rename(tmp_fname.c_str(), output_fname.c_str()) == -1) { + return false; + } + + success = true; + + return true; +} + +raii_file_sink::~raii_file_sink() +{ + if (!success) { + /* Unlink sink */ + unlink(tmp_fname.c_str()); + } +} + +raii_file_sink::raii_file_sink(raii_locked_file &&_file, const char *_output, std::string &&_tmp_fname) + : file(std::move(_file)), output_fname(_output), tmp_fname(std::move(_tmp_fname)), success(false) +{ +} + +raii_file_sink::raii_file_sink(raii_file_sink &&other) noexcept + : file(std::move(other.file)), + output_fname(std::move(other.output_fname)), + tmp_fname(std::move(other.tmp_fname)), + success(other.success) +{ +} + +namespace tests { +template<class T> +static auto test_read_file(const T &f) +{ + auto fd = f.get_fd(); + (void) ::lseek(fd, 0, SEEK_SET); + std::string buf('\0', (std::size_t) f.get_size()); + ::read(fd, buf.data(), buf.size()); + return buf; +} +template<class T> +static auto test_write_file(const T &f, const std::string_view &buf) +{ + auto fd = f.get_fd(); + (void) ::lseek(fd, 0, SEEK_SET); + return ::write(fd, buf.data(), buf.size()); +} +auto random_fname(std::string_view extension) +{ + const auto *tmpdir = getenv("TMPDIR"); + if (tmpdir == nullptr) { + tmpdir = G_DIR_SEPARATOR_S "tmp"; + } + + std::string out_fname{tmpdir}; + out_fname += G_DIR_SEPARATOR_S; + + char hexbuf[32]; + rspamd_random_hex(hexbuf, sizeof(hexbuf)); + out_fname.append((const char *) hexbuf, sizeof(hexbuf)); + if (!extension.empty()) { + out_fname.append("."); + out_fname.append(extension); + } + + return out_fname; +} +TEST_SUITE("loked files utils") +{ + + TEST_CASE("create and delete file") + { + auto fname = random_fname("tmp"); + { + auto raii_locked_file = raii_locked_file::create_temp(fname.c_str(), O_RDONLY, 00600); + CHECK(raii_locked_file.has_value()); + CHECK(raii_locked_file.value().get_extension() == "tmp"); + CHECK(::access(fname.c_str(), R_OK) == 0); + } + // File must be deleted after this call + auto ret = ::access(fname.c_str(), R_OK); + auto serrno = errno; + CHECK(ret == -1); + CHECK(serrno == ENOENT); + // Create one more time + { + auto raii_locked_file = raii_locked_file::create_temp(fname.c_str(), O_RDONLY, 00600); + CHECK(raii_locked_file.has_value()); + CHECK(::access(fname.c_str(), R_OK) == 0); + } + ret = ::access(fname.c_str(), R_OK); + serrno = errno; + CHECK(ret == -1); + CHECK(serrno == ENOENT); + } + + TEST_CASE("check lock") + { + auto fname = random_fname(""); + { + auto raii_locked_file = raii_locked_file::create_temp(fname.c_str(), O_RDONLY, 00600); + CHECK(raii_locked_file.has_value()); + CHECK(raii_locked_file.value().get_extension() == ""); + CHECK(::access(fname.c_str(), R_OK) == 0); + auto raii_locked_file2 = raii_locked_file::open(fname.c_str(), O_RDONLY); + CHECK(!raii_locked_file2.has_value()); + CHECK(::access(fname.c_str(), R_OK) == 0); + } + // File must be deleted after this call + auto ret = ::access(fname.c_str(), R_OK); + auto serrno = errno; + CHECK(ret == -1); + CHECK(serrno == ENOENT); + } + + auto get_tmpdir()->std::string + { + const auto *tmpdir = getenv("TMPDIR"); + if (tmpdir == nullptr) { + tmpdir = G_DIR_SEPARATOR_S "tmp"; + } + + std::size_t sz; + std::string mut_fname = tmpdir; + rspamd_normalize_path_inplace(mut_fname.data(), mut_fname.size(), &sz); + mut_fname.resize(sz); + + if (!mut_fname.ends_with(G_DIR_SEPARATOR)) { + mut_fname += G_DIR_SEPARATOR; + } + + return mut_fname; + } + + TEST_CASE("tempfile") + { + std::string tmpname; + const std::string tmpdir{get_tmpdir()}; + { + auto raii_locked_file = raii_locked_file::mkstemp(std::string(tmpdir + G_DIR_SEPARATOR_S + "doctest-XXXXXXXX").c_str(), + O_RDONLY, 00600); + CHECK(raii_locked_file.has_value()); + CHECK(raii_locked_file.value().get_dir() == tmpdir); + CHECK(access(raii_locked_file.value().get_name().data(), R_OK) == 0); + auto raii_locked_file2 = raii_locked_file::open(raii_locked_file.value().get_name().data(), O_RDONLY); + CHECK(!raii_locked_file2.has_value()); + CHECK(access(raii_locked_file.value().get_name().data(), R_OK) == 0); + tmpname = raii_locked_file.value().get_name(); + } + // File must be deleted after this call + auto ret = ::access(tmpname.c_str(), R_OK); + auto serrno = errno; + CHECK(ret == -1); + CHECK(serrno == ENOENT); + } + + TEST_CASE("mmap") + { + std::string tmpname; + const std::string tmpdir{get_tmpdir()}; + { + auto raii_file = raii_file::mkstemp(std::string(tmpdir + G_DIR_SEPARATOR_S + "doctest-XXXXXXXX").c_str(), + O_RDWR | O_CREAT | O_EXCL, 00600); + CHECK(raii_file.has_value()); + CHECK(raii_file->get_dir() == tmpdir); + CHECK(access(raii_file->get_name().data(), R_OK) == 0); + tmpname = std::string{raii_file->get_name()}; + char payload[] = {'1', '2', '3'}; + CHECK(write(raii_file->get_fd(), payload, sizeof(payload)) == sizeof(payload)); + auto mmapped_file1 = raii_mmaped_file::mmap_shared(std::move(raii_file.value()), PROT_READ | PROT_WRITE); + CHECK(mmapped_file1.has_value()); + CHECK(!raii_file->is_valid()); + CHECK(mmapped_file1->get_size() == sizeof(payload)); + CHECK(memcmp(mmapped_file1->get_map(), payload, sizeof(payload)) == 0); + *(char *) mmapped_file1->get_map() = '2'; + auto mmapped_file2 = raii_mmaped_file::mmap_shared(tmpname.c_str(), O_RDONLY, PROT_READ); + CHECK(mmapped_file2.has_value()); + CHECK(mmapped_file2->get_size() == sizeof(payload)); + CHECK(memcmp(mmapped_file2->get_map(), payload, sizeof(payload)) != 0); + CHECK(memcmp(mmapped_file2->get_map(), mmapped_file1->get_map(), sizeof(payload)) == 0); + } + // File must be deleted after this call + auto ret = ::access(tmpname.c_str(), R_OK); + auto serrno = errno; + CHECK(ret == -1); + CHECK(serrno == ENOENT); + } + +}// TEST_SUITE + +}// namespace tests + +}// namespace rspamd::util diff --git a/src/libutil/cxx/file_util.hxx b/src/libutil/cxx/file_util.hxx new file mode 100644 index 0000000..4528905 --- /dev/null +++ b/src/libutil/cxx/file_util.hxx @@ -0,0 +1,312 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_FILE_UTIL_HXX +#define RSPAMD_FILE_UTIL_HXX +#pragma once + +#include "config.h" +#include "contrib/expected/expected.hpp" +#include "libutil/cxx/error.hxx" +#include <string> +#include <sys/stat.h> + +namespace rspamd::util { +/** + * A simple RAII object to contain a move only file descriptor + * A file is unlocked and closed when not needed + */ +struct raii_file { +public: + virtual ~raii_file() noexcept; + + static auto open(const char *fname, int flags) -> tl::expected<raii_file, error>; + static auto open(const std::string &fname, int flags) -> tl::expected<raii_file, error> + { + return open(fname.c_str(), flags); + }; + static auto create(const char *fname, int flags, int perms) -> tl::expected<raii_file, error>; + static auto create(const std::string &fname, int flags, int perms) -> tl::expected<raii_file, error> + { + return create(fname.c_str(), flags, perms); + }; + + static auto create_temp(const char *fname, int flags, int perms) -> tl::expected<raii_file, error>; + static auto mkstemp(const char *pattern, int flags, int perms) -> tl::expected<raii_file, error>; + + auto get_fd() const -> int + { + return fd; + } + + auto get_stat() const -> const struct stat & + { + return st; + }; + + auto get_size() const -> std::size_t + { + return st.st_size; + }; + + auto get_name() const -> std::string_view + { + return std::string_view{fname}; + } + + auto get_dir() const -> std::string_view + { + auto sep_pos = fname.rfind(G_DIR_SEPARATOR); + + if (sep_pos == std::string::npos) { + return std::string_view{fname}; + } + + while (sep_pos >= 1 && fname[sep_pos - 1] == G_DIR_SEPARATOR) { + sep_pos--; + } + + return std::string_view{fname.c_str(), sep_pos + 1}; + } + + auto get_extension() const -> std::string_view + { + auto sep_pos = fname.rfind(G_DIR_SEPARATOR); + + if (sep_pos == std::string::npos) { + sep_pos = 0; + } + + auto filename = std::string_view{fname.c_str() + sep_pos}; + auto dot_pos = filename.find('.'); + + if (dot_pos == std::string::npos) { + return std::string_view{}; + } + else { + return std::string_view{filename.data() + dot_pos + 1, filename.size() - dot_pos - 1}; + } + } + + raii_file &operator=(raii_file &&other) noexcept + { + std::swap(fd, other.fd); + std::swap(temp, other.temp); + std::swap(fname, other.fname); + std::swap(st, other.st); + + return *this; + } + + raii_file(raii_file &&other) noexcept + { + *this = std::move(other); + } + + /** + * Prevent file from being deleted + * @return + */ + auto make_immortal() noexcept + { + temp = false; + } + + /** + * Performs fstat on an opened file to refresh internal stat + * @return + */ + auto update_stat() noexcept -> bool; + + auto is_valid() noexcept -> bool + { + return fd != -1; + } + + /* Do not allow copy/default ctor */ + const raii_file &operator=(const raii_file &other) = delete; + raii_file() = delete; + raii_file(const raii_file &other) = delete; + +protected: + int fd = -1; + bool temp; + std::string fname; + struct stat st; + + explicit raii_file(const char *fname, int fd, bool temp); +}; +/** + * A simple RAII object to contain a file descriptor with an flock wrap + * A file is unlocked and closed when not needed + */ +struct raii_locked_file final : public raii_file { +public: + ~raii_locked_file() noexcept override; + + static auto open(const char *fname, int flags) -> tl::expected<raii_locked_file, error> + { + auto locked = raii_file::open(fname, flags).and_then([]<class T>(T &&file) { + return lock_raii_file(std::forward<T>(file)); + }); + + return locked; + } + static auto create(const char *fname, int flags, int perms) -> tl::expected<raii_locked_file, error> + { + auto locked = raii_file::create(fname, flags, perms).and_then([]<class T>(T &&file) { + return lock_raii_file(std::forward<T>(file)); + }); + + return locked; + } + static auto create_temp(const char *fname, int flags, int perms) -> tl::expected<raii_locked_file, error> + { + auto locked = raii_file::create_temp(fname, flags, perms).and_then([]<class T>(T &&file) { + return lock_raii_file(std::forward<T>(file)); + }); + + return locked; + } + static auto mkstemp(const char *pattern, int flags, int perms) -> tl::expected<raii_locked_file, error> + { + auto locked = raii_file::mkstemp(pattern, flags, perms).and_then([]<class T>(T &&file) { + return lock_raii_file(std::forward<T>(file)); + }); + + return locked; + } + + raii_locked_file &operator=(raii_locked_file &&other) noexcept + { + std::swap(fd, other.fd); + std::swap(temp, other.temp); + std::swap(fname, other.fname); + std::swap(st, other.st); + + return *this; + } + + /** + * Unlock a locked file and return back unlocked file transferring ownership. + * A locked file cannot be used after this method. + */ + auto unlock() -> raii_file; + + raii_locked_file(raii_locked_file &&other) noexcept + : raii_file(static_cast<raii_file &&>(std::move(other))) + { + } + /* Do not allow copy/default ctor */ + const raii_locked_file &operator=(const raii_locked_file &other) = delete; + raii_locked_file() = delete; + raii_locked_file(const raii_locked_file &other) = delete; + +private: + static auto lock_raii_file(raii_file &&unlocked) -> tl::expected<raii_locked_file, error>; + raii_locked_file(raii_file &&other) noexcept + : raii_file(std::move(other)) + { + } + explicit raii_locked_file(const char *fname, int fd, bool temp) + : raii_file(fname, fd, temp) + { + } +}; + +/** + * A mmap wrapper on top of a locked file + */ +struct raii_mmaped_file final { + ~raii_mmaped_file(); + static auto mmap_shared(raii_file &&file, int flags, std::int64_t offset = 0) -> tl::expected<raii_mmaped_file, error>; + static auto mmap_shared(const char *fname, int open_flags, int mmap_flags, std::int64_t offset = 0) -> tl::expected<raii_mmaped_file, error>; + // Returns a constant pointer to the underlying map + auto get_map() const -> void * + { + return map; + } + auto get_file() const -> const raii_file & + { + return file; + } + // Passes the ownership of the mmaped memory to the callee + auto steal_map() -> std::tuple<void *, std::size_t> + { + auto ret = std::make_tuple(this->map, map_size); + this->map = nullptr; + return ret; + } + + auto get_size() const -> std::size_t + { + return file.get_stat().st_size; + } + + raii_mmaped_file &operator=(raii_mmaped_file &&other) noexcept + { + std::swap(map, other.map); + std::swap(map_size, other.map_size); + file = std::move(other.file); + + return *this; + } + + raii_mmaped_file(raii_mmaped_file &&other) noexcept; + + /* Do not allow copy/default ctor */ + const raii_mmaped_file &operator=(const raii_mmaped_file &other) = delete; + raii_mmaped_file() = delete; + raii_mmaped_file(const raii_mmaped_file &other) = delete; + +private: + /* Is intended to be used with map_shared */ + explicit raii_mmaped_file(raii_file &&_file, void *_map, std::size_t sz); + raii_file file; + void *map = nullptr; + std::size_t map_size; +}; + +/** + * A helper to have a file to write that will be renamed to the + * target file if successful or deleted in the case of failure + */ +struct raii_file_sink final { + static auto create(const char *fname, int flags, int perms, const char *suffix = "new") + -> tl::expected<raii_file_sink, error>; + auto write_output() -> bool; + ~raii_file_sink(); + auto get_fd() const -> int + { + return file.get_fd(); + } + + raii_file_sink(raii_file_sink &&other) noexcept; + /* Do not allow copy/default ctor */ + const raii_file_sink &operator=(const raii_file_sink &other) = delete; + raii_file_sink() = delete; + raii_file_sink(const raii_file_sink &other) = delete; + +private: + explicit raii_file_sink(raii_locked_file &&_file, const char *_output, std::string &&_tmp_fname); + raii_locked_file file; + std::string output_fname; + std::string tmp_fname; + bool success; +}; + +}// namespace rspamd::util + +#endif//RSPAMD_FILE_UTIL_HXX diff --git a/src/libutil/cxx/hash_util.hxx b/src/libutil/cxx/hash_util.hxx new file mode 100644 index 0000000..05f3d97 --- /dev/null +++ b/src/libutil/cxx/hash_util.hxx @@ -0,0 +1,109 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_HASH_UTIL_HXX +#define RSPAMD_HASH_UTIL_HXX + +#pragma once + +#include <string_view> +#include <string> +#include "contrib/ankerl/unordered_dense.h" + + +namespace rspamd { +/* + * Transparent smart pointers hashing + */ +template<typename T> +struct smart_ptr_equal { + using is_transparent = void; /* We want to find values in a set of shared_ptr by reference */ + auto operator()(const std::shared_ptr<T> &a, const std::shared_ptr<T> &b) const + { + return (*a) == (*b); + } + auto operator()(const std::shared_ptr<T> &a, const T &b) const + { + return (*a) == b; + } + auto operator()(const T &a, const std::shared_ptr<T> &b) const + { + return a == (*b); + } + auto operator()(const std::unique_ptr<T> &a, const std::unique_ptr<T> &b) const + { + return (*a) == (*b); + } + auto operator()(const std::unique_ptr<T> &a, const T &b) const + { + return (*a) == b; + } + auto operator()(const T &a, const std::unique_ptr<T> &b) const + { + return a == (*b); + } +}; + +template<typename T> +struct smart_ptr_hash { + using is_transparent = void; /* We want to find values in a set of shared_ptr by reference */ + using is_avalanching = void; + auto operator()(const std::shared_ptr<T> &a) const + { + return std::hash<T>()(*a); + } + auto operator()(const std::unique_ptr<T> &a) const + { + return std::hash<T>()(*a); + } + auto operator()(const T &a) const + { + return std::hash<T>()(a); + } +}; + +/* Enable lookup by string view */ +struct smart_str_equal { + using is_transparent = void; + auto operator()(const std::string &a, const std::string &b) const + { + return a == b; + } + auto operator()(const std::string_view &a, const std::string &b) const + { + return a == b; + } + auto operator()(const std::string &a, const std::string_view &b) const + { + return a == b; + } +}; + +struct smart_str_hash { + using is_transparent = void; + using is_avalanching = void; + auto operator()(const std::string &a) const + { + return ankerl::unordered_dense::hash<std::string>()(a); + } + auto operator()(const std::string_view &a) const + { + return ankerl::unordered_dense::hash<std::string_view>()(a); + } +}; + +}// namespace rspamd + +#endif//RSPAMD_HASH_UTIL_HXX diff --git a/src/libutil/cxx/local_shared_ptr.hxx b/src/libutil/cxx/local_shared_ptr.hxx new file mode 100644 index 0000000..78ed5ba --- /dev/null +++ b/src/libutil/cxx/local_shared_ptr.hxx @@ -0,0 +1,440 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_LOCAL_SHARED_PTR_HXX +#define RSPAMD_LOCAL_SHARED_PTR_HXX + +#pragma once + +#include <memory> +#include <algorithm> // for std::swap +#include <cstddef> // for std::size_t +#include <functional>// for std::less + +/* + * Smart pointers with no atomic refcounts to speed up Rspamd which is + * apparently single threaded + */ +namespace rspamd { + +namespace detail { + +class ref_cnt { +public: + using refcount_t = int; + + constexpr auto add_shared() -> refcount_t + { + return ++ref_shared; + } + constexpr auto add_weak() -> refcount_t + { + return ++ref_weak; + } + constexpr auto release_shared() -> refcount_t + { + return --ref_shared; + } + constexpr auto release_weak() -> refcount_t + { + return --ref_weak; + } + constexpr auto shared_count() const -> refcount_t + { + return ref_shared; + } + constexpr auto weak_count() const -> refcount_t + { + return ref_weak; + } + virtual ~ref_cnt() + { + } + virtual void dispose() = 0; + +private: + refcount_t ref_weak = 0; + refcount_t ref_shared = 1; +}; + +template<class T> +class obj_and_refcnt : public ref_cnt { +private: + typedef typename std::aligned_storage<sizeof(T), std::alignment_of<T>::value>::type storage_type; + storage_type storage; + bool initialized; + virtual void dispose() override + { + if (initialized) { + T *p = reinterpret_cast<T *>(&storage); + p->~T(); + initialized = false; + } + } + +public: + template<typename... Args> + explicit obj_and_refcnt(Args &&...args) + : initialized(true) + { + new (&storage) T(std::forward<Args>(args)...); + } + auto get(void) -> T * + { + if (initialized) { + return reinterpret_cast<T *>(&storage); + } + + return nullptr; + } + virtual ~obj_and_refcnt() = default; +}; + +template<class T, class D = typename std::default_delete<T>> +class ptr_and_refcnt : public ref_cnt { +private: + T *ptr; + D deleter; + virtual void dispose() override + { + deleter(ptr); + ptr = nullptr; + } + +public: + explicit ptr_and_refcnt(T *_ptr, D &&d = std::default_delete<T>()) + : ptr(_ptr), + deleter(std::move(d)) + { + } + virtual ~ptr_and_refcnt() = default; +}; + +}// namespace detail + +template<class T> +class local_weak_ptr; + +template<class T> +class local_shared_ptr { +public: + typedef T element_type; + typedef local_weak_ptr<T> weak_type; + + // Simplified comparing to libc++, no custom deleter and no rebind here + // constructors: + constexpr local_shared_ptr() noexcept + : px(nullptr), cnt(nullptr) + { + } + + template<class Y, typename std::enable_if< + std::is_convertible<Y *, element_type *>::value, bool>::type = true> + explicit local_shared_ptr(Y *p) + : px(p), cnt(new detail::ptr_and_refcnt(p)) + { + } + + // custom deleter + template<class Y, class D, typename std::enable_if<std::is_convertible<Y *, element_type *>::value, bool>::type = true> + explicit local_shared_ptr(Y *p, D &&d) + : px(p), cnt(new detail::ptr_and_refcnt<Y, D>(p, std::forward<D>(d))) + { + } + + local_shared_ptr(const local_shared_ptr &r) noexcept + : px(r.px), cnt(r.cnt) + { + if (cnt) { + cnt->add_shared(); + } + } + local_shared_ptr(local_shared_ptr &&r) noexcept + : px(r.px), cnt(r.cnt) + { + r.px = nullptr; + r.cnt = nullptr; + } + template<class Y> + explicit local_shared_ptr(const local_weak_ptr<Y> &r) + : px(r.px), cnt(r.cnt) + { + if (cnt) { + cnt->add_shared(); + } + } + local_shared_ptr(std::nullptr_t) + : local_shared_ptr() + { + } + + ~local_shared_ptr() + { + if (cnt) { + if (cnt->release_shared() <= 0) { + cnt->dispose(); + + if (cnt->weak_count() == 0) { + delete cnt; + } + } + } + } + + // assignment: + local_shared_ptr &operator=(const local_shared_ptr &r) noexcept + { + local_shared_ptr(r).swap(*this); + return *this; + } + local_shared_ptr &operator=(local_shared_ptr &&r) noexcept + { + local_shared_ptr(std::move(r)).swap(*this); + return *this; + } + + // Mutators + void swap(local_shared_ptr &r) noexcept + { + std::swap(this->cnt, r.cnt); + std::swap(this->px, r.px); + } + void reset() noexcept + { + local_shared_ptr().swap(*this); + } + + // Observers: + T *get() const noexcept + { + return px; + } + + T &operator*() const noexcept + { + return *px; + } + T *operator->() const noexcept + { + return px; + } + long use_count() const noexcept + { + if (cnt) { + return cnt->shared_count(); + } + + return 0; + } + bool unique() const noexcept + { + return use_count() == 1; + } + + explicit operator bool() const noexcept + { + return px != nullptr; + } + + template<class Y, typename std::enable_if< + std::is_convertible<Y *, element_type *>::value, bool>::type = true> + auto operator==(const local_shared_ptr<Y> &other) const -> bool + { + return px == other.px; + } + + template<class Y, typename std::enable_if< + std::is_convertible<Y *, element_type *>::value, bool>::type = true> + auto operator<(const local_shared_ptr<Y> &other) const -> auto + { + return *px < *other.px; + } + +private: + T *px;// contained pointer + detail::ref_cnt *cnt; + + template<class _T, class... Args> + friend local_shared_ptr<_T> local_make_shared(Args &&...args); + friend class local_weak_ptr<T>; +}; + +template<class T, class... Args> +local_shared_ptr<T> local_make_shared(Args &&...args) +{ + local_shared_ptr<T> ptr; + auto tmp_object = new detail::obj_and_refcnt<T>(std::forward<Args>(args)...); + ptr.px = tmp_object->get(); + ptr.cnt = tmp_object; + + return ptr; +} + +template<class T> +class local_weak_ptr { +public: + typedef T element_type; + + // constructors + constexpr local_weak_ptr() noexcept + : px(nullptr), cnt(nullptr) + { + } + template<class Y, typename std::enable_if< + std::is_convertible<Y *, element_type *>::value, bool>::type = true> + local_weak_ptr(local_shared_ptr<Y> const &r) noexcept + : px(r.px), cnt(r.cnt) + { + if (cnt) { + cnt->add_weak(); + } + } + + local_weak_ptr(local_weak_ptr const &r) noexcept + : px(r.px), cnt(r.cnt) + { + if (cnt) { + cnt->add_weak(); + } + } + local_weak_ptr(local_weak_ptr &&r) noexcept + : px(r.px), cnt(r.cnt) + { + r.px = nullptr; + r.cnt = nullptr; + } + + ~local_weak_ptr() + { + if (cnt) { + if (cnt->release_weak() <= 0 && cnt->shared_count() == 0) { + delete cnt; + } + } + } + + // assignment + local_weak_ptr &operator=(local_weak_ptr const &r) noexcept + { + local_weak_ptr(r).swap(*this); + return *this; + } + local_weak_ptr &operator=(local_shared_ptr<T> const &r) noexcept + { + local_weak_ptr(r).swap(*this); + return *this; + } + + template<class Y, typename std::enable_if< + std::is_convertible<Y *, element_type *>::value, bool>::type = true> + local_weak_ptr &operator=(local_weak_ptr<Y> const &r) noexcept + { + local_weak_ptr(r).swap(*this); + return *this; + } + local_weak_ptr &operator=(local_weak_ptr &&r) noexcept + { + local_weak_ptr(std::move(r)).swap(*this); + return *this; + } + + // modifiers + void swap(local_weak_ptr &r) noexcept + { + std::swap(this->cnt, r.cnt); + std::swap(this->px, r.px); + } + void reset() noexcept + { + local_weak_ptr().swap(*this); + } + + // observers + long use_count() const noexcept + { + if (cnt) { + return cnt->shared_count(); + } + return 0; + } + bool expired() const noexcept + { + if (cnt) { + return cnt->shared_count() == 0; + } + + return true; + } + + local_shared_ptr<T> lock() const noexcept + { + local_shared_ptr<T> tmp; + tmp.cnt = cnt; + + if (cnt) { + cnt->add_shared(); + tmp.px = px; + } + + return tmp; + } + +private: + element_type *px; + detail::ref_cnt *cnt; +}; + + +}// namespace rspamd + +/* Hashing stuff */ +namespace std { +template<class T> +struct hash<rspamd::local_shared_ptr<T>> { + inline auto operator()(const rspamd::local_shared_ptr<T> &p) const -> auto + { + if (!p) { + throw std::logic_error("no hash for dangling pointer"); + } + return hash<T>()(*p.get()); + } +}; +template<class T> +struct hash<rspamd::local_weak_ptr<T>> { + inline auto operator()(const rspamd::local_weak_ptr<T> &p) const -> auto + { + if (!p) { + throw std::logic_error("no hash for dangling pointer"); + } + return hash<T>()(*p.get()); + } +}; + +template<class T> +inline void swap(rspamd::local_shared_ptr<T> &x, rspamd::local_shared_ptr<T> &y) noexcept +{ + x.swap(y); +} + +template<class T> +inline void swap(rspamd::local_weak_ptr<T> &x, rspamd::local_weak_ptr<T> &y) noexcept +{ + x.swap(y); +} + +}// namespace std + +#endif//RSPAMD_LOCAL_SHARED_PTR_HXX diff --git a/src/libutil/cxx/utf8_util.cxx b/src/libutil/cxx/utf8_util.cxx new file mode 100644 index 0000000..5fc83ca --- /dev/null +++ b/src/libutil/cxx/utf8_util.cxx @@ -0,0 +1,421 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define U_CHARSET_IS_UTF8 1 +#include <unicode/utypes.h> +#include <unicode/utf8.h> +#include <unicode/uchar.h> +#include <unicode/normalizer2.h> +#include <unicode/schriter.h> +#include <unicode/coll.h> +#include <unicode/translit.h> +#include <utility> +#include <tuple> +#include <string> +#include <limits> +#include <memory> + +#include "utf8_util.h" +#include "str_util.h" + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +const char * +rspamd_string_unicode_trim_inplace(const char *str, size_t *len) +{ + const auto *p = str, *end = str + *len; + auto i = 0; + + while (i < *len) { + UChar32 uc; + auto prev_i = i; + + U8_NEXT(p, i, *len, uc); + + if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) { + i = prev_i; + break; + } + } + + p += i; + (*len) -= i; + i = end - p; + auto *ret = p; + + if (i > 0) { + + while (i > 0) { + UChar32 uc; + auto prev_i = i; + + U8_PREV(p, 0, i, uc); + + if (!u_isUWhiteSpace(uc) && !IS_ZERO_WIDTH_SPACE(uc)) { + i = prev_i; + break; + } + } + + *len = i; + } + + return ret; +} + +enum rspamd_utf8_normalise_result +rspamd_normalise_unicode_inplace(char *start, size_t *len) +{ + UErrorCode uc_err = U_ZERO_ERROR; + const auto *nfkc_norm = icu::Normalizer2::getNFKCInstance(uc_err); + static icu::UnicodeSet zw_spaces{}; + + if (!zw_spaces.isFrozen()) { + /* Add zw spaces to the set */ + zw_spaces.add(0x200B); + /* TODO: ZW non joiner, it might be used for ligatures, so it should possibly be excluded as well */ + zw_spaces.add(0x200C); + /* See github issue #4290 for explanation. It seems that the ZWJ has many legit use cases */ + //zw_spaces.add(0x200D); + zw_spaces.add(0xFEF); + zw_spaces.add(0x00AD); + zw_spaces.freeze(); + } + + int ret = RSPAMD_UNICODE_NORM_NORMAL; + + g_assert(U_SUCCESS(uc_err)); + + auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, *len)); + auto is_normal = nfkc_norm->quickCheck(uc_string, uc_err); + + if (!U_SUCCESS(uc_err)) { + return RSPAMD_UNICODE_NORM_ERROR; + } + + /* Filter zero width spaces and push resulting string back */ + const auto filter_zw_spaces_and_push_back = [&](const icu::UnicodeString &input) -> size_t { + icu::StringCharacterIterator it{input}; + size_t i = 0; + + while (it.hasNext()) { + /* libicu is very 'special' if it comes to 'safe' macro */ + if (i >= *len) { + ret |= RSPAMD_UNICODE_NORM_ERROR; + break; + } + + auto uc = it.next32PostInc(); + + if (zw_spaces.contains(uc)) { + ret |= RSPAMD_UNICODE_NORM_ZERO_SPACES; + } + else { + UBool err = 0; + + if (uc == 0xFFFD) { + ret |= RSPAMD_UNICODE_NORM_UNNORMAL; + } + U8_APPEND((uint8_t *) start, i, *len, uc, err); + + if (err) { + ret |= RSPAMD_UNICODE_NORM_ERROR; + break; + } + } + } + + return i; + }; + + if (is_normal != UNORM_YES) { + /* Need to normalise */ + ret |= RSPAMD_UNICODE_NORM_UNNORMAL; + + auto normalised = nfkc_norm->normalize(uc_string, uc_err); + + if (!U_SUCCESS(uc_err)) { + return RSPAMD_UNICODE_NORM_ERROR; + } + + *len = filter_zw_spaces_and_push_back(normalised); + } + else { + *len = filter_zw_spaces_and_push_back(uc_string); + } + + return static_cast<enum rspamd_utf8_normalise_result>(ret); +} + +gchar * +rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len) +{ + UErrorCode uc_err = U_ZERO_ERROR; + + static std::unique_ptr<icu::Transliterator> transliterator; + + if (!transliterator) { + UParseError parse_err; + static const auto rules = icu::UnicodeString{":: Any-Latin;" + ":: [:Nonspacing Mark:] Remove;" + ":: [:Punctuation:] Remove;" + ":: [:Symbol:] Remove;" + ":: [:Format:] Remove;" + ":: Latin-ASCII;" + ":: Lower();" + ":: NULL;" + "[:Space Separator:] > ' '"}; + transliterator = std::unique_ptr<icu::Transliterator>( + icu::Transliterator::createFromRules("RspamdTranslit", rules, UTRANS_FORWARD, parse_err, uc_err)); + + if (U_FAILURE(uc_err) || !transliterator) { + auto context = icu::UnicodeString(parse_err.postContext, sizeof(parse_err.preContext) / sizeof(UChar)); + g_error("fatal error: cannot init libicu transliteration engine: %s, line: %d, offset: %d", + u_errorName(uc_err), parse_err.line, parse_err.offset); + abort(); + } + } + + auto uc_string = icu::UnicodeString::fromUTF8(icu::StringPiece(start, len)); + transliterator->transliterate(uc_string); + + // We assume that all characters are now ascii + auto dest_len = uc_string.length(); + gchar *dest = (gchar *) g_malloc(dest_len + 1); + auto sink = icu::CheckedArrayByteSink(dest, dest_len); + uc_string.toUTF8(sink); + + *target_len = sink.NumberOfBytesWritten(); + dest[*target_len] = '\0'; + + return dest; +} + +struct rspamd_icu_collate_storage { + icu::Collator *collator = nullptr; + rspamd_icu_collate_storage() + { + UErrorCode uc_err = U_ZERO_ERROR; + collator = icu::Collator::createInstance(icu::Locale::getEnglish(), uc_err); + + if (U_FAILURE(uc_err) || collator == nullptr) { + g_error("fatal error: cannot init libicu collation engine: %s", + u_errorName(uc_err)); + abort(); + } + /* Ignore all difference except functional */ + collator->setStrength(icu::Collator::PRIMARY); + } + + ~rspamd_icu_collate_storage() + { + if (collator) { + delete collator; + } + } +}; + +static rspamd_icu_collate_storage collate_storage; + +int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2) +{ + if (n1 >= std::numeric_limits<int>::max() || n2 >= std::numeric_limits<int>::max()) { + /* + * It's hard to say what to do here... But libicu wants int, so we fall + * back to g_ascii_strcasecmp which can deal with size_t + */ + if (n1 == n2) { + return g_ascii_strncasecmp(s1, s2, n1); + } + else { + return n1 - n2; + } + } + + UErrorCode success = U_ZERO_ERROR; + auto res = collate_storage.collator->compareUTF8({s1, (int) n1}, {s2, (int) n2}, + success); + + switch (res) { + case UCOL_EQUAL: + return 0; + case UCOL_GREATER: + return 1; + case UCOL_LESS: + default: + return -1; + } +} + +int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n) +{ + return rspamd_utf8_strcmp_sizes(s1, n, s2, n); +} + +TEST_SUITE("utf8 utils") +{ + TEST_CASE("utf8 normalise") + { + std::tuple<const char *, const char *, int> cases[] = { + {"abc", "abc", RSPAMD_UNICODE_NORM_NORMAL}, + {"тест", "тест", RSPAMD_UNICODE_NORM_NORMAL}, + /* Zero width spaces */ + {"\xE2\x80\x8B" + "те" + "\xE2\x80\x8B" + "ст", + "тест", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Special case of diacritic */ + {"13_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", RSPAMD_UNICODE_NORM_UNNORMAL}, + // String containing a non-joiner character + {"س\u200Cت", "ست", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + // String containing a soft hyphen + {"in\u00ADter\u00ADest\u00ADing", "interesting", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + // String with ligature + {"fish", "fish", RSPAMD_UNICODE_NORM_UNNORMAL}, + // String with accented characters and zero-width spaces + {"café\u200Blatté\u200C", "cafélatté", RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Same with zw spaces */ + {"13\u200C_\u0020\u0308\u0301\u038e\u03ab", "13_ ̈́ΎΫ", + RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES}, + /* Buffer overflow case */ + {"u\xC2\xC2\xC2\xC2\xC2\xC2" + "abcdef" + "abcdef", + "u\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD", + RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ERROR}, + // String with a mix of special characters, ligatures, and zero-width spaces + {"fish\u200Bcafé\u200C\u200Dlatté\u200D\u00AD", "fishcafé\u200Dlatté\u200D", RSPAMD_UNICODE_NORM_UNNORMAL | RSPAMD_UNICODE_NORM_ZERO_SPACES}, + // Empty string + {"", "", RSPAMD_UNICODE_NORM_NORMAL}, + }; + + for (const auto &c: cases) { + std::string cpy{std::get<0>(c)}; + auto ns = cpy.size(); + auto res = rspamd_normalise_unicode_inplace(cpy.data(), &ns); + cpy.resize(ns); + CHECK(cpy == std::string(std::get<1>(c))); + CHECK(res == std::get<2>(c)); + } + } + + TEST_CASE("utf8 trim") + { + std::pair<const char *, const char *> cases[] = { + {" \u200B" + "abc ", + "abc"}, + {" ", ""}, + {" a", "a"}, + {"a ", "a"}, + {"a a", "a a"}, + {"abc", "abc"}, + {"a ", "a"}, + {" abc ", "abc"}, + {" abc ", "abc"}, + {" \xE2\x80\x8B" + "a\xE2\x80\x8B" + "bc ", + "a\xE2\x80\x8B" + "bc"}, + {" \xE2\x80\x8B" + "abc\xE2\x80\x8B ", + "abc"}, + {" \xE2\x80\x8B" + "abc \xE2\x80\x8B ", + "abc"}, + }; + + for (const auto &c: cases) { + std::string cpy{c.first}; + auto ns = cpy.size(); + auto *nstart = rspamd_string_unicode_trim_inplace(cpy.data(), &ns); + std::string res{nstart, ns}; + CHECK(res == std::string{c.second}); + } + } + + + TEST_CASE("utf8 strcmp") + { + std::tuple<const char *, const char *, int, int> cases[] = { + {"abc", "abc", -1, 0}, + {"", "", -1, 0}, + {"aBc", "AbC", -1, 0}, + {"abc", "ab", 2, 0}, + {"теСт", "ТесТ", -1, 0}, + {"теСт", "Тезт", 4, 0}, + {"теСт", "Тезт", -1, 1}, + {"abc", "ABD", -1, -1}, + {"\0a\0", "\0a\1", 2, 0}, + {"\0a\0", "\0b\1", 3, -1}, + }; + + for (const auto &c: cases) { + auto [s1, s2, n, expected] = c; + if (n == -1) { + n = MIN(strlen(s1), strlen(s2)); + } + SUBCASE((std::string("test case: ") + s1 + " <=> " + s2).c_str()) + { + auto ret = rspamd_utf8_strcmp(s1, s2, n); + CHECK(ret == expected); + } + } + } + + TEST_CASE("transliterate") + { + using namespace std::literals; + std::tuple<std::string_view, const char *> cases[] = { + {"abc"sv, "abc"}, + {""sv, ""}, + {"тест"sv, "test"}, + // Diacritic to ascii + {"Ύ"sv, "y"}, + // Chinese to pinyin + {"你好"sv, "ni hao"}, + // Japanese to romaji + {"こんにちは"sv, "konnichiha"}, + // Devanagari to latin + {"नमस्ते"sv, "namaste"}, + // Arabic to latin + {"مرحبا"sv, "mrhba"}, + // Remove of punctuation + {"a.b.c"sv, "abc"}, + // Lowercase + {"ABC"sv, "abc"}, + // Remove zero-width spaces + {"\xE2\x80\x8B" + "abc\xE2\x80\x8B" + "def"sv, + "abcdef"}, + }; + + for (const auto &c: cases) { + auto [s1, s2] = c; + SUBCASE((std::string("test case: ") + std::string(s1) + " => " + s2).c_str()) + { + gsize tlen; + auto *ret = rspamd_utf8_transliterate(s1.data(), s1.length(), &tlen); + CHECK(tlen == strlen(s2)); + CHECK(strcmp(s2, ret) == 0); + } + } + } +}
\ No newline at end of file diff --git a/src/libutil/cxx/utf8_util.h b/src/libutil/cxx/utf8_util.h new file mode 100644 index 0000000..044beae --- /dev/null +++ b/src/libutil/cxx/utf8_util.h @@ -0,0 +1,85 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_UTF8_UTIL_H +#define RSPAMD_UTF8_UTIL_H + +#include "config.h" +#include "mem_pool.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Removes all unicode spaces from a string + * @param str start of the string + * @param len length + * @return new length of the string trimmed + */ +const char *rspamd_string_unicode_trim_inplace(const char *str, size_t *len); + +enum rspamd_utf8_normalise_result { + RSPAMD_UNICODE_NORM_NORMAL = 0, + RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0), + RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1), + RSPAMD_UNICODE_NORM_ERROR = (1 << 2), + RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3) +}; + +/** + * Gets a string in UTF8 and normalises it to NFKC_Casefold form + * @param pool optional memory pool used for logging purposes + * @param start + * @param len + * @return TRUE if a string has been normalised + */ +enum rspamd_utf8_normalise_result rspamd_normalise_unicode_inplace(gchar *start, gsize *len); + +/** + * Transliterate a string to ASCII + * @param start + * @param len + * @param target_len + * @return a new string that should be freed with g_free + */ +gchar *rspamd_utf8_transliterate(const gchar *start, gsize len, gsize *target_len); + +/** + * Compare two strings using libicu collator + * @param s1 + * @param s2 + * @param n + * @return an integer greater than, equal to, or less than 0, according as the string s1 is greater than, equal to, or less than the string s2. + */ +int rspamd_utf8_strcmp(const char *s1, const char *s2, gsize n); +/** + * Similar to rspamd_utf8_strcmp but accepts two sizes + * @param s1 + * @param n1 + * @param s2 + * @param n2 + * @return + */ +int rspamd_utf8_strcmp_sizes(const char *s1, gsize n1, const char *s2, gsize n2); + +#ifdef __cplusplus +} +#endif + +#endif//RSPAMD_UTF8_UTIL_H diff --git a/src/libutil/cxx/util.hxx b/src/libutil/cxx/util.hxx new file mode 100644 index 0000000..32ec0b5 --- /dev/null +++ b/src/libutil/cxx/util.hxx @@ -0,0 +1,238 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_UTIL_HXX +#define RSPAMD_UTIL_HXX + +#pragma once + +#include <memory> +#include <array> +#include <string_view> +#include <optional> +#include <tuple> +#include <algorithm> + +/* + * Common C++ utilities + */ + +namespace rspamd { +/* + * Creates std::array from a standard C style array with automatic size calculation + */ +template<typename... Ts> +constexpr auto array_of(Ts &&...t) -> std::array<typename std::decay_t<typename std::common_type_t<Ts...>>, sizeof...(Ts)> +{ + using T = typename std::decay_t<typename std::common_type_t<Ts...>>; + return {{std::forward<T>(t)...}}; +} + +/** + * Find a value in a map + * @tparam C Map type + * @tparam K Key type + * @tparam V Value type + * @param c Map to search + * @param k Key to search + * @return Value if found or std::nullopt otherwise + */ +template<class C, class K, class V = typename C::mapped_type, typename std::enable_if_t<std::is_constructible_v<typename C::key_type, K> && std::is_constructible_v<typename C::mapped_type, V>, bool> = false> +constexpr auto find_map(const C &c, const K &k) -> std::optional<std::reference_wrapper<const V>> +{ + auto f = c.find(k); + + if (f != c.end()) { + return std::cref<V>(f->second); + } + + return std::nullopt; +} + + +template<typename It> +inline constexpr auto make_string_view_from_it(It begin, It end) +{ + using result_type = std::string_view; + + return result_type{((begin != end) ? &*begin : nullptr), + (typename result_type::size_type) std::max(std::distance(begin, end), + (typename result_type::difference_type) 0)}; +} + +/** + * Iterate over lines in a string, newline characters are dropped + * @tparam S + * @tparam F + * @param input + * @param functor + * @return + */ +template<class S, class F, typename std::enable_if_t<std::is_invocable_v<F, std::string_view> && std::is_constructible_v<std::string_view, S>, bool> = true> +inline auto string_foreach_line(const S &input, const F &functor) +{ + auto it = input.begin(); + auto end = input.end(); + + while (it != end) { + auto next = std::find(it, end, '\n'); + while (next >= it && (*next == '\n' || *next == '\r')) { + --next; + } + functor(make_string_view_from_it(it, next)); + it = next; + + if (it != end) { + ++it; + } + } +} + +/** + * Iterate over elements in a string + * @tparam S string type + * @tparam D delimiter type + * @tparam F functor type + * @param input string to iterate + * @param delim delimiter to use + * @param functor functor to call + * @param ignore_empty ignore empty elements + * @return nothing + */ +template<class S, class D, class F, + typename std::enable_if_t<std::is_invocable_v<F, std::string_view> && std::is_constructible_v<std::string_view, S> && std::is_constructible_v<std::string_view, D>, bool> = true> +inline auto string_foreach_delim(const S &input, const D &delim, const F &functor, const bool ignore_empty = true) -> void +{ + size_t first = 0; + auto sv_input = std::string_view{input}; + auto sv_delim = std::string_view{delim}; + + while (first < sv_input.size()) { + const auto second = sv_input.find_first_of(sv_delim, first); + + if (first != second || !ignore_empty) { + functor(sv_input.substr(first, second - first)); + } + + if (second == std::string_view::npos) { + break; + } + + first = second + 1; + } +} + +/** + * Split string on a character + * @tparam S string type + * @param input string to split + * @param chr character to split on + * @return pair of strings + */ +template<class S, typename std::enable_if_t<std::is_constructible_v<std::string_view, S>, bool> = true> +inline auto string_split_on(const S &input, std::string_view::value_type chr) -> std::pair<std::string_view, std::string_view> +{ + auto pos = std::find(std::begin(input), std::end(input), chr); + + if (pos != input.end()) { + auto first = std::string_view{std::begin(input), static_cast<std::size_t>(std::distance(std::begin(input), pos))}; + while (*pos == chr && pos != input.end()) { + ++pos; + } + auto last = std::string_view{pos, static_cast<std::size_t>(std::distance(pos, std::end(input)))}; + + return {first, last}; + } + + return {std::string_view{input}, std::string_view{}}; +} + +/** + * Enumerate for range loop + * @tparam T iterable type + * @tparam TIter iterator type + * @param iterable iterable object + * @return iterator object + */ +template<typename T, + typename TIter = decltype(std::begin(std::declval<T>())), + typename = decltype(std::end(std::declval<T>()))> +constexpr auto enumerate(T &&iterable) +{ + struct iterator { + size_t i; + TIter iter; + bool operator!=(const iterator &other) const + { + return iter != other.iter; + } + void operator++() + { + ++i; + ++iter; + } + auto operator*() const + { + return std::tie(i, *iter); + } + }; + struct iterable_wrapper { + T iterable; + auto begin() + { + return iterator{0, std::begin(iterable)}; + } + auto end() + { + return iterator{0, std::end(iterable)}; + } + }; + return iterable_wrapper{std::forward<T>(iterable)}; +} + +/** + * Allocator that cleans up memory in a secure way on destruction + * @tparam T + */ +template<class T> +class secure_mem_allocator : public std::allocator<T> { +public: + using value_type = typename std::allocator<T>::value_type; + using size_type = typename std::allocator<T>::size_type; + template<class U> + struct rebind { + typedef secure_mem_allocator<U> other; + }; + secure_mem_allocator() noexcept = default; + secure_mem_allocator(const secure_mem_allocator &_) noexcept + : std::allocator<T>(_) + { + } + template<class U> + explicit secure_mem_allocator(const secure_mem_allocator<U> &) noexcept + { + } + + void deallocate(value_type *p, size_type num) noexcept + { + rspamd_explicit_memzero((void *) p, num); + std::allocator<T>::deallocate(p, num); + } +}; + + +}// namespace rspamd + +#endif//RSPAMD_UTIL_HXX diff --git a/src/libutil/cxx/util_tests.cxx b/src/libutil/cxx/util_tests.cxx new file mode 100644 index 0000000..6c3c177 --- /dev/null +++ b/src/libutil/cxx/util_tests.cxx @@ -0,0 +1,82 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util.hxx" + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" +#include <vector> + +using namespace rspamd; +using namespace std::literals::string_view_literals; + +TEST_SUITE("cxx utils") +{ + TEST_CASE("string_split_on") + { + std::tuple<std::string_view, char, std::pair<std::string_view, std::string_view>> cases[] = { + {"test test"sv, ' ', std::pair{"test"sv, "test"sv}}, + {"test test"sv, ' ', std::pair{"test"sv, "test"sv}}, + {"test test "sv, ' ', std::pair{"test"sv, "test "sv}}, + {"testtest "sv, ' ', std::pair{"testtest"sv, ""sv}}, + {" testtest "sv, ' ', std::pair{""sv, "testtest "sv}}, + {"testtest"sv, ' ', std::pair{"testtest"sv, ""sv}}, + {""sv, ' ', std::pair{""sv, ""sv}}, + }; + + for (const auto &c: cases) { + auto res = string_split_on(std::get<0>(c), std::get<1>(c)); + auto expected = std::get<2>(c); + CHECK(res.first == expected.first); + CHECK(res.second == expected.second); + } + } + + TEST_CASE("string_foreach_delim") + { + std::tuple<std::string_view, std::string_view, std::pair<std::vector<std::string_view>, std::vector<std::string_view>>> cases[] = { + {"test"sv, ","sv, {{"test"}, {"test"}}}, + {"test,test"sv, ","sv, {{"test", "test"}, {"test", "test"}}}, + {"test, test"sv, ", "sv, {{"test", "test"}, {"test", "", "test"}}}, + {"test, test,,"sv, ", "sv, {{"test", "test"}, {"test", "", "test", ""}}}, + }; + + for (const auto &c: cases) { + auto res = std::vector<std::string_view>(); + string_foreach_delim(std::get<0>(c), std::get<1>(c), [&](const auto &v) { + res.push_back(v); + }); + + auto compare_vec = []<class T>(const std::vector<T> &v1, const std::vector<T> &v2) { + CHECK(v1.size() == v2.size()); + for (size_t i = 0; i < v1.size(); ++i) { + CHECK(v1[i] == v2[i]); + } + }; + + compare_vec(res, std::get<2>(c).first); + + res.clear(); + // Perform the same test but with no skip empty + string_foreach_delim( + std::get<0>(c), std::get<1>(c), [&](const auto &v) { + res.push_back(v); + }, + false); + compare_vec(res, std::get<2>(c).second); + } + } +}
\ No newline at end of file |