diff options
Diffstat (limited to '')
-rw-r--r-- | src/libserver/hyperscan_tools.cxx | 627 |
1 files changed, 627 insertions, 0 deletions
diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx new file mode 100644 index 0000000..7d1ecf3 --- /dev/null +++ b/src/libserver/hyperscan_tools.cxx @@ -0,0 +1,627 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" + +#ifdef WITH_HYPERSCAN +#include <string> +#include <filesystem> +#include "contrib/ankerl/unordered_dense.h" +#include "contrib/ankerl/svector.h" +#include "fmt/core.h" +#include "libutil/cxx/file_util.hxx" +#include "libutil/cxx/error.hxx" +#include "hs.h" +#include "logger.h" +#include "worker_util.h" +#include "hyperscan_tools.h" + +#include <glob.h> /* for glob */ +#include <unistd.h> /* for unlink */ +#include <optional> +#include <cstdlib> /* for std::getenv */ +#include "unix-std.h" +#include "rspamd_control.h" + +#define HYPERSCAN_LOG_TAG "hsxxxx" + +// Hyperscan does not provide any API to check validity of it's databases +// However, it is required for us to perform migrations properly without +// failing at `hs_alloc_scratch` phase or even `hs_scan` which is **way too late** +// Hence, we have to check hyperscan internal guts to prevent that situation... + +#ifdef HS_MAJOR +#ifndef HS_VERSION_32BIT +#define HS_VERSION_32BIT ((HS_MAJOR << 24) | (HS_MINOR << 16) | (HS_PATCH << 8) | 0) +#endif +#endif// defined(HS_MAJOR) + +#if !defined(HS_DB_VERSION) && defined(HS_VERSION_32BIT) +#define HS_DB_VERSION HS_VERSION_32BIT +#endif + +#ifndef HS_DB_MAGIC +#define HS_DB_MAGIC (0xdbdbdbdbU) +#endif + +#define msg_info_hyperscan(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "hyperscan", HYPERSCAN_LOG_TAG, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_hyperscan_lambda(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "hyperscan", HYPERSCAN_LOG_TAG, \ + log_func, \ + __VA_ARGS__) +#define msg_err_hyperscan(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "hyperscan", HYPERSCAN_LOG_TAG, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_hyperscan(...) rspamd_conditional_debug_fast(nullptr, nullptr, \ + rspamd_hyperscan_log_id, "hyperscan", HYPERSCAN_LOG_TAG, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_hyperscan_lambda(...) rspamd_conditional_debug_fast(nullptr, nullptr, \ + rspamd_hyperscan_log_id, "hyperscan", HYPERSCAN_LOG_TAG, \ + log_func, \ + __VA_ARGS__) + +INIT_LOG_MODULE_PUBLIC(hyperscan) + +namespace rspamd::util { + +/* + * A singleton class that is responsible for deletion of the outdated hyperscan files + * One issue is that it must know about HS files in all workers, which is a problem + * TODO: we need to export hyperscan caches from all workers to a single place where + * we can clean them up (probably, to the main process) + */ +class hs_known_files_cache { +private: + // These fields are filled when we add new known cache files + ankerl::svector<std::string, 4> cache_dirs; + ankerl::svector<std::string, 8> cache_extensions; + ankerl::unordered_dense::set<std::string> known_cached_files; + bool loaded = false; + +private: + hs_known_files_cache() = default; + + virtual ~hs_known_files_cache() + { + // Cleanup cache dir + cleanup_maybe(); + } + +public: + hs_known_files_cache(const hs_known_files_cache &) = delete; + hs_known_files_cache(hs_known_files_cache &&) = delete; + + static auto get() -> hs_known_files_cache & + { + static hs_known_files_cache *singleton = nullptr; + + if (singleton == nullptr) { + singleton = new hs_known_files_cache; + } + + return *singleton; + } + + void add_cached_file(const raii_file &file) + { + auto fpath = std::filesystem::path{file.get_name()}; + std::error_code ec; + + fpath = std::filesystem::canonical(fpath, ec); + + if (ec && ec.value() != 0) { + msg_err_hyperscan("invalid path: \"%s\", error message: %s", fpath.c_str(), ec.message().c_str()); + return; + } + + auto dir = fpath.parent_path(); + auto ext = fpath.extension(); + + if (std::find_if(cache_dirs.begin(), cache_dirs.end(), + [&](const auto &item) { return item == dir; }) == std::end(cache_dirs)) { + cache_dirs.emplace_back(std::string{dir}); + } + if (std::find_if(cache_extensions.begin(), cache_extensions.end(), + [&](const auto &item) { return item == ext; }) == std::end(cache_extensions)) { + cache_extensions.emplace_back(std::string{ext}); + } + + auto is_known = known_cached_files.insert(fpath.string()); + msg_debug_hyperscan("added %s hyperscan file: %s", + is_known.second ? "new" : "already known", + fpath.c_str()); + } + + void add_cached_file(const char *fname) + { + auto fpath = std::filesystem::path{fname}; + std::error_code ec; + + fpath = std::filesystem::canonical(fpath, ec); + + if (ec && ec.value() != 0) { + msg_err_hyperscan("invalid path: \"%s\", error message: %s", fname, ec.message().c_str()); + return; + } + + auto dir = fpath.parent_path(); + auto ext = fpath.extension(); + + if (std::find_if(cache_dirs.begin(), cache_dirs.end(), + [&](const auto &item) { return item == dir; }) == std::end(cache_dirs)) { + cache_dirs.emplace_back(dir.string()); + } + if (std::find_if(cache_extensions.begin(), cache_extensions.end(), + [&](const auto &item) { return item == ext; }) == std::end(cache_extensions)) { + cache_extensions.emplace_back(ext.string()); + } + + auto is_known = known_cached_files.insert(fpath.string()); + msg_debug_hyperscan("added %s hyperscan file: %s", + is_known.second ? "new" : "already known", + fpath.c_str()); + } + + void delete_cached_file(const char *fname) + { + auto fpath = std::filesystem::path{fname}; + std::error_code ec; + + fpath = std::filesystem::canonical(fpath, ec); + + if (ec && ec.value() != 0) { + msg_err_hyperscan("invalid path to remove: \"%s\", error message: %s", + fname, ec.message().c_str()); + return; + } + + if (fpath.empty()) { + msg_err_hyperscan("attempt to remove an empty hyperscan file!"); + return; + } + + if (unlink(fpath.c_str()) == -1) { + msg_err_hyperscan("cannot remove hyperscan file %s: %s", + fpath.c_str(), strerror(errno)); + } + else { + msg_debug_hyperscan("removed hyperscan file %s", fpath.c_str()); + } + + known_cached_files.erase(fpath.string()); + } + + auto cleanup_maybe() -> void + { + auto env_cleanup_disable = std::getenv("RSPAMD_NO_CLEANUP"); + /* We clean dir merely if we are running from the main process */ + if (rspamd_current_worker == nullptr && env_cleanup_disable == nullptr && loaded) { + const auto *log_func = RSPAMD_LOG_FUNC; + auto cleanup_dir = [&](std::string_view dir) -> void { + for (const auto &ext: cache_extensions) { + glob_t globbuf; + + auto glob_pattern = fmt::format("{}{}*{}", + dir, G_DIR_SEPARATOR_S, ext); + msg_debug_hyperscan_lambda("perform glob for pattern: %s", + glob_pattern.c_str()); + memset(&globbuf, 0, sizeof(globbuf)); + + if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) { + for (auto i = 0; i < globbuf.gl_pathc; i++) { + auto path = std::string{globbuf.gl_pathv[i]}; + std::size_t nsz; + struct stat st; + + rspamd_normalize_path_inplace(path.data(), path.size(), &nsz); + path.resize(nsz); + + if (stat(path.c_str(), &st) == -1) { + msg_debug_hyperscan_lambda("cannot stat file %s: %s", + path.c_str(), strerror(errno)); + continue; + } + + if (S_ISREG(st.st_mode)) { + if (!known_cached_files.contains(path)) { + msg_info_hyperscan_lambda("remove stale hyperscan file %s", path.c_str()); + unlink(path.c_str()); + } + else { + msg_debug_hyperscan_lambda("found known hyperscan file %s, size: %Hz", + path.c_str(), st.st_size); + } + } + } + } + + globfree(&globbuf); + } + }; + + for (const auto &dir: cache_dirs) { + msg_info_hyperscan("cleaning up directory %s", dir.c_str()); + cleanup_dir(dir); + } + + cache_dirs.clear(); + cache_extensions.clear(); + known_cached_files.clear(); + } + else if (rspamd_current_worker == nullptr && env_cleanup_disable != nullptr) { + msg_info_hyperscan("disable hyperscan cleanup: env variable RSPAMD_NO_CLEANUP is set"); + } + else if (!loaded) { + msg_info_hyperscan("disable hyperscan cleanup: not loaded"); + } + } + + auto notice_loaded() -> void + { + loaded = true; + } +}; + + +/** + * This is a higher level representation of the cached hyperscan file + */ +struct hs_shared_database { + hs_database_t *db = nullptr; /**< internal database (might be in a shared memory) */ + std::optional<raii_mmaped_file> maybe_map; + std::string cached_path; + + ~hs_shared_database() + { + if (!maybe_map) { + hs_free_database(db); + } + // Otherwise, handled by maybe_map dtor + } + + explicit hs_shared_database(raii_mmaped_file &&map, hs_database_t *db) + : db(db), maybe_map(std::move(map)) + { + cached_path = maybe_map.value().get_file().get_name(); + } + explicit hs_shared_database(hs_database_t *db, const char *fname) + : db(db), maybe_map(std::nullopt) + { + if (fname) { + cached_path = fname; + } + else { + /* Likely a test case */ + cached_path = ""; + } + } + hs_shared_database(const hs_shared_database &other) = delete; + hs_shared_database() = default; + hs_shared_database(hs_shared_database &&other) noexcept + { + *this = std::move(other); + } + hs_shared_database &operator=(hs_shared_database &&other) noexcept + { + std::swap(db, other.db); + std::swap(maybe_map, other.maybe_map); + return *this; + } +}; + +struct real_hs_db { + std::uint32_t magic; + std::uint32_t version; + std::uint32_t length; + std::uint64_t platform; + std::uint32_t crc32; +}; +static auto +hs_is_valid_database(void *raw, std::size_t len, std::string_view fname) -> tl::expected<bool, std::string> +{ + if (len < sizeof(real_hs_db)) { + return tl::make_unexpected(fmt::format("cannot load hyperscan database from {}: too short", fname)); + } + + static real_hs_db test; + + memcpy(&test, raw, sizeof(test)); + + if (test.magic != HS_DB_MAGIC) { + return tl::make_unexpected(fmt::format("cannot load hyperscan database from {}: invalid magic: {} ({} expected)", + fname, test.magic, HS_DB_MAGIC)); + } + +#ifdef HS_DB_VERSION + if (test.version != HS_DB_VERSION) { + return tl::make_unexpected(fmt::format("cannot load hyperscan database from {}: invalid version: {} ({} expected)", + fname, test.version, HS_DB_VERSION)); + } +#endif + + return true; +} + +static auto +hs_shared_from_unserialized(hs_known_files_cache &hs_cache, raii_mmaped_file &&map) -> tl::expected<hs_shared_database, error> +{ + auto ptr = map.get_map(); + auto db = (hs_database_t *) ptr; + + auto is_valid = hs_is_valid_database(map.get_map(), map.get_size(), map.get_file().get_name()); + if (!is_valid) { + return tl::make_unexpected(error{is_valid.error(), -1, error_category::IMPORTANT}); + } + + hs_cache.add_cached_file(map.get_file()); + return tl::expected<hs_shared_database, error>{tl::in_place, std::move(map), db}; +} + +static auto +hs_shared_from_serialized(hs_known_files_cache &hs_cache, raii_mmaped_file &&map, std::int64_t offset) -> tl::expected<hs_shared_database, error> +{ + hs_database_t *target = nullptr; + + if (auto ret = hs_deserialize_database((const char *) map.get_map() + offset, + map.get_size() - offset, &target); + ret != HS_SUCCESS) { + return tl::make_unexpected(error{"cannot deserialize database", ret}); + } + + hs_cache.add_cached_file(map.get_file()); + return tl::expected<hs_shared_database, error>{tl::in_place, target, map.get_file().get_name().data()}; +} + +auto load_cached_hs_file(const char *fname, std::int64_t offset = 0) -> tl::expected<hs_shared_database, error> +{ + auto &hs_cache = hs_known_files_cache::get(); + const auto *log_func = RSPAMD_LOG_FUNC; + + return raii_mmaped_file::mmap_shared(fname, O_RDONLY, PROT_READ, 0) + .and_then([&]<class T>(T &&cached_serialized) -> tl::expected<hs_shared_database, error> { + if (cached_serialized.get_size() <= offset) { + return tl::make_unexpected(error{"Invalid offset", EINVAL, error_category::CRITICAL}); + } +#if defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + auto unserialized_fname = fmt::format("{}.unser", fname); + auto unserialized_file = raii_locked_file::create(unserialized_fname.c_str(), O_CREAT | O_RDWR | O_EXCL, + 00644) + .and_then([&](auto &&new_file_locked) -> tl::expected<raii_file, error> { + auto tmpfile_pattern = fmt::format("{}{}hsmp-XXXXXXXXXXXXXXXXXX", + cached_serialized.get_file().get_dir(), G_DIR_SEPARATOR); + auto tmpfile = raii_locked_file::mkstemp(tmpfile_pattern.data(), O_CREAT | O_RDWR | O_EXCL, + 00644); + + if (!tmpfile) { + return tl::make_unexpected(tmpfile.error()); + } + else { + auto &tmpfile_checked = tmpfile.value(); + // Store owned string + auto tmpfile_name = std::string{tmpfile_checked.get_name()}; + std::size_t unserialized_size; + + if (auto ret = hs_serialized_database_size(((const char *) cached_serialized.get_map()) + offset, + cached_serialized.get_size() - offset, &unserialized_size); + ret != HS_SUCCESS) { + return tl::make_unexpected(error{ + fmt::format("cannot get unserialized database size: {}", ret), + EINVAL, + error_category::IMPORTANT}); + } + + msg_debug_hyperscan_lambda("multipattern: create new database in %s; %Hz size", + tmpfile_name.c_str(), unserialized_size); + void *buf; +#ifdef HAVE_GETPAGESIZE + auto page_size = getpagesize(); +#else + auto page_size = sysconf(_SC_PAGESIZE); +#endif + if (page_size == -1) { + page_size = 4096; + } + auto errcode = posix_memalign(&buf, page_size, unserialized_size); + if (errcode != 0 || buf == nullptr) { + return tl::make_unexpected(error{"Cannot allocate memory", + errno, error_category::CRITICAL}); + } + + if (auto ret = hs_deserialize_database_at(((const char *) cached_serialized.get_map()) + offset, + cached_serialized.get_size() - offset, (hs_database_t *) buf); + ret != HS_SUCCESS) { + return tl::make_unexpected(error{ + fmt::format("cannot deserialize hyperscan database: {}", ret), ret}); + } + else { + if (write(tmpfile_checked.get_fd(), buf, unserialized_size) == -1) { + free(buf); + return tl::make_unexpected(error{fmt::format("cannot write to {}: {}", + tmpfile_name, ::strerror(errno)), + errno, error_category::CRITICAL}); + } + else { + free(buf); + /* + * Unlink target file before renaming to avoid + * race condition. + * So what we have is that `new_file_locked` + * will have flock on that file, so it will be + * replaced after unlink safely, and also unlocked. + */ + (void) unlink(unserialized_fname.c_str()); + if (rename(tmpfile_name.c_str(), + unserialized_fname.c_str()) == -1) { + if (errno != EEXIST) { + msg_info_hyperscan_lambda("cannot rename %s -> %s: %s", + tmpfile_name.c_str(), + unserialized_fname.c_str(), + strerror(errno)); + } + } + else { + /* Unlock file but mark it as immortal first to avoid deletion */ + tmpfile_checked.make_immortal(); + (void) tmpfile_checked.unlock(); + } + } + } + /* Reopen in RO mode */ + return raii_file::open(unserialized_fname.c_str(), O_RDONLY); + }; + }) + .or_else([&](auto unused) -> tl::expected<raii_file, error> { + // Cannot create file, so try to open it in RO mode + return raii_file::open(unserialized_fname.c_str(), O_RDONLY); + }); + + tl::expected<hs_shared_database, error> ret; + + if (unserialized_file.has_value()) { + + auto &unserialized_checked = unserialized_file.value(); + + if (unserialized_checked.get_size() == 0) { + /* + * This is a case when we have a file that is currently + * being created by another process. + * We cannot use it! + */ + ret = hs_shared_from_serialized(hs_cache, std::forward<T>(cached_serialized), offset); + } + else { + ret = raii_mmaped_file::mmap_shared(std::move(unserialized_checked), PROT_READ) + .and_then([&]<class U>(U &&mmapped_unserialized) -> auto { + return hs_shared_from_unserialized(hs_cache, std::forward<U>(mmapped_unserialized)); + }); + } + } + else { + ret = hs_shared_from_serialized(hs_cache, std::forward<T>(cached_serialized), offset); + } +#else // defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + auto ret = hs_shared_from_serialized(hs_cache, std::forward<T>(cached_serialized), offset); +#endif// defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 \ + // Add serialized file to cache merely if we have successfully loaded the actual db + if (ret.has_value()) { + hs_cache.add_cached_file(cached_serialized.get_file()); + } + return ret; + }); +} +}// namespace rspamd::util + +/* C API */ + +#define CXX_DB_FROM_C(obj) (reinterpret_cast<rspamd::util::hs_shared_database *>(obj)) +#define C_DB_FROM_CXX(obj) (reinterpret_cast<rspamd_hyperscan_t *>(obj)) + +rspamd_hyperscan_t * +rspamd_hyperscan_maybe_load(const char *filename, goffset offset) +{ + auto maybe_db = rspamd::util::load_cached_hs_file(filename, offset); + + if (maybe_db.has_value()) { + auto *ndb = new rspamd::util::hs_shared_database; + *ndb = std::move(maybe_db.value()); + return C_DB_FROM_CXX(ndb); + } + else { + auto error = maybe_db.error(); + + switch (error.category) { + case rspamd::util::error_category::CRITICAL: + msg_err_hyperscan("critical error when trying to load cached hyperscan: %s", + error.error_message.data()); + break; + case rspamd::util::error_category::IMPORTANT: + msg_info_hyperscan("error when trying to load cached hyperscan: %s", + error.error_message.data()); + break; + default: + msg_debug_hyperscan("error when trying to load cached hyperscan: %s", + error.error_message.data()); + break; + } + } + + return nullptr; +} + +hs_database_t * +rspamd_hyperscan_get_database(rspamd_hyperscan_t *db) +{ + auto *real_db = CXX_DB_FROM_C(db); + return real_db->db; +} + +rspamd_hyperscan_t * +rspamd_hyperscan_from_raw_db(hs_database_t *db, const char *fname) +{ + auto *ndb = new rspamd::util::hs_shared_database{db, fname}; + + return C_DB_FROM_CXX(ndb); +} + +void rspamd_hyperscan_free(rspamd_hyperscan_t *db, bool invalid) +{ + auto *real_db = CXX_DB_FROM_C(db); + + if (invalid && !real_db->cached_path.empty()) { + rspamd::util::hs_known_files_cache::get().delete_cached_file(real_db->cached_path.c_str()); + } + delete real_db; +} + +void rspamd_hyperscan_notice_known(const char *fname) +{ + rspamd::util::hs_known_files_cache::get().add_cached_file(fname); + + if (rspamd_current_worker != nullptr) { + /* Also notify main process */ + struct rspamd_srv_command notice_cmd; + + if (strlen(fname) >= sizeof(notice_cmd.cmd.hyperscan_cache_file.path)) { + msg_err("internal error: length of the filename %d ('%s') is larger than control buffer path: %d", + (int) strlen(fname), fname, (int) sizeof(notice_cmd.cmd.hyperscan_cache_file.path)); + } + else { + notice_cmd.type = RSPAMD_SRV_NOTICE_HYPERSCAN_CACHE; + rspamd_strlcpy(notice_cmd.cmd.hyperscan_cache_file.path, fname, sizeof(notice_cmd.cmd.hyperscan_cache_file.path)); + rspamd_srv_send_command(rspamd_current_worker, + rspamd_current_worker->srv->event_loop, ¬ice_cmd, -1, + nullptr, + nullptr); + } + } +} + +void rspamd_hyperscan_cleanup_maybe(void) +{ + rspamd::util::hs_known_files_cache::get().cleanup_maybe(); +} + +void rspamd_hyperscan_notice_loaded(void) +{ + rspamd::util::hs_known_files_cache::get().notice_loaded(); +} + +#endif// WITH_HYPERSCAN
\ No newline at end of file |