diff options
Diffstat (limited to '')
122 files changed, 79613 insertions, 0 deletions
diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt
new file mode 100644
index 0000000..dd17865
--- /dev/null
+++ b/src/libserver/CMakeLists.txt
@@ -0,0 +1,52 @@
+# Librspamdserver
+ADD_SUBDIRECTORY(css)
+SET(LIBRSPAMDSERVERSRC
+		${CMAKE_CURRENT_SOURCE_DIR}/cfg_utils.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/cfg_rcl.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/composites/composites.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/composites/composites_manager.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/dkim.c
+		${CMAKE_CURRENT_SOURCE_DIR}/dns.c
+		${CMAKE_CURRENT_SOURCE_DIR}/dynamic_cfg.c
+		${CMAKE_CURRENT_SOURCE_DIR}/async_session.c
+		${CMAKE_CURRENT_SOURCE_DIR}/fuzzy_backend/fuzzy_backend.c
+		${CMAKE_CURRENT_SOURCE_DIR}/fuzzy_backend/fuzzy_backend_sqlite.c
+		${CMAKE_CURRENT_SOURCE_DIR}/fuzzy_backend/fuzzy_backend_redis.c
+		${CMAKE_CURRENT_SOURCE_DIR}/milter.c
+		${CMAKE_CURRENT_SOURCE_DIR}/monitored.c
+		${CMAKE_CURRENT_SOURCE_DIR}/protocol.c
+		${CMAKE_CURRENT_SOURCE_DIR}/re_cache.c
+		${CMAKE_CURRENT_SOURCE_DIR}/redis_pool.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/roll_history.c
+		${CMAKE_CURRENT_SOURCE_DIR}/spf.c
+		${CMAKE_CURRENT_SOURCE_DIR}/ssl_util.c
+		${CMAKE_CURRENT_SOURCE_DIR}/symcache/symcache_impl.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/symcache/symcache_item.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/symcache/symcache_runtime.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/symcache/symcache_c.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/task.c
+		${CMAKE_CURRENT_SOURCE_DIR}/url.c
+		${CMAKE_CURRENT_SOURCE_DIR}/worker_util.c
+		${CMAKE_CURRENT_SOURCE_DIR}/logger/logger.c
+		${CMAKE_CURRENT_SOURCE_DIR}/logger/logger_file.c
+		${CMAKE_CURRENT_SOURCE_DIR}/logger/logger_syslog.c
+		${CMAKE_CURRENT_SOURCE_DIR}/logger/logger_console.c
+		${CMAKE_CURRENT_SOURCE_DIR}/http/http_util.c
+		${CMAKE_CURRENT_SOURCE_DIR}/http/http_message.c
+		${CMAKE_CURRENT_SOURCE_DIR}/http/http_connection.c
+		${CMAKE_CURRENT_SOURCE_DIR}/http/http_router.c
+		${CMAKE_CURRENT_SOURCE_DIR}/http/http_context.c
+		${CMAKE_CURRENT_SOURCE_DIR}/maps/map.c
+		${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
+		${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/html/html_tests.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/hyperscan_tools.cxx
+		${CMAKE_CURRENT_SOURCE_DIR}/backtrace.cxx
+		${LIBCSSSRC})
+
+# Librspamd-server
+SET(RSPAMD_SERVER ${LIBRSPAMDSERVERSRC} PARENT_SCOPE)
+SET(LIBSERVER_DEPENDS "${LIBCSS_DEPENDS}" PARENT_SCOPE)
+SET(LIBSERVER_GENERATED "${LIBCSS_GENERATED}" PARENT_SCOPE)
diff --git a/src/libserver/async_session.c b/src/libserver/async_session.c
new file mode 100644
index 0000000..baaee62
--- /dev/null
+++ b/src/libserver/async_session.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "rspamd.h"
+#include "contrib/uthash/utlist.h"
+#include "contrib/libucl/khash.h"
+#include "async_session.h"
+#include "cryptobox.h"
+
+#define RSPAMD_SESSION_FLAG_DESTROYING (1 << 1)
+#define RSPAMD_SESSION_FLAG_CLEANUP (1 << 2)
+
+#define RSPAMD_SESSION_CAN_ADD_EVENT(s) (!((s)->flags & (RSPAMD_SESSION_FLAG_DESTROYING | RSPAMD_SESSION_FLAG_CLEANUP)))
+
+#define msg_err_session(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
+		"events", session->pool->tag.uid, \
+		G_STRFUNC, \
+		__VA_ARGS__)
+#define msg_warn_session(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
+		"events", session->pool->tag.uid, \
+		G_STRFUNC, \
+		__VA_ARGS__)
+#define msg_info_session(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
+		"events", session->pool->tag.uid, \
+		G_STRFUNC, \
+		__VA_ARGS__)
+#define msg_debug_session(...) rspamd_conditional_debug_fast(NULL, NULL, \
+		rspamd_events_log_id, "events", session->pool->tag.uid, \
+		G_STRFUNC, \
+		__VA_ARGS__)
+
+INIT_LOG_MODULE(events)
+
+/* Moving average of the events table bucket count, used to presize new tables */
+static struct rspamd_counter_data events_count;
+
+
+/**
+ * A single pending event. Two events are the same set member when both
+ * their finalizer and user data pointers match (see rspamd_event_equal).
+ */
+struct rspamd_async_event {
+	const gchar *subsystem;    /* label of the owning subsystem, only logged */
+	const gchar *event_source; /* caller-supplied origin string, only logged */
+	event_finalizer_t fin;     /* finalizer; NULL marks an uncancellable event */
+	void *user_data;           /* argument passed to fin */
+};
+
+/* Set equality: same finalizer and same user data */
+static inline bool
+rspamd_event_equal(const struct rspamd_async_event *ev1, const struct rspamd_async_event *ev2)
+{
+	return ev1->fin == ev2->fin && ev1->user_data == ev2->user_data;
+}
+
+/* Hash over exactly the two fields that rspamd_event_equal compares */
+static inline guint64
+rspamd_event_hash(const struct rspamd_async_event *ev)
+{
+	union _pointer_fp_thunk {
+		event_finalizer_t f; /* stored through the union, so no fn-pointer cast is needed */
+		gpointer p;
+	};
+	struct ev_storage {
+		union _pointer_fp_thunk p;
+		gpointer ud;
+	} st;
+
+	st.p.f = ev->fin;
+	st.ud = ev->user_data;
+
+	return rspamd_cryptobox_fast_hash(&st, sizeof(st), rspamd_hash_seed());
+}
+
+/* Define **SET** of events */
+KHASH_INIT(rspamd_events_hash,
+		struct rspamd_async_event *,
+		char,
+		false,
+		rspamd_event_hash,
+		rspamd_event_equal);
+
+/* Session state: callbacks, the set of pending events and status flags */
+struct rspamd_async_session {
+	session_finalizer_t fin;              /* called when no events are pending */
+	event_finalizer_t restore;            /* called when fin returns FALSE */
+	event_finalizer_t cleanup;            /* called from rspamd_session_destroy */
+	khash_t(rspamd_events_hash) * events; /* pending events */
+	void *user_data;                      /* argument for fin/restore/cleanup */
+	rspamd_mempool_t *pool;               /* owner pool, also provides the log tag */
+	guint flags;                          /* RSPAMD_SESSION_FLAG_* bits */
+};
+
+/* Pool destructor: feeds the table size into events_count and frees the events set */
+static void
+rspamd_session_dtor(gpointer d)
+{
+	struct rspamd_async_session *s = (struct rspamd_async_session *) d;
+
+	/* Events are usually empty at this point */
+	rspamd_set_counter_ema(&events_count, s->events->n_buckets, 0.5);
+	kh_destroy(rspamd_events_hash, s->events);
+}
+
+/**
+ * Creates a session in `pool`. The events set is presized from the moving
+ * average in events_count and is released by a destructor registered on `pool`.
+ */
+struct rspamd_async_session *
+rspamd_session_create(rspamd_mempool_t *pool,
+		session_finalizer_t fin,
+		event_finalizer_t restore,
+		event_finalizer_t cleanup,
+		void *user_data)
+{
+	struct rspamd_async_session *s;
+
+	s = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_async_session));
+	s->pool = pool;
+	s->fin = fin;
+	s->restore = restore;
+	s->cleanup = cleanup;
+	s->user_data = user_data;
+	s->events = kh_init(rspamd_events_hash);
+
+	kh_resize(rspamd_events_hash, s->events, MAX(4, events_count.mean));
+	rspamd_mempool_add_destructor(pool, rspamd_session_dtor, s);
+
+	return s;
+}
+
+/**
+ * Adds a (fin, user_data) event to the session and returns it.
+ * Returns NULL without adding anything while the session is destroying or
+ * cleaning up. A NULL session is fatal; the insertion is asserted to be new.
+ */
+struct rspamd_async_event *
+rspamd_session_add_event_full(struct rspamd_async_session *session,
+		event_finalizer_t fin,
+		gpointer user_data,
+		const gchar *subsystem,
+		const gchar *event_source)
+{
+	struct rspamd_async_event *new_event;
+	gint ret;
+
+	if (session == NULL) {
+		msg_err("session is NULL");
+		g_assert_not_reached();
+	}
+
+	if (!RSPAMD_SESSION_CAN_ADD_EVENT(session)) {
+		msg_debug_session("skip adding event subsystem: %s: "
+						  "session is destroying/cleaning",
+				subsystem);
+
+		return NULL;
+	}
+
+	new_event = rspamd_mempool_alloc(session->pool,
+			sizeof(struct rspamd_async_event));
+	new_event->fin = fin;
+	new_event->user_data = user_data;
+	new_event->subsystem = subsystem;
+	new_event->event_source = event_source;
+
+	msg_debug_session("added event: %p, pending %d (+1) events, "
+					  "subsystem: %s (%s)",
+			user_data,
+			(int) kh_size(session->events),
+			subsystem,
+			event_source);
+
+	kh_put(rspamd_events_hash, session->events, new_event, &ret);
+	g_assert(ret > 0);
+
+	return new_event;
+}
+
+/**
+ * Removes the event identified by (fin, ud), calls `fin(ud)` when `fin` is not
+ * NULL, then runs rspamd_session_pending() so the session can finish.
+ * Does nothing while the session is destroying or cleaning up.
+ * An event that cannot be found is logged together with all known events and aborts.
+ */
+void rspamd_session_remove_event_full(struct rspamd_async_session *session,
+		event_finalizer_t fin,
+		void *ud,
+		const gchar *event_source)
+{
+	struct rspamd_async_event search_ev, *found_ev;
+	khiter_t k;
+
+	if (session == NULL) {
+		msg_err("session is NULL");
+		return;
+	}
+
+	if (!RSPAMD_SESSION_CAN_ADD_EVENT(session)) {
+		/* Session is already cleaned up, ignore this */
+		return;
+	}
+
+	/* Search for event; only fin and user_data take part in hashing/equality */
+	search_ev.fin = fin;
+	search_ev.user_data = ud;
+	k = kh_get(rspamd_events_hash, session->events, &search_ev);
+	if (k == kh_end(session->events)) {
+
+		msg_err_session("cannot find event: %p(%p) from %s (%d total events)", fin, ud,
+				event_source, (int) kh_size(session->events));
+		kh_foreach_key(session->events, found_ev, {
+			msg_err_session("existing event %s (%s): %p(%p)",
+					found_ev->subsystem,
+					found_ev->event_source,
+					found_ev->fin,
+					found_ev->user_data);
+		});
+
+		g_assert_not_reached();
+	}
+
+	found_ev = kh_key(session->events, k);
+	msg_debug_session("removed event: %p, pending %d (-1) events, "
+					  "subsystem: %s (%s), added at %s",
+			ud,
+			(int) kh_size(session->events),
+			found_ev->subsystem,
+			event_source,
+			found_ev->event_source);
+	kh_del(rspamd_events_hash, session->events, k);
+
+	/* The event is already out of the set, now run its finalizer */
+	if (fin) {
+		fin(ud);
+	}
+
+	rspamd_session_pending(session);
+}
+
+/**
+ * Marks the session as destroying, finalises its events through
+ * rspamd_session_cleanup() and calls the `cleanup` callback. Nothing is done
+ * when the session is already blocked. Returns FALSE only for a NULL session.
+ */
+gboolean
+rspamd_session_destroy(struct rspamd_async_session *session)
+{
+	if (session == NULL) {
+		msg_err("session is NULL");
+		return FALSE;
+	}
+
+	if (!rspamd_session_blocked(session)) {
+		session->flags |= RSPAMD_SESSION_FLAG_DESTROYING;
+		rspamd_session_cleanup(session, false);
+
+		if (session->cleanup != NULL) {
+			session->cleanup(session->user_data);
+		}
+	}
+
+	return TRUE;
+}
+
+/**
+ * Calls the finalizer of every pending event and drops those events.
+ * Events without a finalizer are kept: they are moved into a fresh set that
+ * replaces the session's one. `forced_cleanup` only raises the log level of
+ * the messages from debug to info. The CLEANUP flag is set for the duration.
+ */
+void rspamd_session_cleanup(struct rspamd_async_session *session, bool forced_cleanup)
+{
+	struct rspamd_async_event *ev;
+
+	if (session == NULL) {
+		msg_err("session is NULL");
+		return;
+	}
+
+	session->flags |= RSPAMD_SESSION_FLAG_CLEANUP;
+	khash_t(rspamd_events_hash) *uncancellable_events = kh_init(rspamd_events_hash);
+
+	kh_foreach_key(session->events, ev, {
+		/* Call event's finalizer */
+		int ret;
+
+		if (ev->fin != NULL) {
+			if (forced_cleanup) {
+				msg_info_session("forced removed event on destroy: %p, subsystem: %s, scheduled from: %s",
+						ev->user_data,
+						ev->subsystem,
+						ev->event_source);
+			}
+			else {
+				msg_debug_session("removed event on destroy: %p, subsystem: %s",
+						ev->user_data,
+						ev->subsystem);
+			}
+			ev->fin(ev->user_data);
+		}
+		else {
+			if (forced_cleanup) {
+				msg_info_session("NOT forced removed event on destroy - uncancellable: "
+								 "%p, subsystem: %s, scheduled from: %s",
+						ev->user_data,
+						ev->subsystem,
+						ev->event_source);
+			}
+			else {
+				msg_debug_session("NOT removed event on destroy - uncancellable: %p, subsystem: %s",
+						ev->user_data,
+						ev->subsystem);
+			}
+			/* Assume an event is uncancellable, move it to a new hash table */
+			kh_put(rspamd_events_hash, uncancellable_events, ev, &ret);
+		}
+	});
+
+	kh_destroy(rspamd_events_hash, session->events);
+	session->events = uncancellable_events;
+	if (forced_cleanup) {
+		msg_info_session("pending %d uncancellable events", (int) kh_size(uncancellable_events));
+	}
+	else {
+		msg_debug_session("pending %d uncancellable events", (int) kh_size(uncancellable_events));
+	}
+
+	session->flags &= ~RSPAMD_SESSION_FLAG_CLEANUP;
+}
+
+/**
+ * Returns TRUE while events are pending. Otherwise calls the session `fin`
+ * handler (and `restore` when `fin` reports an incomplete session) and
+ * returns FALSE, whatever `fin` returned.
+ */
+gboolean
+rspamd_session_pending(struct rspamd_async_session *session)
+{
+	g_assert(session != NULL);
+
+	if (kh_size(session->events) != 0) {
+		return TRUE;
+	}
+
+	if (session->fin != NULL) {
+		msg_debug_session("call fin handler, as no events are pending");
+
+		if (!session->fin(session->user_data)) {
+			/* Session finished incompletely, perform restoration */
+			msg_debug_session("restore incomplete session");
+			if (session->restore != NULL) {
+				session->restore(session->user_data);
+			}
+		}
+	}
+
+	return FALSE;
+}
+
+/* Returns (and logs at debug level) the number of events currently in the set */
+guint rspamd_session_events_pending(struct rspamd_async_session *session)
+{
+	guint npending;
+
+	g_assert(session != NULL);
+
+	npending = kh_size(session->events);
+	msg_debug_session("pending %d events", npending);
+
+	return npending;
+}
+
+/* Returns the memory pool the session was created with */
+rspamd_mempool_t *
+rspamd_session_mempool(struct rspamd_async_session *session)
+{
+	g_assert(session != NULL);
+
+	return session->pool;
+}
+
+/* TRUE while the DESTROYING or CLEANUP flag is set, i.e. no events can be added */
+gboolean
+rspamd_session_blocked(struct rspamd_async_session *session)
+{
+	g_assert(session != NULL);
+
+	return !RSPAMD_SESSION_CAN_ADD_EVENT(session);
+}
\ No newline at end of file
diff --git a/src/libserver/async_session.h b/src/libserver/async_session.h
new file mode 100644
index 0000000..4573545
--- /dev/null
+++ b/src/libserver/async_session.h
@@ -0,0 +1,128 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RSPAMD_ASYNC_SESSION_H
+#define RSPAMD_ASYNC_SESSION_H
+
+#include "config.h"
+#include "mem_pool.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_async_event;
+struct rspamd_async_session;
+
+typedef void (*event_finalizer_t)(gpointer ud);
+
+typedef gboolean (*session_finalizer_t)(gpointer user_data);
+
+/**
+ * Make new async session
+ * @param pool pool to alloc memory from
+ * @param fin a callback called when no events are found in session
+ * @param restore a callback is called to restore processing of session
+ * @param cleanup a callback called when session is forcefully destroyed
+ * @param user_data abstract user data
+ * @return new session object
+ */
+struct rspamd_async_session *rspamd_session_create(rspamd_mempool_t *pool,
+		session_finalizer_t fin, event_finalizer_t restore,
+		event_finalizer_t cleanup, gpointer user_data);
+
+/**
+ * Insert new event to the session
+ * @param session session object
+ * @param fin finalizer callback
+ * @param user_data abstract user_data
+ * @param subsystem name of the subsystem that adds the event (used in logs)
+ * @param event_source origin of the call (used in logs), see rspamd_session_add_event
+ * @return new event or NULL if the session is being destroyed or cleaned up
+ */
+struct rspamd_async_event *
+rspamd_session_add_event_full(struct rspamd_async_session *session,
+		event_finalizer_t fin,
+		gpointer user_data,
+		const gchar *subsystem,
+		const gchar *event_source);
+
+#define rspamd_session_add_event(session, fin, user_data, subsystem) \
+	rspamd_session_add_event_full(session, fin, user_data, subsystem, G_STRLOC)
+
+/**
+ * Remove normal event
+ * @param session session object
+ * @param fin final callback
+ * @param ud user data object
+ * @param event_source origin of the call (used in logs), see rspamd_session_remove_event
+ */
+void rspamd_session_remove_event_full(struct rspamd_async_session *session,
+		event_finalizer_t fin,
+		gpointer ud,
+		const gchar *event_source);
+
+#define rspamd_session_remove_event(session, fin, user_data) \
+	rspamd_session_remove_event_full(session, fin, user_data, G_STRLOC)
+
+/**
+ * Must be called at the end of session, it calls fin functions for all pending events
+ * that have one and then the cleanup callback of the session
+ * @param session session object
+ * @return FALSE if session is NULL, TRUE otherwise
+ */
+gboolean rspamd_session_destroy(struct rspamd_async_session *session);
+
+/**
+ * Try to remove all events pending; events without a finalizer stay in the session
+ * @param session session object
+ * @param forced_cleanup if true, removals are logged at info level instead of debug
+ */
+void rspamd_session_cleanup(struct rspamd_async_session *session, bool forced_cleanup);
+
+/**
+ * Returns mempool associated with async session
+ * @param session session object
+ * @return memory pool passed to rspamd_session_create
+ */
+rspamd_mempool_t *rspamd_session_mempool(struct rspamd_async_session *session);
+
+/**
+ * Check session for events pending and call fin callback if no events are pending
+ * @param session session object
+ * @return TRUE if session has pending events
+ */
+gboolean rspamd_session_pending(struct rspamd_async_session *session);
+
+/**
+ * Returns number of events pending
+ * @param session session object
+ * @return number of events currently registered in the session
+ */
+guint rspamd_session_events_pending(struct rspamd_async_session *session);
+
+
+/**
+ * Returns TRUE if an async session is currently destroying or cleaning up
+ * @param s session object
+ * @return TRUE if no new events can be added to the session
+ */
+gboolean rspamd_session_blocked(struct rspamd_async_session *s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*RSPAMD_ASYNC_SESSION_H*/
diff --git a/src/libserver/backtrace.cxx b/src/libserver/backtrace.cxx
new file mode 100644
index 0000000..6507b96
--- /dev/null
+++ b/src/libserver/backtrace.cxx
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+
+#ifdef BACKWARD_ENABLE
+
+#include "contrib/backward-cpp/backward.hpp"
+#include "fmt/core.h"
+#include "logger.h"
+
+namespace rspamd {
+
+/**
+ * Loads up to 128 frames of the current stack and logs one msg_err line per frame
+ */
+void log_backtrace(void)
+{
+	using namespace backward;
+	StackTrace st;
+	st.load_here(128);
+
+	TraceResolver tr;
+	tr.load_stacktrace(st);
+
+	for (auto i = 0ul; i < st.size(); ++i) {
+		auto trace = tr.resolve(st[i]);
+		auto trace_line = fmt::format("#{}: [{}]: ", i, trace.addr);
+
+		/* Prefer source file/line when it was resolved, else fall back to the object file */
+		if (!trace.source.filename.empty()) {
+			trace_line += fmt::format("{}:{} in {}", trace.source.filename, trace.source.line, trace.source.function);
+		}
+		else {
+			trace_line += fmt::format("{} in {}", trace.object_filename, trace.object_function);
+		}
+
+		msg_err("%s", trace_line.c_str());
+	}
+}
+
+}// namespace rspamd
+#endif
+
+extern "C" void rspamd_print_crash(void);
+
+void rspamd_print_crash(void)
+{
+#ifdef BACKWARD_ENABLE
+	rspamd::log_backtrace();
+#endif
+}
\ No newline at end of file diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h new file mode 100644 index 0000000..4cb87d9 --- /dev/null +++ b/src/libserver/cfg_file.h @@ -0,0 +1,889 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CFG_FILE_H +#define CFG_FILE_H + +#include "config.h" +#include "mem_pool.h" +#include "upstream.h" +#include "rspamd_symcache.h" +#include "cfg_rcl.h" +#include "ucl.h" +#include "regexp.h" +#include "libserver/re_cache.h" +#include "libutil/ref.h" +#include "libutil/radix.h" +#include "monitored.h" +#include "redis_pool.h" + +#define DEFAULT_BIND_PORT 11333 +#define DEFAULT_CONTROL_PORT 11334 + +/* Default metric name */ +#define DEFAULT_METRIC "default" + +#ifdef __cplusplus +extern "C" { +#endif + +struct expression; +struct tokenizer; +struct rspamd_stat_classifier; +struct module_s; +struct worker_s; +struct rspamd_external_libs_ctx; +struct rspamd_cryptobox_pubkey; +struct rspamd_dns_resolver; + +/** + * Logging type + */ +enum rspamd_log_type { + RSPAMD_LOG_CONSOLE, + RSPAMD_LOG_SYSLOG, + RSPAMD_LOG_FILE +}; + +enum rspamd_log_cfg_flags { + RSPAMD_LOG_FLAG_DEFAULT = 0u, + RSPAMD_LOG_FLAG_SYSTEMD = (1u << 0u), + RSPAMD_LOG_FLAG_COLOR = (1u << 1u), + RSPAMD_LOG_FLAG_RE_CACHE = (1u << 2u), + RSPAMD_LOG_FLAG_USEC = (1u << 3u), + RSPAMD_LOG_FLAG_RSPAMADM = (1u << 4u), + RSPAMD_LOG_FLAG_ENFORCED = (1u << 5u), + RSPAMD_LOG_FLAG_SEVERITY = (1u << 6u), + 
RSPAMD_LOG_FLAG_JSON = (1u << 7u), +}; + +struct rspamd_worker_log_pipe { + gint fd; + gint type; + struct rspamd_worker_log_pipe *prev, *next; +}; + +/** + * script module list item + */ +struct script_module { + gchar *name; /**< name of module */ + gchar *path; /**< path to module */ + gchar *digest; +}; + +enum rspamd_symbol_group_flags { + RSPAMD_SYMBOL_GROUP_NORMAL = 0u, + RSPAMD_SYMBOL_GROUP_DISABLED = (1u << 0u), + RSPAMD_SYMBOL_GROUP_ONE_SHOT = (1u << 1u), + RSPAMD_SYMBOL_GROUP_UNGROUPED = (1u << 2u), + RSPAMD_SYMBOL_GROUP_PUBLIC = (1u << 3u), +}; + +/** + * Symbols group + */ +struct rspamd_symbol; +struct rspamd_symbols_group { + gchar *name; + gchar *description; + GHashTable *symbols; + gdouble max_score; + guint flags; +}; + +enum rspamd_symbol_flags { + RSPAMD_SYMBOL_FLAG_NORMAL = 0, + RSPAMD_SYMBOL_FLAG_IGNORE_METRIC = (1 << 1), + RSPAMD_SYMBOL_FLAG_ONEPARAM = (1 << 2), + RSPAMD_SYMBOL_FLAG_UNGROUPED = (1 << 3), + RSPAMD_SYMBOL_FLAG_DISABLED = (1 << 4), + RSPAMD_SYMBOL_FLAG_UNSCORED = (1 << 5), +}; + +/** + * Symbol config definition + */ +struct rspamd_symbol { + gchar *name; + gchar *description; + gdouble *weight_ptr; + gdouble score; + guint priority; + struct rspamd_symbols_group *gr; /* Main group */ + GPtrArray *groups; /* Other groups */ + guint flags; + void *cache_item; + gint nshots; +}; + +/** + * Statfile config definition + */ +struct rspamd_statfile_config { + gchar *symbol; /**< symbol of statfile */ + gchar *label; /**< label of this statfile */ + ucl_object_t *opts; /**< other options */ + gboolean is_spam; /**< spam flag */ + struct rspamd_classifier_config *clcf; /**< parent pointer of classifier configuration */ + gpointer data; /**< opaque data */ +}; + +struct rspamd_tokenizer_config { + const ucl_object_t *opts; /**< other options */ + const gchar *name; /**< name of tokenizer */ +}; + + +/* Classifier has all integer values (e.g. 
bayes) */ +#define RSPAMD_FLAG_CLASSIFIER_INTEGER (1 << 0) +/* + * Set if backend for a classifier is intended to increment and not set values + * (e.g. redis) + */ +#define RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND (1 << 1) +/* + * No backend required for classifier + */ +#define RSPAMD_FLAG_CLASSIFIER_NO_BACKEND (1 << 2) + +/** + * Classifier config definition + */ +struct rspamd_classifier_config { + GList *statfiles; /**< statfiles list */ + GHashTable *labels; /**< statfiles with labels */ + gchar *metric; /**< metric of this classifier */ + gchar *classifier; /**< classifier interface */ + struct rspamd_tokenizer_config *tokenizer; /**< tokenizer used for classifier */ + const gchar *backend; /**< name of statfile's backend */ + ucl_object_t *opts; /**< other options */ + GList *learn_conditions; /**< list of learn condition callbacks */ + GList *classify_conditions; /**< list of classify condition callbacks */ + gchar *name; /**< unique name of classifier */ + guint32 min_tokens; /**< minimal number of tokens to process classifier */ + guint32 max_tokens; /**< maximum number of tokens */ + guint min_token_hits; /**< minimum number of hits for a token to be considered */ + gdouble min_prob_strength; /**< use only tokens with probability in [0.5 - MPS, 0.5 + MPS] */ + guint min_learns; /**< minimum number of learns for each statfile */ + guint flags; +}; + +struct rspamd_worker_bind_conf { + GPtrArray *addrs; + guint cnt; + gchar *name; + gchar *bind_line; + gboolean is_systemd; + struct rspamd_worker_bind_conf *next; +}; + +struct rspamd_worker_lua_script { + gint cbref; + struct rspamd_worker_lua_script *prev, *next; +}; + +/** + * Config params for rspamd worker + */ +struct rspamd_worker_conf { + struct worker_s *worker; /**< pointer to worker type */ + GQuark type; /**< type of worker */ + struct rspamd_worker_bind_conf *bind_conf; /**< bind configuration */ + gint16 count; /**< number of workers */ + GList *listen_socks; /**< listening sockets 
descriptors */ + guint64 rlimit_nofile; /**< max files limit */ + guint64 rlimit_maxcore; /**< maximum core file size */ + GHashTable *params; /**< params for worker */ + GQueue *active_workers; /**< linked list of spawned workers */ + gpointer ctx; /**< worker's context */ + ucl_object_t *options; /**< other worker's options */ + struct rspamd_worker_lua_script *scripts; /**< registered lua scripts */ + gboolean enabled; + ref_entry_t ref; +}; + +enum rspamd_log_format_type { + RSPAMD_LOG_STRING = 0, + RSPAMD_LOG_MID, + RSPAMD_LOG_QID, + RSPAMD_LOG_USER, + RSPAMD_LOG_ISSPAM, + RSPAMD_LOG_ACTION, + RSPAMD_LOG_SCORES, + RSPAMD_LOG_SYMBOLS, + RSPAMD_LOG_IP, + RSPAMD_LOG_LEN, + RSPAMD_LOG_DNS_REQ, + RSPAMD_LOG_SMTP_FROM, + RSPAMD_LOG_MIME_FROM, + RSPAMD_LOG_SMTP_RCPT, + RSPAMD_LOG_MIME_RCPT, + RSPAMD_LOG_SMTP_RCPTS, + RSPAMD_LOG_MIME_RCPTS, + RSPAMD_LOG_TIME_REAL, + RSPAMD_LOG_TIME_VIRTUAL, + RSPAMD_LOG_LUA, + RSPAMD_LOG_DIGEST, + RSPAMD_LOG_FILENAME, + RSPAMD_LOG_FORCED_ACTION, + RSPAMD_LOG_SETTINGS_ID, + RSPAMD_LOG_GROUPS, + RSPAMD_LOG_PUBLIC_GROUPS, + RSPAMD_LOG_MEMPOOL_SIZE, + RSPAMD_LOG_MEMPOOL_WASTE, +}; + +enum rspamd_log_format_flags { + RSPAMD_LOG_FMT_FLAG_DEFAULT = 0, + RSPAMD_LOG_FMT_FLAG_OPTIONAL = (1 << 0), + RSPAMD_LOG_FMT_FLAG_MIME_ALTERNATIVE = (1 << 1), + RSPAMD_LOG_FMT_FLAG_CONDITION = (1 << 2), + RSPAMD_LOG_FMT_FLAG_SYMBOLS_SCORES = (1 << 3), + RSPAMD_LOG_FMT_FLAG_SYMBOLS_PARAMS = (1 << 4) +}; + +struct rspamd_log_format { + enum rspamd_log_format_type type; + guint flags; + gsize len; + gpointer data; + struct rspamd_log_format *prev, *next; +}; + +/** + * Standard actions + */ +enum rspamd_action_type { + METRIC_ACTION_REJECT = 0, + METRIC_ACTION_SOFT_REJECT, + METRIC_ACTION_REWRITE_SUBJECT, + METRIC_ACTION_ADD_HEADER, + METRIC_ACTION_GREYLIST, + METRIC_ACTION_NOACTION, + METRIC_ACTION_MAX, + METRIC_ACTION_CUSTOM = 999, + METRIC_ACTION_DISCARD, + METRIC_ACTION_QUARANTINE +}; + +enum rspamd_action_flags { + RSPAMD_ACTION_NORMAL = 0u, + 
RSPAMD_ACTION_NO_THRESHOLD = (1u << 0u), + RSPAMD_ACTION_THRESHOLD_ONLY = (1u << 1u), + RSPAMD_ACTION_HAM = (1u << 2u), + RSPAMD_ACTION_MILTER = (1u << 3u), +}; + + +struct rspamd_action; + +struct rspamd_config_cfg_lua_script { + gint cbref; + gint priority; + gchar *lua_src_pos; + struct rspamd_config_cfg_lua_script *prev, *next; +}; + +struct rspamd_config_post_init_script { + gint cbref; + struct rspamd_config_post_init_script *prev, *next; +}; + +struct rspamd_lang_detector; +struct rspamd_rcl_sections_map; + +enum rspamd_config_settings_policy { + RSPAMD_SETTINGS_POLICY_DEFAULT = 0, + RSPAMD_SETTINGS_POLICY_IMPLICIT_ALLOW = 1, + RSPAMD_SETTINGS_POLICY_IMPLICIT_DENY = 2, +}; + +enum rspamd_gtube_patterns_policy { + RSPAMD_GTUBE_DISABLED = 0, /* Disabled */ + RSPAMD_GTUBE_REJECT, /* Reject message with GTUBE pattern */ + RSPAMD_GTUBE_ALL /* Check all GTUBE like patterns */ +}; + +struct rspamd_config_settings_elt { + guint32 id; + enum rspamd_config_settings_policy policy; + const gchar *name; + ucl_object_t *symbols_enabled; + ucl_object_t *symbols_disabled; + struct rspamd_config_settings_elt *prev, *next; + ref_entry_t ref; +}; + +/** + * Structure that stores all config data + */ +struct rspamd_config { + gchar *rspamd_user; /**< user to run as */ + gchar *rspamd_group; /**< group to run as */ + rspamd_mempool_t *cfg_pool; /**< memory pool for config */ + gchar *cfg_name; /**< name of config file */ + gchar *pid_file; /**< name of pid file */ + gchar *temp_dir; /**< dir for temp files */ + gchar *control_socket_path; /**< path to the control socket */ + const ucl_object_t *local_addrs; /**< tree of local addresses */ +#ifdef WITH_GPERF_TOOLS + gchar *profile_path; +#endif + gdouble unknown_weight; /**< weight of unknown symbols */ + gdouble grow_factor; /**< grow factor for metric */ + GHashTable *symbols; /**< weights of symbols in metric */ + const gchar *subject; /**< subject rewrite string */ + GHashTable *groups; /**< groups of symbols */ + void 
*actions; /**< all actions of the metric (opaque type) */ + + gboolean one_shot_mode; /**< rules add only one symbol */ + gboolean check_text_attachements; /**< check text attachements as text */ + gboolean check_all_filters; /**< check all filters */ + gboolean allow_raw_input; /**< scan messages with invalid mime */ + gboolean disable_hyperscan; /**< disable hyperscan usage */ + gboolean vectorized_hyperscan; /**< use vectorized hyperscan matching */ + gboolean enable_shutdown_workaround; /**< enable workaround for legacy SA clients (exim) */ + gboolean ignore_received; /**< Ignore data from the first received header */ + gboolean enable_sessions_cache; /**< Enable session cache for debug */ + gboolean enable_experimental; /**< Enable experimental plugins */ + gboolean disable_pcre_jit; /**< Disable pcre JIT */ + gboolean own_lua_state; /**< True if we have created lua_state internally */ + gboolean soft_reject_on_timeout; /**< If true emit soft reject on task timeout (if not reject) */ + gboolean public_groups_only; /**< Output merely public groups everywhere */ + enum rspamd_gtube_patterns_policy gtube_patterns_policy; /**< Enable test patterns */ + gboolean enable_css_parser; /**< Enable css parsing in HTML */ + + gsize max_cores_size; /**< maximum size occupied by rspamd core files */ + gsize max_cores_count; /**< maximum number of core files */ + gchar *cores_dir; /**< directory for core files */ + gsize max_message; /**< maximum size for messages */ + gsize max_pic_size; /**< maximum size for a picture to process */ + gsize images_cache_size; /**< size of LRU cache for DCT data from images */ + gdouble task_timeout; /**< maximum message processing time */ + gint default_max_shots; /**< default maximum count of symbols hits permitted (-1 for unlimited) */ + gint32 heartbeats_loss_max; /**< number of heartbeats lost to consider worker's termination */ + gdouble heartbeat_interval; /**< interval for heartbeats for workers */ + + enum rspamd_log_type log_type; 
/**< log type */ + gint log_facility; /**< log facility in case of syslog */ + gint log_level; /**< log level trigger */ + gchar *log_file; /**< path to logfile in case of file logging */ + gboolean log_buffered; /**< whether logging is buffered */ + gboolean log_silent_workers; /**< silence info messages from workers */ + guint32 log_buf_size; /**< length of log buffer */ + const ucl_object_t *debug_ip_map; /**< turn on debugging for specified ip addresses */ + gboolean log_urls; /**< whether we should log URLs */ + GHashTable *debug_modules; /**< logging modules to debug */ + struct rspamd_cryptobox_pubkey *log_encryption_key; /**< encryption key for logs */ + guint log_flags; /**< logging flags */ + guint log_error_elts; /**< number of elements in error logbuf */ + guint log_error_elt_maxlen; /**< maximum size of error log element */ + guint log_task_max_elts; /**< maximum number of elements in task logging */ + struct rspamd_worker_log_pipe *log_pipes; + + gboolean compat_messages; /**< use old messages in the protocol (array) */ + + GPtrArray *script_modules; /**< a list of script modules to load */ + GHashTable *explicit_modules; /**< modules that should be always loaded */ + + GList *filters; /**< linked list of all filters */ + GList *workers; /**< linked list of all workers params */ + struct rspamd_rcl_sections_map *rcl_top_section; /**< top section for RCL config */ + ucl_object_t *cfg_ucl_obj; /**< ucl object */ + ucl_object_t *config_comments; /**< comments saved from the config */ + ucl_object_t *doc_strings; /**< documentation strings for config options */ + GPtrArray *c_modules; /**< list of C modules */ + void *composites_manager; /**< hash of composite symbols indexed by its name */ + GList *classifiers; /**< list of all classifiers defined */ + GList *statfiles; /**< list of all statfiles in config file order */ + GHashTable *classifiers_symbols; /**< hashtable indexed by symbol name of classifiers */ + GHashTable *cfg_params; /**< all cfg params 
indexed by its name in this structure */ + gchar *dynamic_conf; /**< path to dynamic configuration */ + ucl_object_t *current_dynamic_conf; /**< currently loaded dynamic configuration */ + gint clock_res; /**< resolution of clock used */ + + GList *maps; /**< maps active */ + gdouble map_timeout; /**< maps watch timeout */ + gdouble map_file_watch_multiplier; /**< multiplier for watch timeout when maps are files */ + gchar *maps_cache_dir; /**< where to save HTTP cached data */ + + gdouble monitored_interval; /**< interval between monitored checks */ + gboolean disable_monitored; /**< disable monitoring completely */ + gboolean fips_mode; /**< turn on fips mode for openssl */ + + struct rspamd_symcache *cache; /**< symbols cache object */ + gchar *cache_filename; /**< filename of cache file */ + gdouble cache_reload_time; /**< how often cache reload should be performed */ + gchar *checksum; /**< real checksum of config file */ + gpointer lua_state; /**< pointer to lua state */ + gpointer lua_thread_pool; /**< pointer to lua thread (coroutine) pool */ + + gchar *rrd_file; /**< rrd file to store statistics */ + gchar *history_file; /**< file to save rolling history */ + gchar *stats_file; /**< file to save stats */ + gchar *tld_file; /**< file to load effective tld list from */ + gchar *hs_cache_dir; /**< directory to save hyperscan databases */ + gchar *events_backend; /**< string representation of the events backend used */ + + gdouble dns_timeout; /**< timeout in milliseconds for waiting for dns reply */ + guint32 dns_retransmits; /**< maximum retransmits count */ + guint32 dns_io_per_server; /**< number of sockets per DNS server */ + const ucl_object_t *nameservers; /**< list of nameservers or NULL to parse resolv.conf */ + guint32 dns_max_requests; /**< limit of DNS requests per task */ + gboolean enable_dnssec; /**< enable dnssec stub resolver */ + + guint upstream_max_errors; /**< upstream max errors before shutting off */ + gdouble upstream_error_time; /**< 
rate of upstream errors */ + gdouble upstream_revive_time; /**< revive timeout for upstreams */ + gdouble upstream_lazy_resolve_time; /**< lazy resolve time for upstreams */ + struct upstream_ctx *ups_ctx; /**< upstream context */ + struct rspamd_dns_resolver *dns_resolver; /**< dns resolver if loaded */ + + guint min_word_len; /**< minimum length of the word to be considered */ + guint max_word_len; /**< maximum length of the word to be considered */ + guint words_decay; /**< limit for words for starting adaptive ignoring */ + guint history_rows; /**< number of history rows stored */ + guint max_sessions_cache; /**< maximum number of sessions cache elts */ + guint lua_gc_step; /**< lua gc step */ + guint lua_gc_pause; /**< lua gc pause */ + guint full_gc_iters; /**< iterations between full gc cycle */ + guint max_lua_urls; /**< maximum number of urls to be passed to Lua */ + guint max_urls; /**< maximum number of urls to be processed in general */ + gint max_recipients; /**< maximum number of recipients to be processed */ + guint max_blas_threads; /**< maximum threads for openblas when learning ANN */ + guint max_opts_len; /**< maximum length for all options for a symbol */ + gsize max_html_len; /**< maximum length of HTML document */ + + struct module_s **compiled_modules; /**< list of compiled C modules */ + struct worker_s **compiled_workers; /**< list of compiled C modules */ + struct rspamd_log_format *log_format; /**< parsed log format */ + gchar *log_format_str; /**< raw log format string */ + + struct rspamd_external_libs_ctx *libs_ctx; /**< context for external libraries */ + struct rspamd_monitored_ctx *monitored_ctx; /**< context for monitored resources */ + void *redis_pool; /**< redis connection pool */ + + struct rspamd_re_cache *re_cache; /**< static regexp cache */ + + GHashTable *trusted_keys; /**< list of trusted public keys */ + + struct rspamd_config_cfg_lua_script *on_load_scripts; /**< list of scripts executed on workers load */ + struct 
rspamd_config_cfg_lua_script *post_init_scripts; /**< list of scripts executed on config being fully loaded */ + struct rspamd_config_cfg_lua_script *on_term_scripts; /**< list of callbacks called on worker's termination */ + struct rspamd_config_cfg_lua_script *config_unload_scripts; /**< list of scripts executed on config unload */ + + gchar *ssl_ca_path; /**< path to CA certs */ + gchar *ssl_ciphers; /**< set of preferred ciphers */ + gchar *zstd_input_dictionary; /**< path to zstd input dictionary */ + gchar *zstd_output_dictionary; /**< path to zstd output dictionary */ + ucl_object_t *neighbours; /**< other servers in the cluster */ + + struct rspamd_config_settings_elt *setting_ids; /**< preprocessed settings ids */ + struct rspamd_lang_detector *lang_det; /**< language detector */ + struct rspamd_worker *cur_worker; /**< set dynamically by each worker */ + + ref_entry_t ref; /**< reference counter */ +}; + + +/** + * Parse bind line + * @param cfg config file to use + * @param cf worker configuration + * @param str bind line to parse + * @return 1 if line was successfully parsed and 0 in case of error + */ +gboolean rspamd_parse_bind_line(struct rspamd_config *cfg, + struct rspamd_worker_conf *cf, const gchar *str); + + +enum rspamd_config_init_flags { + RSPAMD_CONFIG_INIT_DEFAULT = 0u, + RSPAMD_CONFIG_INIT_SKIP_LUA = (1u << 0u), + RSPAMD_CONFIG_INIT_WIPE_LUA_MEM = (1u << 1u), +}; + +/** + * Init default values + * @param flags initialization flags (enum rspamd_config_init_flags) + */ +struct rspamd_config *rspamd_config_new(enum rspamd_config_init_flags flags); + +/** + * Free memory used by config structure + * @param cfg config file + */ +void rspamd_config_free(struct rspamd_config *cfg); + +/** + * Gets module option with specified name + * @param cfg config file + * @param module_name name of module + * @param opt_name name of option to get + * @return module value or NULL if option is not defined + */ +const ucl_object_t *rspamd_config_get_module_opt(struct rspamd_config *cfg, + 
const gchar *module_name, + const gchar *opt_name) G_GNUC_WARN_UNUSED_RESULT; + + +/** + * Parse flag + * @param str string representation of flag (eg. 'on') + * @return numeric value of flag (0 or 1) + */ +gint rspamd_config_parse_flag(const gchar *str, guint len); + +enum rspamd_post_load_options { + RSPAMD_CONFIG_INIT_URL = 1 << 0, + RSPAMD_CONFIG_INIT_LIBS = 1 << 1, + RSPAMD_CONFIG_INIT_SYMCACHE = 1 << 2, + RSPAMD_CONFIG_INIT_VALIDATE = 1 << 3, + RSPAMD_CONFIG_INIT_NO_TLD = 1 << 4, + RSPAMD_CONFIG_INIT_PRELOAD_MAPS = 1 << 5, + RSPAMD_CONFIG_INIT_POST_LOAD_LUA = 1 << 6, +}; + +#define RSPAMD_CONFIG_LOAD_ALL (RSPAMD_CONFIG_INIT_URL | \ + RSPAMD_CONFIG_INIT_LIBS | \ + RSPAMD_CONFIG_INIT_SYMCACHE | \ + RSPAMD_CONFIG_INIT_VALIDATE | \ + RSPAMD_CONFIG_INIT_PRELOAD_MAPS | \ + RSPAMD_CONFIG_INIT_POST_LOAD_LUA) + +/** + * Do post load actions for config + * @param cfg config file + */ +gboolean rspamd_config_post_load(struct rspamd_config *cfg, + enum rspamd_post_load_options opts); + +/* + * Return a new classifier_config structure, setting default and non-conflicting attributes + */ +struct rspamd_classifier_config *rspamd_config_new_classifier( + struct rspamd_config *cfg, + struct rspamd_classifier_config *c); + +/* + * Return a new worker_conf structure, setting default and non-conflicting attributes + */ +struct rspamd_worker_conf *rspamd_config_new_worker(struct rspamd_config *cfg, + struct rspamd_worker_conf *c); + +/* + * Return a new metric structure, setting default and non-conflicting attributes + */ +void rspamd_config_init_metric(struct rspamd_config *cfg); + +/* + * Return new symbols group definition + */ +struct rspamd_symbols_group *rspamd_config_new_group( + struct rspamd_config *cfg, + const gchar *name); + +/* + * Return a new statfile structure, setting default and non-conflicting attributes + */ +struct rspamd_statfile_config *rspamd_config_new_statfile( + struct rspamd_config *cfg, + struct rspamd_statfile_config *c); + +/* + * Register symbols 
of classifiers inside metrics + */ +void rspamd_config_insert_classify_symbols(struct rspamd_config *cfg); + +/* + * Check statfiles inside a classifier + */ +gboolean rspamd_config_check_statfiles(struct rspamd_classifier_config *cf); + +/* + * Find classifier config by name + */ +struct rspamd_classifier_config *rspamd_config_find_classifier( + struct rspamd_config *cfg, + const gchar *name); + +void rspamd_ucl_add_conf_macros(struct ucl_parser *parser, + struct rspamd_config *cfg); + +void rspamd_ucl_add_conf_variables(struct ucl_parser *parser, GHashTable *vars); + +/** + * Initialize rspamd filtering system (lua and C filters) + * @param cfg + * @param reconfig + * @return + */ +gboolean rspamd_init_filters(struct rspamd_config *cfg, bool reconfig, bool strict); + +/** + * Add new symbol to the metric + * @param cfg + * @param metric metric's name (or NULL for the default metric) + * @param symbol symbol's name + * @param score symbol's score + * @param description optional description + * @param group optional group name + * @param one_shot TRUE if symbol can add its score once + * @param rewrite_existing TRUE if we need to rewrite the existing symbol + * @param priority use the following priority for a symbol + * @param nshots means maximum number of hits for a symbol in metric (-1 for unlimited) + * @return TRUE if symbol has been inserted or FALSE if symbol already exists with higher priority + */ +gboolean rspamd_config_add_symbol(struct rspamd_config *cfg, + const gchar *symbol, + gdouble score, + const gchar *description, + const gchar *group, + guint flags, + guint priority, + gint nshots); + +/** + * Adds new group for a symbol + * @param cfg + * @param symbol + * @param group + * @return + */ +gboolean rspamd_config_add_symbol_group(struct rspamd_config *cfg, + const gchar *symbol, + const gchar *group); + +/** + * Sets action score for a specified metric with the specified priority + * @param cfg config file + * @param metric metric name (or NULL 
for default metric) + * @param action_name symbolic name of action + * @param obj data to set for action + * @return TRUE if symbol has been inserted or FALSE if action already exists with higher priority + */ +gboolean rspamd_config_set_action_score(struct rspamd_config *cfg, + const gchar *action_name, + const ucl_object_t *obj); + +/** + * Check priority and maybe disable action completely + * @param cfg + * @param action_name + * @param priority + * @return + */ +gboolean rspamd_config_maybe_disable_action(struct rspamd_config *cfg, + const gchar *action_name, + guint priority); + +/** + * Checks if a specified C or lua module is enabled or disabled in the config. + * The logic of check is the following: + * + * - For C modules, we check `filters` line and enable module only if it is found there + * - For LUA modules we check the corresponding configuration section: + * - if section exists, then we check `enabled` key and check its value + * - if section is absent, we consider module as disabled + * - For both C and LUA modules we check if the group with the module name is disabled in the default metric + * @param cfg config file + * @param module_name module name + * @return TRUE if a module is enabled + */ +gboolean rspamd_config_is_module_enabled(struct rspamd_config *cfg, + const gchar *module_name); + +/** + * Verifies enabled/disabled combination in the specified object + * @param obj + * @return TRUE if there is no explicit disable in the object found + */ +gboolean rspamd_config_is_enabled_from_ucl(rspamd_mempool_t *pool, + const ucl_object_t *obj); + +/* + * Get action from a string + */ +gboolean rspamd_action_from_str(const gchar *data, enum rspamd_action_type *result); + +/* + * Return textual representation of action enumeration + */ +const gchar *rspamd_action_to_str(enum rspamd_action_type action); + +const gchar *rspamd_action_to_str_alt(enum rspamd_action_type action); + +/** + * Parse radix tree or radix map from ucl object + * @param cfg 
configuration object + * @param obj ucl object with parameter + * @param target target radix tree + * @param err error pointer + * @return + */ +struct rspamd_radix_map_helper; + +gboolean rspamd_config_radix_from_ucl(struct rspamd_config *cfg, const ucl_object_t *obj, const gchar *description, + struct rspamd_radix_map_helper **target, GError **err, + struct rspamd_worker *worker, const gchar *map_name); + +/** + * Adds new settings id to be preprocessed + * @param cfg + * @param name + * @param symbols_enabled (ownership is transferred to callee) + * @param symbols_disabled (ownership is transferred to callee) + */ +void rspamd_config_register_settings_id(struct rspamd_config *cfg, + const gchar *name, + ucl_object_t *symbols_enabled, + ucl_object_t *symbols_disabled, + enum rspamd_config_settings_policy policy); + +/** + * Convert settings name to settings id + * @param name + * @param namelen + * @return + */ +guint32 rspamd_config_name_to_id(const gchar *name, gsize namelen); + +/** + * Finds settings id element and obtain reference count (must be unrefed by caller) + * @param cfg + * @param id + * @return + */ +struct rspamd_config_settings_elt *rspamd_config_find_settings_id_ref( + struct rspamd_config *cfg, + guint32 id); + +/** + * Finds settings element by name and obtains a reference (must be unrefed by caller) + * @param cfg + * @param name settings name (length given by namelen) + * @return + */ +struct rspamd_config_settings_elt *rspamd_config_find_settings_name_ref( + struct rspamd_config *cfg, + const gchar *name, gsize namelen); + +/** + * Returns action object by name + * @param cfg + * @param name + * @return + */ +struct rspamd_action *rspamd_config_get_action(struct rspamd_config *cfg, + const gchar *name); + +struct rspamd_action *rspamd_config_get_action_by_type(struct rspamd_config *cfg, + enum rspamd_action_type type); + +/** + * Iterate over all actions + * @param cfg + * @param func + * @param data + */ +void rspamd_config_actions_foreach(struct rspamd_config *cfg, + void 
(*func)(struct rspamd_action *act, void *d), + void *data); +/** + * Iterate over all actions with index + * @param cfg + * @param func + * @param data + */ +void rspamd_config_actions_foreach_enumerate(struct rspamd_config *cfg, + void (*func)(int idx, struct rspamd_action *act, void *d), + void *data); + +/** + * Returns number of actions defined in the config + * @param cfg + * @return + */ +gsize rspamd_config_actions_size(struct rspamd_config *cfg); + +int rspamd_config_ev_backend_get(struct rspamd_config *cfg); +const gchar *rspamd_config_ev_backend_to_string(int ev_backend, gboolean *effective); + +struct rspamd_external_libs_ctx; + +/** + * Initialize rspamd libraries + */ +struct rspamd_external_libs_ctx *rspamd_init_libs(void); + +/** + * Reset and initialize decompressor + * @param ctx + */ +gboolean rspamd_libs_reset_decompression(struct rspamd_external_libs_ctx *ctx); + +/** + * Reset and initialize compressor + * @param ctx + */ +gboolean rspamd_libs_reset_compression(struct rspamd_external_libs_ctx *ctx); + +/** + * Destroy external libraries context + */ +void rspamd_deinit_libs(struct rspamd_external_libs_ctx *ctx); + +/** + * Returns TRUE if an address belongs to some local address + */ +gboolean rspamd_ip_is_local_cfg(struct rspamd_config *cfg, + const rspamd_inet_addr_t *addr); + +/** + * Configure libraries + */ +gboolean rspamd_config_libs(struct rspamd_external_libs_ctx *ctx, + struct rspamd_config *cfg); + + +#define msg_err_config(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + cfg->cfg_pool->tag.tagname, cfg->checksum, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_err_config_forced(...) rspamd_default_log_function((gint) G_LOG_LEVEL_CRITICAL | (gint) RSPAMD_LOG_FORCED, \ + cfg->cfg_pool->tag.tagname, cfg->checksum, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_config(...) 
rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + cfg->cfg_pool->tag.tagname, cfg->checksum, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_config(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + cfg->cfg_pool->tag.tagname, cfg->checksum, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +extern guint rspamd_config_log_id; +#define msg_debug_config(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_config_log_id, "config", cfg->checksum, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#ifdef __cplusplus +} +#endif + +#endif /* ifdef CFG_FILE_H */ diff --git a/src/libserver/cfg_file_private.h b/src/libserver/cfg_file_private.h new file mode 100644 index 0000000..8c9fc65 --- /dev/null +++ b/src/libserver/cfg_file_private.h @@ -0,0 +1,41 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_CFG_FILE_PRIVATE_H +#define RSPAMD_CFG_FILE_PRIVATE_H + +#include "cfg_file.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Action config definition + */ +struct rspamd_action { + enum rspamd_action_type action_type; + int flags; /* enum rspamd_action_flags */ + guint priority; + gdouble threshold; + gchar *name; +}; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx new file mode 100644 index 0000000..3ac7560 --- /dev/null +++ b/src/libserver/cfg_rcl.cxx @@ -0,0 +1,4110 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "lua/lua_common.h" +#include "cfg_rcl.h" +#include "rspamd.h" +#include "cfg_file_private.h" +#include "utlist.h" +#include "cfg_file.h" +#include "expression.h" +#include "src/libserver/composites/composites.h" +#include "libserver/worker_util.h" +#include "unix-std.h" +#include "cryptobox.h" +#include "libutil/multipattern.h" +#include "libmime/email_addr.h" +#include "libmime/lang_detection.h" + +#include <string> +#include <filesystem> +#include <algorithm>// for std::transform +#include <memory> +#include "contrib/ankerl/unordered_dense.h" +#include "fmt/core.h" +#include "libutil/cxx/util.hxx" +#include "libutil/cxx/file_util.hxx" +#include "frozen/unordered_set.h" +#include "frozen/string.h" + +#ifdef HAVE_SYSLOG_H +#include <syslog.h> +#endif + +#include <cmath> + +struct rspamd_rcl_default_handler_data { + struct rspamd_rcl_struct_parser pd; + std::string key; + rspamd_rcl_default_handler_t handler; +}; + +struct rspamd_rcl_sections_map; + +struct rspamd_rcl_section { + struct rspamd_rcl_sections_map *top{}; + std::string name; /**< name of section */ + std::optional<std::string> key_attr; + std::optional<std::string> default_key; + rspamd_rcl_handler_t handler{}; /**< handler of section attributes */ + enum ucl_type type; /**< type of attribute */ + bool required{}; /**< whether this param is required */ + bool strict_type{}; /**< whether we need strict type */ + mutable bool processed{}; /**< whether this section was processed */ + ankerl::unordered_dense::map<std::string, std::shared_ptr<struct rspamd_rcl_section>> subsections; + ankerl::unordered_dense::map<std::string, struct rspamd_rcl_default_handler_data> default_parser; /**< generic parsing fields */ + rspamd_rcl_section_fin_t fin{}; /** called at the end of section parsing */ + gpointer fin_ud{}; + ucl_object_t *doc_ref{}; /**< reference to the section's documentation */ + + virtual ~rspamd_rcl_section() + { + if (doc_ref) { + ucl_object_unref(doc_ref); + } + } +}; + +struct 
rspamd_worker_param_parser { + rspamd_rcl_default_handler_t handler; /**< handler function */ + struct rspamd_rcl_struct_parser parser; /**< parser attributes */ +}; + +struct rspamd_worker_cfg_parser { + struct pair_hash { + using is_avalanching = void; + template<class T1, class T2> + std::size_t operator()(const std::pair<T1, T2> &pair) const + { + return ankerl::unordered_dense::hash<T1>()(pair.first) ^ ankerl::unordered_dense::hash<T2>()(pair.second); + } + }; + ankerl::unordered_dense::map<std::pair<std::string, gpointer>, + rspamd_worker_param_parser, pair_hash> + parsers; /**< parsers hash */ + gint type; /**< workers quark */ + gboolean (*def_obj_parser)(ucl_object_t *obj, gpointer ud); /**< default object parser */ + gpointer def_ud; +}; + +struct rspamd_rcl_sections_map { + ankerl::unordered_dense::map<std::string, std::shared_ptr<struct rspamd_rcl_section>> sections; + std::vector<std::shared_ptr<struct rspamd_rcl_section>> sections_order; + ankerl::unordered_dense::map<int, struct rspamd_worker_cfg_parser> workers_parser; + ankerl::unordered_dense::set<std::string> lua_modules_seen; +}; + +static bool rspamd_rcl_process_section(struct rspamd_config *cfg, + const struct rspamd_rcl_section &sec, + gpointer ptr, const ucl_object_t *obj, rspamd_mempool_t *pool, + GError **err); +static bool +rspamd_rcl_section_parse_defaults(struct rspamd_config *cfg, + const struct rspamd_rcl_section &section, + rspamd_mempool_t *pool, const ucl_object_t *obj, gpointer ptr, + GError **err); + +/* + * Common section handlers + */ +static gboolean +rspamd_rcl_logging_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, struct rspamd_rcl_section *section, + GError **err) +{ + const ucl_object_t *val; + const gchar *facility = nullptr, *log_type = nullptr, *log_level = nullptr; + auto *cfg = (struct rspamd_config *) ud; + + val = ucl_object_lookup(obj, "type"); + if (val != nullptr && ucl_object_tostring_safe(val, &log_type)) { + if 
(g_ascii_strcasecmp(log_type, "file") == 0) { + /* Need to get filename */ + val = ucl_object_lookup(obj, "filename"); + if (val == nullptr || val->type != UCL_STRING) { + g_set_error(err, + CFG_RCL_ERROR, + ENOENT, + "filename attribute must be specified for file logging type"); + return FALSE; + } + cfg->log_type = RSPAMD_LOG_FILE; + cfg->log_file = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(val)); + } + else if (g_ascii_strcasecmp(log_type, "syslog") == 0) { + /* Need to get facility */ +#ifdef HAVE_SYSLOG_H + cfg->log_facility = LOG_DAEMON; + cfg->log_type = RSPAMD_LOG_SYSLOG; + val = ucl_object_lookup(obj, "facility"); + if (val != nullptr && ucl_object_tostring_safe(val, &facility)) { + if (g_ascii_strcasecmp(facility, "LOG_AUTH") == 0 || + g_ascii_strcasecmp(facility, "auth") == 0) { + cfg->log_facility = LOG_AUTH; + } + else if (g_ascii_strcasecmp(facility, "LOG_CRON") == 0 || + g_ascii_strcasecmp(facility, "cron") == 0) { + cfg->log_facility = LOG_CRON; + } + else if (g_ascii_strcasecmp(facility, "LOG_DAEMON") == 0 || + g_ascii_strcasecmp(facility, "daemon") == 0) { + cfg->log_facility = LOG_DAEMON; + } + else if (g_ascii_strcasecmp(facility, "LOG_MAIL") == 0 || + g_ascii_strcasecmp(facility, "mail") == 0) { + cfg->log_facility = LOG_MAIL; + } + else if (g_ascii_strcasecmp(facility, "LOG_USER") == 0 || + g_ascii_strcasecmp(facility, "user") == 0) { + cfg->log_facility = LOG_USER; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL0") == 0 || + g_ascii_strcasecmp(facility, "local0") == 0) { + cfg->log_facility = LOG_LOCAL0; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL1") == 0 || + g_ascii_strcasecmp(facility, "local1") == 0) { + cfg->log_facility = LOG_LOCAL1; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL2") == 0 || + g_ascii_strcasecmp(facility, "local2") == 0) { + cfg->log_facility = LOG_LOCAL2; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL3") == 0 || + g_ascii_strcasecmp(facility, "local3") == 0) { + 
cfg->log_facility = LOG_LOCAL3; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL4") == 0 || + g_ascii_strcasecmp(facility, "local4") == 0) { + cfg->log_facility = LOG_LOCAL4; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL5") == 0 || + g_ascii_strcasecmp(facility, "local5") == 0) { + cfg->log_facility = LOG_LOCAL5; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL6") == 0 || + g_ascii_strcasecmp(facility, "local6") == 0) { + cfg->log_facility = LOG_LOCAL6; + } + else if (g_ascii_strcasecmp(facility, "LOG_LOCAL7") == 0 || + g_ascii_strcasecmp(facility, "local7") == 0) { + cfg->log_facility = LOG_LOCAL7; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "invalid log facility: %s", + facility); + return FALSE; + } + } +#endif + } + else if (g_ascii_strcasecmp(log_type, + "stderr") == 0 || + g_ascii_strcasecmp(log_type, "console") == 0) { + cfg->log_type = RSPAMD_LOG_CONSOLE; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "invalid log type: %s", + log_type); + return FALSE; + } + } + else { + /* No type specified */ + msg_warn_config( + "logging type is not specified correctly, log output to the console"); + } + + /* Handle log level */ + val = ucl_object_lookup(obj, "level"); + if (val != nullptr && ucl_object_tostring_safe(val, &log_level)) { + if (g_ascii_strcasecmp(log_level, "error") == 0) { + cfg->log_level = G_LOG_LEVEL_ERROR | G_LOG_LEVEL_CRITICAL; + } + else if (g_ascii_strcasecmp(log_level, "warning") == 0) { + cfg->log_level = G_LOG_LEVEL_WARNING; + } + else if (g_ascii_strcasecmp(log_level, "info") == 0) { + cfg->log_level = G_LOG_LEVEL_INFO | G_LOG_LEVEL_MESSAGE; + } + else if (g_ascii_strcasecmp(log_level, "message") == 0 || + g_ascii_strcasecmp(log_level, "notice") == 0) { + cfg->log_level = G_LOG_LEVEL_MESSAGE; + } + else if (g_ascii_strcasecmp(log_level, "silent") == 0) { + cfg->log_level = G_LOG_LEVEL_MESSAGE | G_LOG_LEVEL_INFO; + cfg->log_silent_workers = TRUE; + } + else if (g_ascii_strcasecmp(log_level, 
"debug") == 0) { + cfg->log_level = G_LOG_LEVEL_DEBUG; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "invalid log level: %s", + log_level); + return FALSE; + } + } + + /* Handle flags */ + val = ucl_object_lookup_any(obj, "color", "log_color", nullptr); + if (val && ucl_object_toboolean(val)) { + cfg->log_flags |= RSPAMD_LOG_FLAG_COLOR; + } + + val = ucl_object_lookup_any(obj, "severity", "log_severity", nullptr); + if (val && ucl_object_toboolean(val)) { + cfg->log_flags |= RSPAMD_LOG_FLAG_SEVERITY; + } + + val = ucl_object_lookup_any(obj, "systemd", "log_systemd", nullptr); + if (val && ucl_object_toboolean(val)) { + cfg->log_flags |= RSPAMD_LOG_FLAG_SYSTEMD; + } + + val = ucl_object_lookup_any(obj, "json", "log_json", nullptr); + if (val && ucl_object_toboolean(val)) { + cfg->log_flags |= RSPAMD_LOG_FLAG_JSON; + } + + val = ucl_object_lookup(obj, "log_re_cache"); + if (val && ucl_object_toboolean(val)) { + cfg->log_flags |= RSPAMD_LOG_FLAG_RE_CACHE; + } + + val = ucl_object_lookup_any(obj, "usec", "log_usec", nullptr); + if (val && ucl_object_toboolean(val)) { + cfg->log_flags |= RSPAMD_LOG_FLAG_USEC; + } + + return rspamd_rcl_section_parse_defaults(cfg, *section, cfg->cfg_pool, obj, + (void *) cfg, err); +} + +static gboolean +rspamd_rcl_options_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + const ucl_object_t *dns, *upstream, *neighbours; + auto *cfg = (struct rspamd_config *) ud; + + auto maybe_subsection = rspamd::find_map(section->subsections, "dns"); + + dns = ucl_object_lookup(obj, "dns"); + if (maybe_subsection && dns != nullptr) { + if (!rspamd_rcl_section_parse_defaults(cfg, + *maybe_subsection.value().get(), cfg->cfg_pool, dns, + cfg, err)) { + return FALSE; + } + } + + maybe_subsection = rspamd::find_map(section->subsections, "upstream"); + + upstream = ucl_object_lookup_any(obj, "upstream", "upstreams", nullptr); + if (maybe_subsection 
&& upstream != nullptr) { + if (!rspamd_rcl_section_parse_defaults(cfg, + *maybe_subsection.value().get(), cfg->cfg_pool, + upstream, cfg, err)) { + return FALSE; + } + } + + maybe_subsection = rspamd::find_map(section->subsections, "neighbours"); + + neighbours = ucl_object_lookup(obj, "neighbours"); + if (maybe_subsection && neighbours != nullptr) { + const ucl_object_t *cur; + + LL_FOREACH(neighbours, cur) + { + if (!rspamd_rcl_process_section(cfg, *maybe_subsection.value().get(), cfg, cur, + pool, err)) { + return FALSE; + } + } + } + + const auto *gtube_patterns = ucl_object_lookup(obj, "gtube_patterns"); + if (gtube_patterns != nullptr && ucl_object_type(gtube_patterns) == UCL_STRING) { + auto gtube_st = std::string{ucl_object_tostring(gtube_patterns)}; + std::transform(gtube_st.begin(), gtube_st.end(), gtube_st.begin(), [](const auto c) -> int { + if (c <= 'Z' && c >= 'A') + return c - ('Z' - 'z'); + return c; + }); + + + if (gtube_st == "all") { + cfg->gtube_patterns_policy = RSPAMD_GTUBE_ALL; + } + else if (gtube_st == "reject") { + cfg->gtube_patterns_policy = RSPAMD_GTUBE_REJECT; + } + else if (gtube_st == "disabled" || gtube_st == "disable") { + cfg->gtube_patterns_policy = RSPAMD_GTUBE_DISABLED; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "invalid GTUBE patterns policy: %s", + gtube_st.c_str()); + return FALSE; + } + } + else if (auto *enable_test_patterns = ucl_object_lookup(obj, "enable_test_patterns"); enable_test_patterns != nullptr) { + /* Legacy setting */ + if (!!ucl_object_toboolean(enable_test_patterns)) { + cfg->gtube_patterns_policy = RSPAMD_GTUBE_ALL; + } + } + + if (rspamd_rcl_section_parse_defaults(cfg, + *section, cfg->cfg_pool, obj, + cfg, err)) { + /* We need to init this early */ + rspamd_multipattern_library_init(cfg->hs_cache_dir); + + return TRUE; + } + + return FALSE; +} + +struct rspamd_rcl_symbol_data { + struct rspamd_symbols_group *gr; + struct rspamd_config *cfg; +}; + +static gboolean 
+rspamd_rcl_group_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + + g_assert(key != nullptr); + + auto *gr = static_cast<rspamd_symbols_group *>(g_hash_table_lookup(cfg->groups, key)); + + if (gr == nullptr) { + gr = rspamd_config_new_group(cfg, key); + } + + if (!rspamd_rcl_section_parse_defaults(cfg, *section, pool, obj, + gr, err)) { + return FALSE; + } + + if (const auto *elt = ucl_object_lookup(obj, "one_shot"); elt != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "one_shot attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (ucl_object_toboolean(elt)) { + gr->flags |= RSPAMD_SYMBOL_GROUP_ONE_SHOT; + } + } + + if (const auto *elt = ucl_object_lookup(obj, "disabled"); elt != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "disabled attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (ucl_object_toboolean(elt)) { + gr->flags |= RSPAMD_SYMBOL_GROUP_DISABLED; + } + } + + if (const auto *elt = ucl_object_lookup(obj, "enabled"); elt != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "enabled attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (!ucl_object_toboolean(elt)) { + gr->flags |= RSPAMD_SYMBOL_GROUP_DISABLED; + } + } + + if (const auto *elt = ucl_object_lookup(obj, "public"); elt != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "public attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (ucl_object_toboolean(elt)) { + gr->flags |= RSPAMD_SYMBOL_GROUP_PUBLIC; + } + } + + if (const auto *elt = ucl_object_lookup(obj, "private"); elt != nullptr) { + if (ucl_object_type(elt) != 
UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "private attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (!ucl_object_toboolean(elt)) { + gr->flags |= RSPAMD_SYMBOL_GROUP_PUBLIC; + } + } + + + if (const auto *elt = ucl_object_lookup(obj, "description"); elt != nullptr) { + gr->description = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(elt)); + } + + struct rspamd_rcl_symbol_data sd = { + .gr = gr, + .cfg = cfg, + }; + + /* Handle symbols */ + if (const auto *val = ucl_object_lookup(obj, "symbols"); val != nullptr && ucl_object_type(val) == UCL_OBJECT) { + auto subsection = rspamd::find_map(section->subsections, "symbols"); + + g_assert(subsection.has_value()); + if (!rspamd_rcl_process_section(cfg, *subsection.value().get(), &sd, val, + pool, err)) { + + return FALSE; + } + } + + return TRUE; +} + +static gboolean +rspamd_rcl_symbol_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + auto *sd = static_cast<rspamd_rcl_symbol_data *>(ud); + struct rspamd_config *cfg; + const ucl_object_t *elt; + const gchar *description = nullptr; + gdouble score = NAN; + guint priority = 1, flags = 0; + gint nshots = 0; + + g_assert(key != nullptr); + cfg = sd->cfg; + + if ((elt = ucl_object_lookup(obj, "one_shot")) != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "one_shot attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (ucl_object_toboolean(elt)) { + nshots = 1; + } + } + + if ((elt = ucl_object_lookup(obj, "any_shot")) != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "any_shot attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + if (ucl_object_toboolean(elt)) { + nshots = -1; + } + } + + if ((elt = ucl_object_lookup(obj, "one_param")) != nullptr) { + if 
(ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "one_param attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + + if (ucl_object_toboolean(elt)) { + flags |= RSPAMD_SYMBOL_FLAG_ONEPARAM; + } + } + + if ((elt = ucl_object_lookup(obj, "ignore")) != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "ignore attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + + if (ucl_object_toboolean(elt)) { + flags |= RSPAMD_SYMBOL_FLAG_IGNORE_METRIC; + } + } + + if ((elt = ucl_object_lookup(obj, "enabled")) != nullptr) { + if (ucl_object_type(elt) != UCL_BOOLEAN) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "enabled attribute is not boolean for symbol: '%s'", + key); + + return FALSE; + } + + if (!ucl_object_toboolean(elt)) { + flags |= RSPAMD_SYMBOL_FLAG_DISABLED; + } + } + + if ((elt = ucl_object_lookup(obj, "nshots")) != nullptr) { + if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "nshots attribute is not numeric for symbol: '%s'", + key); + + return FALSE; + } + + nshots = ucl_object_toint(elt); + } + + elt = ucl_object_lookup_any(obj, "score", "weight", nullptr); + if (elt) { + if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "score attribute is not numeric for symbol: '%s'", + key); + + return FALSE; + } + + score = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(obj, "priority"); + if (elt) { + if (ucl_object_type(elt) != UCL_FLOAT && ucl_object_type(elt) != UCL_INT) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "priority attribute is not numeric for symbol: '%s'", + key); + + return FALSE; + } + + priority = ucl_object_toint(elt); + } + else { + priority = ucl_object_get_priority(obj) + 1; + } + + elt = ucl_object_lookup(obj, "description"); + if (elt) { + description = 
ucl_object_tostring(elt); + } + + if (sd->gr) { + rspamd_config_add_symbol(cfg, key, score, + description, sd->gr->name, flags, priority, nshots); + } + else { + rspamd_config_add_symbol(cfg, key, score, + description, nullptr, flags, priority, nshots); + } + + elt = ucl_object_lookup(obj, "groups"); + + if (elt) { + ucl_object_iter_t gr_it; + const ucl_object_t *cur_gr; + + gr_it = ucl_object_iterate_new(elt); + + while ((cur_gr = ucl_object_iterate_safe(gr_it, true)) != nullptr) { + rspamd_config_add_symbol_group(cfg, key, + ucl_object_tostring(cur_gr)); + } + + ucl_object_iterate_free(gr_it); + } + + return TRUE; +} + +static gboolean +rspamd_rcl_actions_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + const ucl_object_t *cur; + ucl_object_iter_t it; + + it = ucl_object_iterate_new(obj); + + while ((cur = ucl_object_iterate_safe(it, true)) != nullptr) { + gint type = ucl_object_type(cur); + + if (type == UCL_NULL) { + rspamd_config_maybe_disable_action(cfg, ucl_object_key(cur), + ucl_object_get_priority(cur)); + } + else if (type == UCL_OBJECT || type == UCL_FLOAT || type == UCL_INT) { + /* Exceptions */ + auto default_elt = false; + + for (const auto &[name, def_elt]: section->default_parser) { + if (def_elt.key == ucl_object_key(cur)) { + default_elt = true; + break; + } + } + + if (default_elt) { + continue; + } + + /* Something non-default */ + if (!rspamd_config_set_action_score(cfg, + ucl_object_key(cur), + cur)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "invalid action definition for: '%s'", + ucl_object_key(cur)); + ucl_object_iterate_free(it); + + return FALSE; + } + } + } + + ucl_object_iterate_free(it); + + return rspamd_rcl_section_parse_defaults(cfg, *section, pool, obj, cfg, err); +} +constexpr const auto known_worker_attributes = frozen::make_unordered_set<frozen::string>({ + "bind_socket", + 
"listen", + "bind", + "count", + "max_files", + "max_core", + "enabled", +}); +static gboolean +rspamd_rcl_worker_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + + g_assert(key != nullptr); + const auto *worker_type = key; + + auto qtype = g_quark_try_string(worker_type); + if (qtype == 0) { + msg_err_config("unknown worker type: %s", worker_type); + return FALSE; + } + + auto *wrk = rspamd_config_new_worker(cfg, nullptr); + wrk->options = ucl_object_copy(obj); + wrk->worker = rspamd_get_worker_by_type(cfg, qtype); + + if (wrk->worker == nullptr) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "unknown worker type: %s", + worker_type); + return FALSE; + } + + wrk->type = qtype; + + if (wrk->worker->worker_init_func) { + wrk->ctx = wrk->worker->worker_init_func(cfg); + } + + const auto *val = ucl_object_lookup_any(obj, "bind_socket", "listen", "bind", nullptr); + /* This name is more logical */ + if (val != nullptr) { + auto it = ucl_object_iterate_new(val); + const ucl_object_t *cur; + const char *worker_bind = nullptr; + + while ((cur = ucl_object_iterate_safe(it, true)) != nullptr) { + if (!ucl_object_tostring_safe(cur, &worker_bind)) { + continue; + } + if (!rspamd_parse_bind_line(cfg, wrk, worker_bind)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot parse bind line: %s", + worker_bind); + ucl_object_iterate_free(it); + return FALSE; + } + } + + ucl_object_iterate_free(it); + } + + if (!rspamd_rcl_section_parse_defaults(cfg, *section, cfg->cfg_pool, obj, + wrk, err)) { + return FALSE; + } + + /* Parse other attributes */ + auto maybe_wparser = rspamd::find_map(section->top->workers_parser, wrk->type); + + if (maybe_wparser && obj->type == UCL_OBJECT) { + auto &wparser = maybe_wparser.value().get(); + auto it = ucl_object_iterate_new(obj); + const ucl_object_t *cur; + + while ((cur = 
ucl_object_iterate_full(it, UCL_ITERATE_EXPLICIT)) != nullptr) { + auto srch = std::make_pair(ucl_object_key(cur), (gpointer) wrk->ctx); + auto maybe_specific = rspamd::find_map(wparser.parsers, srch); + + if (maybe_specific) { + auto &whandler = maybe_specific.value().get(); + const ucl_object_t *cur_obj; + + LL_FOREACH(cur, cur_obj) + { + if (!whandler.handler(cfg->cfg_pool, + cur_obj, + (void *) &whandler.parser, + section, + err)) { + + ucl_object_iterate_free(it); + return FALSE; + } + + if (!(whandler.parser.flags & RSPAMD_CL_FLAG_MULTIPLE)) { + break; + } + } + } + else if (!(wrk->worker->flags & RSPAMD_WORKER_NO_STRICT_CONFIG) && + known_worker_attributes.find(std::string_view{ucl_object_key(cur)}) == known_worker_attributes.end()) { + msg_warn_config("unknown worker attribute: %s; worker type: %s", ucl_object_key(cur), worker_type); + } + } + + ucl_object_iterate_free(it); + + if (wparser.def_obj_parser != nullptr) { + auto *robj = ucl_object_ref(obj); + + if (!wparser.def_obj_parser(robj, wparser.def_ud)) { + ucl_object_unref(robj); + + return FALSE; + } + + ucl_object_unref(robj); + } + } + + cfg->workers = g_list_prepend(cfg->workers, wrk); + + return TRUE; +}
+
+/**
+ * Loads and runs a Lua file named by a string UCL object.
+ *
+ * The process changes its working directory to the directory of the file
+ * while the chunk is loaded and executed, and changes back afterwards.
+ *
+ * @param obj UCL string with the path of the Lua file
+ * @param ud pointer to `rspamd_config`
+ * @param err set when the path is unusable or Lua reports an error
+ * @return TRUE when the file was loaded and executed, FALSE otherwise
+ */
+static gboolean
+rspamd_rcl_lua_handler(rspamd_mempool_t *pool, const ucl_object_t *obj,
+					   const gchar *key, gpointer ud,
+					   struct rspamd_rcl_section *section, GError **err)
+{
+	namespace fs = std::filesystem;
+	auto *cfg = static_cast<rspamd_config *>(ud);
+	const char *lua_src_str = nullptr;
+
+	/* A non-string object has no path; fs::path must never be built from NULL */
+	if (!ucl_object_tostring_safe(obj, &lua_src_str) || lua_src_str == nullptr) {
+		g_set_error(err, CFG_RCL_ERROR, EINVAL,
+					"lua source path must be a string");
+		return FALSE;
+	}
+
+	auto lua_src = fs::path{lua_src_str};
+	auto *L = RSPAMD_LUA_CFG_STATE(cfg);
+	std::error_code ec1;
+
+	auto lua_dir = fs::weakly_canonical(lua_src.parent_path(), ec1);
+	auto lua_file = lua_src.filename();
+
+	if (ec1 || lua_dir.empty() || lua_file.empty()) {
+		/* Report the filesystem error itself, errno is unrelated at this point */
+		g_set_error(err, CFG_RCL_ERROR, ENOENT, "cannot find %s: %s",
+					lua_src.c_str(),
+					ec1 ? ec1.message().c_str() : "empty directory or file name");
+		return FALSE;
+	}
+
+	auto cur_dir = fs::current_path(ec1);
+
+	if (ec1 || cur_dir.empty()) {
+		g_set_error(err, CFG_RCL_ERROR, ENOENT,
+					"cannot get current directory: %s",
+					ec1.message().c_str());
+		return FALSE;
+	}
+
+	if (::chdir(lua_dir.c_str()) == -1) {
+		/* We have not left the original directory, nothing to restore */
+		g_set_error(err, CFG_RCL_ERROR, ENOENT, "cannot chdir to %s: %s",
+					lua_dir.c_str(), strerror(errno));
+		return FALSE;
+	}
+
+	/* From here on every exit path must return to the original directory */
+	auto restore_cwd = [&]() {
+		if (::chdir(cur_dir.c_str()) == -1) {
+			msg_err_config("cannot chdir back to %s: %s", cur_dir.c_str(),
+						   strerror(errno));
+		}
+	};
+
+	/* Push traceback function */
+	lua_pushcfunction(L, &rspamd_lua_traceback);
+	auto err_idx = lua_gettop(L);
+
+	/* Load file */
+	if (luaL_loadfile(L, lua_file.c_str()) != 0) {
+		g_set_error(err,
+					CFG_RCL_ERROR,
+					EINVAL,
+					"cannot load lua file %s: %s",
+					lua_src.c_str(),
+					lua_tostring(L, -1));
+		/* Drop the error message and the traceback function pushed above */
+		lua_settop(L, err_idx - 1);
+		restore_cwd();
+
+		return FALSE;
+	}
+
+	/* Now do it */
+	if (lua_pcall(L, 0, 0, err_idx) != 0) {
+		g_set_error(err,
+					CFG_RCL_ERROR,
+					EINVAL,
+					"cannot init lua file %s: %s",
+					lua_src.c_str(),
+					lua_tostring(L, -1));
+		lua_settop(L, 0);
+		restore_cwd();
+
+		return FALSE;
+	}
+
+	/* Pop the traceback function */
+	lua_pop(L, 1);
+	restore_cwd();
+
+	return TRUE;
+}
+
+/**
+ * Comparator for g_ptr_array_sort(): orders script modules by name.
+ * Both arguments point to array slots, i.e. to `script_module *` values.
+ */
+static int
+rspamd_lua_mod_sort_fn(gconstpointer a, gconstpointer b)
+{
+	auto *m1 = *(const script_module **) a;
+	auto *m2 = *(const script_module **) b;
+
+	return strcmp(m1->name, m2->name);
+}
+
+/**
+ * Registers Lua plugins found at `path` in `cfg->script_modules`.
+ *
+ * `path` may name a single `.lua` file or a directory that is scanned
+ * recursively for `*.lua` files. The module name is the file name without its
+ * extension; a module whose name was already registered through `sections`
+ * is skipped. The resulting array is kept sorted by module name.
+ *
+ * @param sections sections map that tracks already seen module names
+ * @param cfg configuration receiving the modules
+ * @param path file or directory to add
+ * @param main_path if FALSE, a non-existing path is silently accepted
+ * @param err set when `path` is neither a `.lua` file nor a directory
+ * @return TRUE on success (including a skipped optional path), FALSE on error
+ */
+gboolean
+rspamd_rcl_add_lua_plugins_path(struct rspamd_rcl_sections_map *sections,
+								struct rspamd_config *cfg,
+								const gchar *path,
+								gboolean main_path,
+								GError **err)
+{
+	namespace fs = std::filesystem;
+	auto dir = fs::path{path};
+	std::error_code ec;
+
+	auto add_single_file = [&](const fs::path &fpath) -> bool {
+		auto fname = fpath.filename();
+		auto modname = fname.string();
+
+		if (fname.has_extension()) {
+			modname = modname.substr(0, modname.size() - fname.extension().native().size());
+		}
+
+		/* Check before allocating anything from the config pool */
+		if (sections->lua_modules_seen.contains(modname)) {
+			msg_info_config("already seen module %s, skip %s",
+							modname.c_str(), fpath.c_str());
+			return false;
+		}
+
+		auto *cur_mod = rspamd_mempool_alloc_type(cfg->cfg_pool,
+												  struct script_module);
+		cur_mod->path = rspamd_mempool_strdup(cfg->cfg_pool, fpath.c_str());
+		cur_mod->name = rspamd_mempool_strdup(cfg->cfg_pool, modname.c_str());
+
+		g_ptr_array_add(cfg->script_modules, cur_mod);
+		/* Must be the same key as the lookup above, i.e. without extension */
+		sections->lua_modules_seen.insert(modname);
+
+		return true;
+	};
+
+	if (fs::is_regular_file(dir, ec) && dir.has_extension() && dir.extension() == ".lua") {
+		add_single_file(dir);
+	}
+	else if (!fs::is_directory(dir, ec)) {
+		std::error_code exists_ec;
+
+		if (!main_path && !fs::exists(dir, exists_ec)) {
+			msg_debug_config("optional plugins path %s is absent, skip it", path);
+
+			return TRUE;
+		}
+
+		g_set_error(err,
+					CFG_RCL_ERROR,
+					ec ? ec.value() : EINVAL,
+					"invalid lua path spec %s, %s",
+					path,
+					ec.message().c_str());
+		return FALSE;
+	}
+	else {
+		/* Handle directory; use the non-throwing iterator interface throughout */
+		const fs::recursive_directory_iterator dir_end;
+
+		for (auto dit = fs::recursive_directory_iterator(dir, ec);
+			 !ec && dit != dir_end;
+			 dit.increment(ec)) {
+			std::error_code entry_ec;
+
+			if (dit->is_regular_file(entry_ec) && dit->path().string().ends_with(".lua")) {
+				add_single_file(dit->path());
+			}
+		}
+
+		if (ec) {
+			/* Best effort: keep what was found so far, but do not hide the error */
+			msg_err_config("cannot fully scan plugins path %s: %s",
+						   path, ec.message().c_str());
+		}
+	}
+
+	g_ptr_array_sort(cfg->script_modules, rspamd_lua_mod_sort_fn);
+
+	return TRUE;
+}
+
+static gboolean +rspamd_rcl_modules_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + const char *data; + + if (obj->type == UCL_OBJECT) { + const auto *val = ucl_object_lookup(obj, "path"); + + if (val) { + const auto *cur = val; + LL_FOREACH(val, cur) + { + if (ucl_object_tostring_safe(cur, &data)) { + if (!rspamd_rcl_add_lua_plugins_path(section->top, + cfg, + data, + TRUE, + err)) { + return FALSE; + } + } + } + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "path attribute is missing"); + + return FALSE; + } + + val = ucl_object_lookup(obj, "fallback_path"); + + if (val) { + const auto *cur = val; + LL_FOREACH(val, cur) + { + if 
(ucl_object_tostring_safe(cur, &data)) { + if (!rspamd_rcl_add_lua_plugins_path(section->top, + cfg, + data, + FALSE, + err)) { + + return FALSE; + } + } + } + } + + val = ucl_object_lookup(obj, "try_path"); + + if (val) { + const auto *cur = val; + LL_FOREACH(val, cur) + { + if (ucl_object_tostring_safe(cur, &data)) { + if (!rspamd_rcl_add_lua_plugins_path(section->top, + cfg, + data, + FALSE, + err)) { + + return FALSE; + } + } + } + } + } + else if (ucl_object_tostring_safe(obj, &data)) { + if (!rspamd_rcl_add_lua_plugins_path(section->top, cfg, data, TRUE, err)) { + return FALSE; + } + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "module parameter has wrong type (must be an object or a string)"); + return FALSE; + } + + return TRUE; +} + +struct statfile_parser_data { + struct rspamd_config *cfg; + struct rspamd_classifier_config *ccf; +}; + +static gboolean +rspamd_rcl_statfile_handler(rspamd_mempool_t *pool, const ucl_object_t *obj, + const gchar *key, gpointer ud, + struct rspamd_rcl_section *section, GError **err) +{ + auto *stud = (struct statfile_parser_data *) ud; + GList *labels; + + g_assert(key != nullptr); + + auto *cfg = stud->cfg; + auto *ccf = stud->ccf; + + auto *st = rspamd_config_new_statfile(cfg, nullptr); + st->symbol = rspamd_mempool_strdup(cfg->cfg_pool, key); + + if (rspamd_rcl_section_parse_defaults(cfg, *section, pool, obj, st, err)) { + ccf->statfiles = rspamd_mempool_glist_prepend(pool, ccf->statfiles, st); + + if (st->label != nullptr) { + labels = (GList *) g_hash_table_lookup(ccf->labels, st->label); + if (labels != nullptr) { + /* Must use append to preserve the head stored in the hash table */ + labels = g_list_append(labels, st); + } + else { + g_hash_table_insert(ccf->labels, st->label, + g_list_prepend(nullptr, st)); + } + } + + if (st->symbol != nullptr) { + g_hash_table_insert(cfg->classifiers_symbols, st->symbol, st); + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "statfile must have a symbol 
defined"); + return FALSE; + } + + st->opts = (ucl_object_t *) obj; + st->clcf = ccf; + + const auto *val = ucl_object_lookup(obj, "spam"); + if (val == nullptr) { + msg_info_config( + "statfile %s has no explicit 'spam' setting, trying to guess by symbol", + st->symbol); + if (rspamd_substring_search_caseless(st->symbol, + strlen(st->symbol), "spam", 4) != -1) { + st->is_spam = TRUE; + } + else if (rspamd_substring_search_caseless(st->symbol, + strlen(st->symbol), "ham", 3) != -1) { + st->is_spam = FALSE; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot guess spam setting from %s", + st->symbol); + return FALSE; + } + msg_info_config("guessed that statfile with symbol %s is %s", + st->symbol, + st->is_spam ? "spam" : "ham"); + } + return TRUE; + } + + return FALSE; +} + +static gboolean +rspamd_rcl_classifier_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const gchar *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + + g_assert(key != nullptr); + auto *ccf = rspamd_config_new_classifier(cfg, nullptr); + auto *tkcf = (rspamd_tokenizer_config *) nullptr; + + ccf->classifier = rspamd_mempool_strdup(cfg->cfg_pool, key); + + if (rspamd_rcl_section_parse_defaults(cfg, *section, cfg->cfg_pool, obj, + ccf, err)) { + + auto stat_section = rspamd::find_map(section->subsections, "statfile"); + + if (ccf->classifier == nullptr) { + ccf->classifier = rspamd_mempool_strdup(cfg->cfg_pool, "bayes"); + } + + if (ccf->name == nullptr) { + ccf->name = ccf->classifier; + } + + auto it = ucl_object_iterate_new(obj); + const auto *val = obj; + auto res = TRUE; + + while ((val = ucl_object_iterate_safe(it, true)) != nullptr && res) { + const auto *st_key = ucl_object_key(val); + + if (st_key != nullptr) { + if (g_ascii_strcasecmp(st_key, "statfile") == 0) { + const auto *cur = val; + LL_FOREACH(val, cur) + { + struct statfile_parser_data stud = {.cfg = cfg, .ccf = ccf}; + res = 
rspamd_rcl_process_section(cfg, *stat_section.value().get(), &stud, + cur, cfg->cfg_pool, err); + + if (!res) { + ucl_object_iterate_free(it); + + return FALSE; + } + } + } + else if (g_ascii_strcasecmp(st_key, "tokenizer") == 0) { + tkcf = rspamd_mempool_alloc0_type(cfg->cfg_pool, rspamd_tokenizer_config); + + if (ucl_object_type(val) == UCL_STRING) { + tkcf->name = ucl_object_tostring(val); + } + else if (ucl_object_type(val) == UCL_OBJECT) { + const auto *cur = ucl_object_lookup(val, "name"); + if (cur != nullptr) { + tkcf->name = ucl_object_tostring(cur); + tkcf->opts = val; + } + else { + cur = ucl_object_lookup(val, "type"); + if (cur != nullptr) { + tkcf->name = ucl_object_tostring(cur); + tkcf->opts = val; + } + } + } + } + } + } + + ucl_object_iterate_free(it); + } + else { + msg_err_config("fatal configuration error, cannot parse statfile definition"); + } + + if (tkcf == nullptr) { + tkcf = rspamd_mempool_alloc0_type(cfg->cfg_pool, rspamd_tokenizer_config); + tkcf->name = nullptr; + } + + ccf->tokenizer = tkcf; + + /* Handle lua conditions */ + const auto *val = ucl_object_lookup_any(obj, "learn_condition", nullptr); + + if (val) { + const auto *cur = val; + LL_FOREACH(val, cur) + { + if (ucl_object_type(cur) == UCL_STRING) { + const gchar *lua_script; + gsize slen; + gint ref_idx; + + lua_script = ucl_object_tolstring(cur, &slen); + ref_idx = rspamd_lua_function_ref_from_str(RSPAMD_LUA_CFG_STATE(cfg), + lua_script, slen, "learn_condition", err); + + if (ref_idx == LUA_NOREF) { + return FALSE; + } + + rspamd_lua_add_ref_dtor(RSPAMD_LUA_CFG_STATE(cfg), cfg->cfg_pool, ref_idx); + ccf->learn_conditions = rspamd_mempool_glist_append( + cfg->cfg_pool, + ccf->learn_conditions, + GINT_TO_POINTER(ref_idx)); + } + } + } + + val = ucl_object_lookup_any(obj, "classify_condition", nullptr); + + if (val) { + const auto *cur = val; + LL_FOREACH(val, cur) + { + if (ucl_object_type(cur) == UCL_STRING) { + const gchar *lua_script; + gsize slen; + gint ref_idx; + + 
lua_script = ucl_object_tolstring(cur, &slen); + ref_idx = rspamd_lua_function_ref_from_str(RSPAMD_LUA_CFG_STATE(cfg), + lua_script, slen, "classify_condition", err); + + if (ref_idx == LUA_NOREF) { + return FALSE; + } + + rspamd_lua_add_ref_dtor(RSPAMD_LUA_CFG_STATE(cfg), cfg->cfg_pool, ref_idx); + ccf->classify_conditions = rspamd_mempool_glist_append( + cfg->cfg_pool, + ccf->classify_conditions, + GINT_TO_POINTER(ref_idx)); + } + } + } + + ccf->opts = (ucl_object_t *) obj; + cfg->classifiers = g_list_prepend(cfg->classifiers, ccf); + + return TRUE; +} + +static gboolean +rspamd_rcl_composite_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const gchar *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + void *composite; + const gchar *composite_name; + + g_assert(key != nullptr); + + composite_name = key; + + const auto *val = ucl_object_lookup(obj, "enabled"); + if (val != nullptr && !ucl_object_toboolean(val)) { + msg_info_config("composite %s is disabled", composite_name); + return TRUE; + } + + if ((composite = rspamd_composites_manager_add_from_ucl(cfg->composites_manager, + composite_name, obj)) != nullptr) { + rspamd_symcache_add_symbol(cfg->cache, composite_name, 0, + nullptr, composite, SYMBOL_TYPE_COMPOSITE, -1); + } + + return composite != nullptr; +} + +static gboolean +rspamd_rcl_composites_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const gchar *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto success = TRUE; + + auto it = ucl_object_iterate_new(obj); + const auto *cur = obj; + + while ((cur = ucl_object_iterate_safe(it, true))) { + success = rspamd_rcl_composite_handler(pool, cur, + ucl_object_key(cur), ud, section, err); + if (!success) { + break; + } + } + + ucl_object_iterate_free(it); + + return success; +} + +static gboolean +rspamd_rcl_neighbours_handler(rspamd_mempool_t *pool, + const ucl_object_t *obj, 
+ const gchar *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *cfg = static_cast<rspamd_config *>(ud); + auto has_port = FALSE, has_proto = FALSE; + const gchar *p; + + if (key == nullptr) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "missing name for neighbour"); + return FALSE; + } + + const auto *hostval = ucl_object_lookup(obj, "host"); + + if (hostval == nullptr || ucl_object_type(hostval) != UCL_STRING) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "missing host for neighbour: %s", ucl_object_key(obj)); + return FALSE; + } + + auto *neigh = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(neigh, ucl_object_copy(hostval), "host", 0, false); + + if ((p = strrchr(ucl_object_tostring(hostval), ':')) != nullptr) { + if (g_ascii_isdigit(p[1])) { + has_port = TRUE; + } + } + + if (strstr(ucl_object_tostring(hostval), "://") != nullptr) { + has_proto = TRUE; + } + + /* Now make url */ + auto urlstr = std::string{}; + const auto *pathval = ucl_object_lookup(obj, "path"); + + if (!has_proto) { + urlstr += "http://"; + } + + urlstr += ucl_object_tostring(hostval); + + if (!has_port) { + urlstr += ":11334"; + } + + if (pathval == nullptr) { + urlstr += "/"; + } + else { + urlstr += ucl_object_tostring(pathval); + } + + ucl_object_insert_key(neigh, + ucl_object_fromlstring(urlstr.data(), urlstr.size()), + "url", 0, false); + ucl_object_insert_key(cfg->neighbours, neigh, key, 0, true); + + return TRUE; +} + + +struct rspamd_rcl_section * +rspamd_rcl_add_section(struct rspamd_rcl_sections_map **top, + struct rspamd_rcl_section *parent_section, + const gchar *name, const gchar *key_attr, rspamd_rcl_handler_t handler, + enum ucl_type type, gboolean required, gboolean strict_type) +{ + return rspamd_rcl_add_section_doc(top, parent_section, name, key_attr, handler, + type, required, strict_type, nullptr, nullptr); +} + +struct rspamd_rcl_section * +rspamd_rcl_add_section_doc(struct rspamd_rcl_sections_map **top, + struct 
rspamd_rcl_section *parent_section, + const gchar *name, const gchar *key_attr, rspamd_rcl_handler_t handler, + enum ucl_type type, gboolean required, gboolean strict_type, + ucl_object_t *doc_target, + const gchar *doc_string) +{ + if (top == nullptr) { + g_error("invalid arguments to rspamd_rcl_add_section"); + return nullptr; + } + if (*top == nullptr) { + *top = new rspamd_rcl_sections_map; + } + + auto fill_section = [&](struct rspamd_rcl_section *section) { + section->name = name; + if (key_attr) { + section->key_attr = std::string{key_attr}; + } + section->handler = handler; + section->type = type; + section->strict_type = strict_type; + + if (doc_target == nullptr) { + if (parent_section && parent_section->doc_ref) { + section->doc_ref = ucl_object_ref(rspamd_rcl_add_doc_obj(parent_section->doc_ref, + doc_string, + name, + type, + nullptr, + 0, + nullptr, + 0)); + } + else { + section->doc_ref = nullptr; + } + } + else { + section->doc_ref = ucl_object_ref(rspamd_rcl_add_doc_obj(doc_target, + doc_string, + name, + type, + nullptr, + 0, + nullptr, + 0)); + } + section->top = *top; + }; + + /* Select the appropriate container and insert section inside it */ + if (parent_section) { + auto it = parent_section->subsections.insert(std::make_pair(std::string{name}, + std::make_shared<rspamd_rcl_section>())); + if (!it.second) { + g_error("invalid arguments to rspamd_rcl_add_section"); + return nullptr; + } + + fill_section(it.first->second.get()); + return it.first->second.get(); + } + else { + auto it = (*top)->sections.insert(std::make_pair(std::string{name}, + std::make_shared<rspamd_rcl_section>())); + if (!it.second) { + g_error("invalid arguments to rspamd_rcl_add_section"); + return nullptr; + } + + (*top)->sections_order.push_back(it.first->second); + fill_section(it.first->second.get()); + return it.first->second.get(); + } +} + +struct rspamd_rcl_default_handler_data * +rspamd_rcl_add_default_handler(struct rspamd_rcl_section *section, + const gchar 
*name, + rspamd_rcl_default_handler_t handler, + goffset offset, + gint flags, + const gchar *doc_string) +{ + auto it = section->default_parser.emplace(std::make_pair(std::string{name}, rspamd_rcl_default_handler_data{})); + + auto &nhandler = it.first->second; + nhandler.key = name; + nhandler.handler = handler; + nhandler.pd.offset = offset; + nhandler.pd.flags = flags; + + if (section->doc_ref != nullptr) { + rspamd_rcl_add_doc_obj(section->doc_ref, + doc_string, + name, + UCL_NULL, + handler, + flags, + nullptr, + 0); + } + + return &nhandler; +} + +struct rspamd_rcl_sections_map * +rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections) +{ + auto *top = new rspamd_rcl_sections_map; + /* + * Important notice: + * the order of parsing is equal to order of this initialization, therefore + * it is possible to init some portions of config prior to others + */ + + /** + * Logging section + */ + if (!(skip_sections && g_hash_table_lookup(skip_sections, "logging"))) { + auto *sub = rspamd_rcl_add_section_doc(&top, nullptr, + "logging", nullptr, + rspamd_rcl_logging_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Configure rspamd logging"); + /* Default handlers */ + rspamd_rcl_add_default_handler(sub, + "log_buffer", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, log_buf_size), + RSPAMD_CL_FLAG_INT_32, + "Size of log buffer in bytes (for file logging)"); + rspamd_rcl_add_default_handler(sub, + "log_urls", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, log_urls), + 0, + "Write each URL found in a message to the log file"); + rspamd_rcl_add_default_handler(sub, + "debug_ip", + rspamd_rcl_parse_struct_ucl, + G_STRUCT_OFFSET(struct rspamd_config, debug_ip_map), + 0, + "Enable debugging log for the specified IP addresses"); + rspamd_rcl_add_default_handler(sub, + "debug_modules", + rspamd_rcl_parse_struct_string_list, + G_STRUCT_OFFSET(struct rspamd_config, debug_modules), + 
RSPAMD_CL_FLAG_STRING_LIST_HASH, + "Enable debugging for the specified modules"); + rspamd_rcl_add_default_handler(sub, + "log_format", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, log_format_str), + 0, + "Specify format string for the task logging output " + "(https://rspamd.com/doc/configuration/logging.html " + "for details)"); + rspamd_rcl_add_default_handler(sub, + "encryption_key", + rspamd_rcl_parse_struct_pubkey, + G_STRUCT_OFFSET(struct rspamd_config, log_encryption_key), + 0, + "Encrypt sensitive information in logs using this pubkey"); + rspamd_rcl_add_default_handler(sub, + "error_elts", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, log_error_elts), + RSPAMD_CL_FLAG_UINT, + "Size of circular buffer for last errors (10 by default)"); + rspamd_rcl_add_default_handler(sub, + "error_maxlen", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, log_error_elt_maxlen), + RSPAMD_CL_FLAG_UINT, + "Size of each element in error log buffer (1000 by default)"); + rspamd_rcl_add_default_handler(sub, + "task_max_elts", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, log_task_max_elts), + RSPAMD_CL_FLAG_UINT, + "Maximum number of elements in task log entry (7 by default)"); + + /* Documentation only options, handled in log_handler to map flags */ + rspamd_rcl_add_doc_by_path(cfg, + "logging", + "Enable colored output (for console logging)", + "log_color", + UCL_BOOLEAN, + nullptr, + 0, + nullptr, + 0); + rspamd_rcl_add_doc_by_path(cfg, + "logging", + "Enable severity logging output (e.g. 
[error] or [warning])", + "log_severity", + UCL_BOOLEAN, + nullptr, + 0, + nullptr, + 0); + rspamd_rcl_add_doc_by_path(cfg, + "logging", + "Enable systemd compatible logging", + "systemd", + UCL_BOOLEAN, + nullptr, + 0, + nullptr, + 0); + rspamd_rcl_add_doc_by_path(cfg, + "logging", + "Write statistics of regexp processing to log (useful for hyperscan)", + "log_re_cache", + UCL_BOOLEAN, + nullptr, + 0, + nullptr, + 0); + rspamd_rcl_add_doc_by_path(cfg, + "logging", + "Use microseconds resolution for timestamps", + "log_usec", + UCL_BOOLEAN, + nullptr, + 0, + nullptr, + 0); + } + if (!(skip_sections && g_hash_table_lookup(skip_sections, "options"))) { + /** + * Options section + */ + auto *sub = rspamd_rcl_add_section_doc(&top, nullptr, + "options", nullptr, + rspamd_rcl_options_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Global rspamd options"); + rspamd_rcl_add_default_handler(sub, + "cache_file", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, cache_filename), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to the cache file"); + rspamd_rcl_add_default_handler(sub, + "cache_reload", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, cache_reload_time), + RSPAMD_CL_FLAG_TIME_FLOAT, + "How often cache reload should be performed"); + + /* Old DNS configuration */ + rspamd_rcl_add_default_handler(sub, + "dns_nameserver", + rspamd_rcl_parse_struct_ucl, + G_STRUCT_OFFSET(struct rspamd_config, nameservers), + 0, + "Legacy option for DNS servers used"); + rspamd_rcl_add_default_handler(sub, + "dns_timeout", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, dns_timeout), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Legacy option for DNS request timeout"); + rspamd_rcl_add_default_handler(sub, + "dns_retransmits", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, dns_retransmits), + RSPAMD_CL_FLAG_INT_32, + "Legacy option for DNS retransmits count"); + 
rspamd_rcl_add_default_handler(sub, + "dns_sockets", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, dns_io_per_server), + RSPAMD_CL_FLAG_INT_32, + "Legacy option for DNS sockets per server count"); + rspamd_rcl_add_default_handler(sub, + "dns_max_requests", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, dns_max_requests), + RSPAMD_CL_FLAG_INT_32, + "Maximum DNS requests per task (default: 64)"); + rspamd_rcl_add_default_handler(sub, + "control_socket", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, control_socket_path), + 0, + "Path to the control socket"); + rspamd_rcl_add_default_handler(sub, + "explicit_modules", + rspamd_rcl_parse_struct_string_list, + G_STRUCT_OFFSET(struct rspamd_config, explicit_modules), + RSPAMD_CL_FLAG_STRING_LIST_HASH, + "Always load these modules even if they are not configured explicitly"); + rspamd_rcl_add_default_handler(sub, + "allow_raw_input", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, allow_raw_input), + 0, + "Allow non MIME input for rspamd"); + rspamd_rcl_add_default_handler(sub, + "one_shot", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, one_shot_mode), + 0, + "Add all symbols only once per message"); + rspamd_rcl_add_default_handler(sub, + "check_attachements", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, check_text_attachements), + 0, + "Treat text attachments as normal text parts"); + rspamd_rcl_add_default_handler(sub, + "tempdir", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, temp_dir), + RSPAMD_CL_FLAG_STRING_PATH, + "Directory for temporary files"); + rspamd_rcl_add_default_handler(sub, + "pidfile", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, pid_file), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to the pid file"); + rspamd_rcl_add_default_handler(sub, + "filters", + 
rspamd_rcl_parse_struct_string_list, + G_STRUCT_OFFSET(struct rspamd_config, filters), + 0, + "List of internal filters enabled"); + rspamd_rcl_add_default_handler(sub, + "map_watch_interval", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, map_timeout), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Interval for checking maps"); + rspamd_rcl_add_default_handler(sub, + "map_file_watch_multiplier", + rspamd_rcl_parse_struct_double, + G_STRUCT_OFFSET(struct rspamd_config, map_file_watch_multiplier), + 0, + "Multiplier for map watch interval when map is file"); + rspamd_rcl_add_default_handler(sub, + "maps_cache_dir", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, maps_cache_dir), + 0, + "Directory to save maps cached data (default: $DBDIR)"); + rspamd_rcl_add_default_handler(sub, + "monitoring_watch_interval", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, monitored_interval), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Interval for checking monitored instances"); + rspamd_rcl_add_default_handler(sub, + "disable_monitoring", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, disable_monitored), + 0, + "Disable monitoring completely"); + rspamd_rcl_add_default_handler(sub, + "fips_mode", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, fips_mode), + 0, + "Enable FIPS 140-2 mode in OpenSSL"); + rspamd_rcl_add_default_handler(sub, + "dynamic_conf", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, dynamic_conf), + 0, + "Path to the dynamic configuration"); + rspamd_rcl_add_default_handler(sub, + "rrd", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, rrd_file), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to RRD file"); + rspamd_rcl_add_default_handler(sub, + "stats_file", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, stats_file), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to stats file"); + 
rspamd_rcl_add_default_handler(sub, + "history_file", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, history_file), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to history file"); + rspamd_rcl_add_default_handler(sub, + "check_all_filters", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, check_all_filters), + 0, + "Always check all filters"); + rspamd_rcl_add_default_handler(sub, + "public_groups_only", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, public_groups_only), + 0, + "Output merely public groups everywhere"); + rspamd_rcl_add_default_handler(sub, + "enable_css_parser", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, enable_css_parser), + 0, + "Enable CSS parser (experimental)"); + rspamd_rcl_add_default_handler(sub, + "enable_experimental", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, enable_experimental), + 0, + "Enable experimental plugins"); + rspamd_rcl_add_default_handler(sub, + "disable_pcre_jit", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, disable_pcre_jit), + 0, + "Disable PCRE JIT"); + rspamd_rcl_add_default_handler(sub, + "min_word_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, min_word_len), + RSPAMD_CL_FLAG_UINT, + "Minimum length of the word to be considered in statistics/fuzzy"); + rspamd_rcl_add_default_handler(sub, + "max_word_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_word_len), + RSPAMD_CL_FLAG_UINT, + "Maximum length of the word to be considered in statistics/fuzzy"); + rspamd_rcl_add_default_handler(sub, + "max_html_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_word_len), + RSPAMD_CL_FLAG_INT_SIZE, + "Maximum length of the html part to be parsed"); + rspamd_rcl_add_default_handler(sub, + "words_decay", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct 
rspamd_config, words_decay), + RSPAMD_CL_FLAG_UINT, + "Start skipping words at this amount"); + rspamd_rcl_add_default_handler(sub, + "url_tld", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, tld_file), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to the TLD file for urls detector"); + rspamd_rcl_add_default_handler(sub, + "tld", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, tld_file), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to the TLD file for urls detector"); + rspamd_rcl_add_default_handler(sub, + "hs_cache_dir", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, hs_cache_dir), + RSPAMD_CL_FLAG_STRING_PATH, + "Path directory where rspamd would save hyperscan cache"); + rspamd_rcl_add_default_handler(sub, + "history_rows", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, history_rows), + RSPAMD_CL_FLAG_UINT, + "Number of records in the history file"); + rspamd_rcl_add_default_handler(sub, + "disable_hyperscan", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, disable_hyperscan), + 0, + "Disable hyperscan optimizations for regular expressions"); + rspamd_rcl_add_default_handler(sub, + "vectorized_hyperscan", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, vectorized_hyperscan), + 0, + "Use hyperscan in vectorized mode (obsoleted, do not use)"); + rspamd_rcl_add_default_handler(sub, + "cores_dir", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, cores_dir), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to the directory where rspamd core files are intended to be dumped"); + rspamd_rcl_add_default_handler(sub, + "max_cores_size", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_cores_size), + RSPAMD_CL_FLAG_INT_SIZE, + "Limit of joint size of all files in `cores_dir`"); + rspamd_rcl_add_default_handler(sub, + "max_cores_count", + rspamd_rcl_parse_struct_integer, + 
G_STRUCT_OFFSET(struct rspamd_config, max_cores_count), + RSPAMD_CL_FLAG_INT_SIZE, + "Limit of files count in `cores_dir`"); + rspamd_rcl_add_default_handler(sub, + "local_addrs", + rspamd_rcl_parse_struct_ucl, + G_STRUCT_OFFSET(struct rspamd_config, local_addrs), + 0, + "Use the specified addresses as local ones"); + rspamd_rcl_add_default_handler(sub, + "local_networks", + rspamd_rcl_parse_struct_ucl, + G_STRUCT_OFFSET(struct rspamd_config, local_addrs), + 0, + "Use the specified addresses as local ones (alias for `local_addrs`)"); + rspamd_rcl_add_default_handler(sub, + "trusted_keys", + rspamd_rcl_parse_struct_string_list, + G_STRUCT_OFFSET(struct rspamd_config, trusted_keys), + RSPAMD_CL_FLAG_STRING_LIST_HASH, + "List of trusted public keys used for signatures in base32 encoding"); + rspamd_rcl_add_default_handler(sub, + "enable_shutdown_workaround", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, enable_shutdown_workaround), + 0, + "Enable workaround for legacy clients"); + rspamd_rcl_add_default_handler(sub, + "ignore_received", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, ignore_received), + 0, + "Ignore data from the first received header"); + rspamd_rcl_add_default_handler(sub, + "ssl_ca_path", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, ssl_ca_path), + RSPAMD_CL_FLAG_STRING_PATH, + "Path to ssl CA file"); + rspamd_rcl_add_default_handler(sub, + "ssl_ciphers", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, ssl_ciphers), + 0, + "List of ssl ciphers (e.g. 
HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4)"); + rspamd_rcl_add_default_handler(sub, + "max_message", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_message), + RSPAMD_CL_FLAG_INT_SIZE, + "Maximum size of the message to be scanned (50Mb by default)"); + rspamd_rcl_add_default_handler(sub, + "max_pic", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_pic_size), + RSPAMD_CL_FLAG_INT_SIZE, + "Maximum size of the picture to be normalized (1Mb by default)"); + rspamd_rcl_add_default_handler(sub, + "images_cache", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_pic_size), + RSPAMD_CL_FLAG_INT_SIZE, + "Size of DCT data cache for images (256 elements by default)"); + rspamd_rcl_add_default_handler(sub, + "zstd_input_dictionary", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, zstd_input_dictionary), + RSPAMD_CL_FLAG_STRING_PATH, + "Dictionary for zstd inbound protocol compression"); + rspamd_rcl_add_default_handler(sub, + "zstd_output_dictionary", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, zstd_output_dictionary), + RSPAMD_CL_FLAG_STRING_PATH, + "Dictionary for outbound zstd compression"); + rspamd_rcl_add_default_handler(sub, + "compat_messages", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, compat_messages), + 0, + "Use pre 1.4 style of messages in the protocol"); + rspamd_rcl_add_default_handler(sub, + "max_shots", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, default_max_shots), + 0, + "Maximum number of hits per a single symbol (default: 100)"); + rspamd_rcl_add_default_handler(sub, + "sessions_cache", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, enable_sessions_cache), + 0, + "Enable sessions cache to debug dangling sessions"); + rspamd_rcl_add_default_handler(sub, + "max_sessions_cache", + rspamd_rcl_parse_struct_integer, + 
G_STRUCT_OFFSET(struct rspamd_config, max_sessions_cache), + 0, + "Maximum number of sessions in cache before warning (default: 100)"); + rspamd_rcl_add_default_handler(sub, + "task_timeout", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, task_timeout), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Maximum time for checking a message"); + rspamd_rcl_add_default_handler(sub, + "soft_reject_on_timeout", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, soft_reject_on_timeout), + 0, + "Emit soft reject if task timeout takes place"); + rspamd_rcl_add_default_handler(sub, + "check_timeout", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, task_timeout), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Maximum time for checking a message (alias for task_timeout)"); + rspamd_rcl_add_default_handler(sub, + "lua_gc_step", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, lua_gc_step), + RSPAMD_CL_FLAG_UINT, + "Lua garbage-collector step (default: 200)"); + rspamd_rcl_add_default_handler(sub, + "lua_gc_pause", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, lua_gc_pause), + RSPAMD_CL_FLAG_UINT, + "Lua garbage-collector pause (default: 200)"); + rspamd_rcl_add_default_handler(sub, + "full_gc_iters", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, full_gc_iters), + RSPAMD_CL_FLAG_UINT, + "Task scanned before memory gc is performed (default: 0 - disabled)"); + rspamd_rcl_add_default_handler(sub, + "heartbeat_interval", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, heartbeat_interval), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Time between workers heartbeats"); + rspamd_rcl_add_default_handler(sub, + "heartbeats_loss_max", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, heartbeats_loss_max), + RSPAMD_CL_FLAG_INT_32, + "Maximum count of heartbeats to be lost before trying to " + "terminate a worker (default: 0 - 
disabled)"); + rspamd_rcl_add_default_handler(sub, + "max_lua_urls", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_lua_urls), + RSPAMD_CL_FLAG_INT_32, + "Maximum count of URLs to pass to Lua to avoid DoS (default: 1024)"); + rspamd_rcl_add_default_handler(sub, + "max_urls", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_urls), + RSPAMD_CL_FLAG_INT_32, + "Maximum count of URLs to process to avoid DoS (default: 10240)"); + rspamd_rcl_add_default_handler(sub, + "max_recipients", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_recipients), + RSPAMD_CL_FLAG_INT_32, + "Maximum count of recipients to process to avoid DoS (default: 1024)"); + rspamd_rcl_add_default_handler(sub, + "max_blas_threads", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_blas_threads), + RSPAMD_CL_FLAG_INT_32, + "Maximum number of Blas threads for learning neural networks (default: 1)"); + rspamd_rcl_add_default_handler(sub, + "max_opts_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, max_opts_len), + RSPAMD_CL_FLAG_INT_32, + "Maximum size of all options for a single symbol (default: 4096)"); + rspamd_rcl_add_default_handler(sub, + "events_backend", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, events_backend), + 0, + "Events backend to use: kqueue, epoll, select, poll or auto (default: auto)"); + + rspamd_rcl_add_doc_by_path(cfg, + "options", + "Swtich mode of gtube patterns: disable, reject, all", + "gtube_patterns", + UCL_STRING, + nullptr, + 0, + "reject", + 0); + + /* Neighbours configuration */ + rspamd_rcl_add_section_doc(&top, sub, "neighbours", "name", + rspamd_rcl_neighbours_handler, + UCL_OBJECT, FALSE, TRUE, + cfg->doc_strings, + "List of members of Rspamd cluster"); + + /* New DNS configuration */ + auto *ssub = rspamd_rcl_add_section_doc(&top, sub, "dns", nullptr, nullptr, + UCL_OBJECT, FALSE, TRUE, 
+ cfg->doc_strings, + "Options for DNS resolver"); + rspamd_rcl_add_default_handler(ssub, + "nameserver", + rspamd_rcl_parse_struct_ucl, + G_STRUCT_OFFSET(struct rspamd_config, nameservers), + 0, + "List of DNS servers"); + rspamd_rcl_add_default_handler(ssub, + "server", + rspamd_rcl_parse_struct_ucl, + G_STRUCT_OFFSET(struct rspamd_config, nameservers), + 0, + "List of DNS servers"); + rspamd_rcl_add_default_handler(ssub, + "timeout", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, dns_timeout), + RSPAMD_CL_FLAG_TIME_FLOAT, + "DNS request timeout"); + rspamd_rcl_add_default_handler(ssub, + "retransmits", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, dns_retransmits), + RSPAMD_CL_FLAG_INT_32, + "DNS request retransmits"); + rspamd_rcl_add_default_handler(ssub, + "sockets", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, dns_io_per_server), + RSPAMD_CL_FLAG_INT_32, + "Number of sockets per DNS server"); + rspamd_rcl_add_default_handler(ssub, + "connections", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, dns_io_per_server), + RSPAMD_CL_FLAG_INT_32, + "Number of sockets per DNS server"); + rspamd_rcl_add_default_handler(ssub, + "enable_dnssec", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_config, enable_dnssec), + 0, + "Enable DNSSEC support in Rspamd"); + + + /* New upstreams configuration */ + ssub = rspamd_rcl_add_section_doc(&top, sub, "upstream", nullptr, nullptr, + UCL_OBJECT, FALSE, TRUE, + cfg->doc_strings, + "Upstreams configuration parameters"); + rspamd_rcl_add_default_handler(ssub, + "max_errors", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_config, upstream_max_errors), + RSPAMD_CL_FLAG_UINT, + "Maximum number of errors during `error_time` to consider upstream down"); + rspamd_rcl_add_default_handler(ssub, + "error_time", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, 
upstream_error_time), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Time frame to check errors"); + rspamd_rcl_add_default_handler(ssub, + "revive_time", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, upstream_revive_time), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Time before attempting to recover upstream after an error"); + rspamd_rcl_add_default_handler(ssub, + "lazy_resolve_time", + rspamd_rcl_parse_struct_time, + G_STRUCT_OFFSET(struct rspamd_config, upstream_lazy_resolve_time), + RSPAMD_CL_FLAG_TIME_FLOAT, + "Time to resolve upstreams addresses in lazy mode"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "actions"))) { + /** + * Symbols and actions sections + */ + auto *sub = rspamd_rcl_add_section_doc(&top, nullptr, + "actions", nullptr, + rspamd_rcl_actions_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Actions configuration"); + rspamd_rcl_add_default_handler(sub, + "unknown_weight", + rspamd_rcl_parse_struct_double, + G_STRUCT_OFFSET(struct rspamd_config, unknown_weight), + 0, + "Accept unknown symbols with the specified weight"); + rspamd_rcl_add_default_handler(sub, + "grow_factor", + rspamd_rcl_parse_struct_double, + G_STRUCT_OFFSET(struct rspamd_config, grow_factor), + 0, + "Multiply the subsequent symbols by this number " + "(does not affect symbols with score less or " + "equal to zero)"); + rspamd_rcl_add_default_handler(sub, + "subject", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_config, subject), + 0, + "Rewrite subject with this value"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "group"))) { + auto *sub = rspamd_rcl_add_section_doc(&top, nullptr, + "group", "name", + rspamd_rcl_group_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Symbol groups configuration"); + rspamd_rcl_add_section_doc(&top, sub, "symbols", "name", + rspamd_rcl_symbol_handler, + UCL_OBJECT, FALSE, TRUE, + cfg->doc_strings, + "Symbols configuration"); + + /* Group part */ + 
rspamd_rcl_add_default_handler(sub, + "max_score", + rspamd_rcl_parse_struct_double, + G_STRUCT_OFFSET(struct rspamd_symbols_group, max_score), + 0, + "Maximum score that could be reached by this symbols group"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "worker"))) { + /** + * Worker section + */ + auto *sub = rspamd_rcl_add_section_doc(&top, nullptr, "worker", "type", + rspamd_rcl_worker_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Workers common options"); + rspamd_rcl_add_default_handler(sub, + "count", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_worker_conf, count), + RSPAMD_CL_FLAG_INT_16, + "Number of workers to spawn"); + rspamd_rcl_add_default_handler(sub, + "max_files", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_worker_conf, rlimit_nofile), + RSPAMD_CL_FLAG_INT_64, + "Maximum number of opened files per worker"); + rspamd_rcl_add_default_handler(sub, + "max_core", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_worker_conf, rlimit_maxcore), + RSPAMD_CL_FLAG_INT_64, + "Max size of core file in bytes"); + rspamd_rcl_add_default_handler(sub, + "enabled", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_worker_conf, enabled), + 0, + "Enable or disable a worker (true by default)"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "modules"))) { + /** + * Modules handler + */ + rspamd_rcl_add_section_doc(&top, nullptr, + "modules", nullptr, + rspamd_rcl_modules_handler, + UCL_OBJECT, + FALSE, + FALSE, + cfg->doc_strings, + "Lua plugins to load"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "classifier"))) { + /** + * Classifiers handler + */ + auto *sub = rspamd_rcl_add_section_doc(&top, nullptr, + "classifier", "type", + rspamd_rcl_classifier_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "CLassifier options"); + /* Default classifier is 'bayes' for now */ + sub->default_key = 
"bayes"; + + rspamd_rcl_add_default_handler(sub, + "min_tokens", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_tokens), + RSPAMD_CL_FLAG_INT_32, + "Minimum count of tokens (words) to be considered for statistics"); + rspamd_rcl_add_default_handler(sub, + "min_token_hits", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits), + RSPAMD_CL_FLAG_UINT, + "Minimum number of hits for a token to be considered"); + rspamd_rcl_add_default_handler(sub, + "min_prob_strength", + rspamd_rcl_parse_struct_double, + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_token_hits), + 0, + "Use only tokens with probability in [0.5 - MPS, 0.5 + MPS]"); + rspamd_rcl_add_default_handler(sub, + "max_tokens", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_classifier_config, max_tokens), + RSPAMD_CL_FLAG_INT_32, + "Maximum count of tokens (words) to be considered for statistics"); + rspamd_rcl_add_default_handler(sub, + "min_learns", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET(struct rspamd_classifier_config, min_learns), + RSPAMD_CL_FLAG_UINT, + "Minimum number of learns for each statfile to use this classifier"); + rspamd_rcl_add_default_handler(sub, + "backend", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_classifier_config, backend), + 0, + "Statfiles engine"); + rspamd_rcl_add_default_handler(sub, + "name", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_classifier_config, name), + 0, + "Name of classifier"); + + /* + * Statfile defaults + */ + auto *ssub = rspamd_rcl_add_section_doc(&top, sub, + "statfile", "symbol", + rspamd_rcl_statfile_handler, + UCL_OBJECT, + TRUE, + TRUE, + sub->doc_ref, + "Statfiles options"); + rspamd_rcl_add_default_handler(ssub, + "label", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_statfile_config, label), + 0, + "Statfile unique label"); + rspamd_rcl_add_default_handler(ssub, 
+ "spam", + rspamd_rcl_parse_struct_boolean, + G_STRUCT_OFFSET(struct rspamd_statfile_config, is_spam), + 0, + "Sets if this statfile contains spam samples"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "composite"))) { + /** + * Composites handlers + */ + rspamd_rcl_add_section_doc(&top, nullptr, + "composite", "name", + rspamd_rcl_composite_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Rspamd composite symbols"); + rspamd_rcl_add_section_doc(&top, nullptr, + "composites", nullptr, + rspamd_rcl_composites_handler, + UCL_OBJECT, + FALSE, + TRUE, + cfg->doc_strings, + "Rspamd composite symbols"); + } + + if (!(skip_sections && g_hash_table_lookup(skip_sections, "lua"))) { + /** + * Lua handler + */ + rspamd_rcl_add_section_doc(&top, nullptr, + "lua", nullptr, + rspamd_rcl_lua_handler, + UCL_STRING, + FALSE, + TRUE, + cfg->doc_strings, + "Lua files to load"); + } + + cfg->rcl_top_section = top; + + return top; +} + /* Runs sec.handler for one configuration object. Returns TRUE at once when sec.processed is set. If sec.key_attr is set and every child of obj is itself a UCL object, the handler is called once per child with that child's key. Otherwise the handler is called once for obj itself; when sec.key_attr is set the key is the string value of that attribute in obj, or sec.default_key if the attribute is absent. Returns false with err set when the attribute is missing and there is no default key, or when it is not a string; also returns false as soon as a handler call fails. */ +static bool +rspamd_rcl_process_section(struct rspamd_config *cfg, + const struct rspamd_rcl_section &sec, + gpointer ptr, const ucl_object_t *obj, rspamd_mempool_t *pool, + GError **err) +{ + ucl_object_iter_t it; + const ucl_object_t *cur; + auto is_nested = true; + const gchar *key = nullptr; + + if (sec.processed) { + /* Section has been already processed */ + return TRUE; + } + + g_assert(obj != nullptr); + g_assert(sec.handler != nullptr); + + if (sec.key_attr) { + /* The object counts as a set of nested subsections only if no child is a non-object value */ it = ucl_object_iterate_new(obj); + + while ((cur = ucl_object_iterate_full(it, UCL_ITERATE_EXPLICIT)) != nullptr) { + if (ucl_object_type(cur) != UCL_OBJECT) { + is_nested = false; + break; + } + } + + ucl_object_iterate_free(it); + } + else { + is_nested = false; + } + + if (is_nested) { + /* Just reiterate on all subobjects */ + it = ucl_object_iterate_new(obj); + + while ((cur = ucl_object_iterate_full(it, UCL_ITERATE_EXPLICIT)) != nullptr) { + if (!sec.handler(pool, cur, ucl_object_key(cur), ptr, const_cast<rspamd_rcl_section *>(&sec), err)) { + 
ucl_object_iterate_free(it); + + return false; + } + } + + ucl_object_iterate_free(it); + + return true; + } + else { + if (sec.key_attr) { + /* First of all search for required attribute and use it as a key */ + cur = ucl_object_lookup(obj, sec.key_attr.value().c_str()); + + if (cur == nullptr) { + if (!sec.default_key) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, "required attribute " + "'%s' is missing for section '%s', current key: %s", + sec.key_attr.value().c_str(), + sec.name.c_str(), + ucl_object_key(obj)); + + return false; + } + else { + msg_info("using default key '%s' for mandatory field '%s' " + "for section '%s'", + sec.default_key.value().c_str(), sec.key_attr.value().c_str(), + sec.name.c_str()); + key = sec.default_key.value().c_str(); + } + } + else if (ucl_object_type(cur) != UCL_STRING) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, "required attribute %s" + " is not a string for section %s", + sec.key_attr.value().c_str(), sec.name.c_str()); + + return false; + } + else { + key = ucl_object_tostring(cur); + } + } + } + + /* key stays nullptr here when the section has no key_attr */ return sec.handler(pool, obj, key, ptr, const_cast<rspamd_rcl_section *>(&sec), err); +} + /* Walks top->sections_order and applies every registered section to the top-level configuration object obj. The section named "*" is applied to each object visited by LL_FOREACH starting at obj whose key is not registered in top->sections; any other section is looked up in obj by its name. A section with a handler goes through rspamd_rcl_process_section, a section without one goes through rspamd_rcl_section_parse_defaults. After each section its fin callback, if set, is called with fin_ud. Returns FALSE with err set when obj is not an object, when a required section is missing, or when a strict_type section has a different UCL type; returns FALSE as well when rspamd_rcl_process_section fails. Returns TRUE otherwise. */ +gboolean +rspamd_rcl_parse(struct rspamd_rcl_sections_map *top, + struct rspamd_config *cfg, + gpointer ptr, rspamd_mempool_t *pool, + const ucl_object_t *obj, GError **err) +{ + if (obj->type != UCL_OBJECT) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "top configuration must be an object"); + return FALSE; + } + + /* Iterate over known sections and ignore unknown ones */ + for (const auto &sec_ptr: top->sections_order) { + if (sec_ptr->name == "*") { + /* Default section handler */ + const auto *cur_obj = obj; + LL_FOREACH(obj, cur_obj) + { + if (!top->sections.contains(ucl_object_key(cur_obj))) { + if (sec_ptr->handler != nullptr) { + if (!rspamd_rcl_process_section(cfg, *sec_ptr, ptr, cur_obj, + pool, err)) { + return FALSE; + } + } + else { + /* NOTE(review): the result of this call is discarded, so a failure here does not stop parsing - confirm this is intended */ rspamd_rcl_section_parse_defaults(cfg, + *sec_ptr, + pool, + cur_obj, + ptr, + err); + } + 
} + } + } + else { + const auto *found = ucl_object_lookup(obj, sec_ptr->name.c_str()); + if (found == nullptr) { + if (sec_ptr->required) { + g_set_error(err, CFG_RCL_ERROR, ENOENT, + "required section %s is missing", sec_ptr->name.c_str()); + return FALSE; + } + } + else { + /* Check type */ + if (sec_ptr->strict_type) { + if (sec_ptr->type != found->type) { + g_set_error(err, CFG_RCL_ERROR, EINVAL, + "object in section %s has invalid type", sec_ptr->name.c_str()); + return FALSE; + } + } + + const auto *cur_obj = found; + LL_FOREACH(found, cur_obj) + { + if (sec_ptr->handler != nullptr) { + if (!rspamd_rcl_process_section(cfg, *sec_ptr, ptr, cur_obj, + pool, err)) { + return FALSE; + } + } + else { + rspamd_rcl_section_parse_defaults(cfg, *sec_ptr, + pool, + cur_obj, + ptr, + err); + } + } + } + } + if (sec_ptr->fin) { + sec_ptr->fin(pool, sec_ptr->fin_ud); + } + } + + return TRUE; +} + +static bool +rspamd_rcl_section_parse_defaults(struct rspamd_config *cfg, + const struct rspamd_rcl_section §ion, + rspamd_mempool_t *pool, const ucl_object_t *obj, gpointer ptr, + GError **err) +{ + + if (obj->type != UCL_OBJECT) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "default configuration must be an object for section %s " + "(actual type is %s)", + section.name.c_str(), ucl_object_type_to_string(ucl_object_type(obj))); + return FALSE; + } + + for (const auto &cur: section.default_parser) { + const auto *found = ucl_object_lookup(obj, cur.first.c_str()); + if (found != nullptr) { + auto new_pd = cur.second.pd; + new_pd.user_struct = ptr; + new_pd.cfg = cfg; + const auto *cur_obj = found; + + LL_FOREACH(found, cur_obj) + { + if (!cur.second.handler(pool, cur_obj, &new_pd, const_cast<rspamd_rcl_section *>(§ion), err)) { + return FALSE; + } + + if (!(new_pd.flags & RSPAMD_CL_FLAG_MULTIPLE)) { + break; + } + } + } + } + + return TRUE; +} + +gboolean +rspamd_rcl_parse_struct_string(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct 
rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + /* Size of the pool buffer used when a non-string scalar is rendered as text */ const gsize num_str_len = 32; + + auto target = (gchar **) (((gchar *) pd->user_struct) + pd->offset); + switch (obj->type) { + case UCL_STRING: + *target = + rspamd_mempool_strdup(pool, ucl_copy_value_trash(obj)); + break; + case UCL_INT: + *target = (gchar *) rspamd_mempool_alloc(pool, num_str_len); + rspamd_snprintf(*target, num_str_len, "%L", obj->value.iv); + break; + case UCL_FLOAT: + *target = (gchar *) rspamd_mempool_alloc(pool, num_str_len); + rspamd_snprintf(*target, num_str_len, "%f", obj->value.dv); + break; + case UCL_BOOLEAN: + *target = (gchar *) rspamd_mempool_alloc(pool, num_str_len); + rspamd_snprintf(*target, num_str_len, "%s", + ((gboolean) obj->value.iv) ? "true" : "false"); + break; + case UCL_NULL: + /* String is enforced to be null */ + *target = nullptr; + break; + default: + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to string in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + + return TRUE; +} + /* Converts obj to an integer with ucl_object_toint_safe and stores it at pd->offset inside pd->user_struct. The destination type is chosen by comparing pd->flags for equality with RSPAMD_CL_FLAG_INT_32, RSPAMD_CL_FLAG_INT_64, RSPAMD_CL_FLAG_INT_SIZE, RSPAMD_CL_FLAG_INT_16 or RSPAMD_CL_FLAG_UINT; any other flags value stores a plain gint. The 64-bit value is assigned to the destination without a range check. Returns FALSE with err set when the conversion fails. */ +gboolean +rspamd_rcl_parse_struct_integer(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + union { + gint *ip; + gint32 *i32p; + gint16 *i16p; + gint64 *i64p; + guint *up; + gsize *sp; + } target; + int64_t val; + + if (pd->flags == RSPAMD_CL_FLAG_INT_32) { + target.i32p = (gint32 *) (((gchar *) pd->user_struct) + pd->offset); + if (!ucl_object_toint_safe(obj, &val)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to integer in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + *target.i32p = val; + } + else if (pd->flags == RSPAMD_CL_FLAG_INT_64) { + target.i64p = (gint64 *) (((gchar *) pd->user_struct) + pd->offset); + if (!ucl_object_toint_safe(obj, &val)) 
{ + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to integer in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + *target.i64p = val; + } + else if (pd->flags == RSPAMD_CL_FLAG_INT_SIZE) { + target.sp = (gsize *) (((gchar *) pd->user_struct) + pd->offset); + if (!ucl_object_toint_safe(obj, &val)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to integer in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + /* Signed 64-bit value is stored into an unsigned gsize without a sign check */ *target.sp = val; + } + else if (pd->flags == RSPAMD_CL_FLAG_INT_16) { + target.i16p = (gint16 *) (((gchar *) pd->user_struct) + pd->offset); + if (!ucl_object_toint_safe(obj, &val)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to integer in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + *target.i16p = val; + } + else if (pd->flags == RSPAMD_CL_FLAG_UINT) { + target.up = (guint *) (((gchar *) pd->user_struct) + pd->offset); + if (!ucl_object_toint_safe(obj, &val)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to integer in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + *target.up = val; + } + else { + /* No width flag matched pd->flags exactly: the value is stored as a plain gint */ target.ip = (gint *) (((gchar *) pd->user_struct) + pd->offset); + if (!ucl_object_toint_safe(obj, &val)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to integer in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + *target.ip = val; + } + + return TRUE; +} + /* Converts obj with ucl_object_todouble_safe straight into the gdouble located at pd->offset inside pd->user_struct. Returns FALSE with err set when the conversion fails, TRUE otherwise. */ +gboolean +rspamd_rcl_parse_struct_double(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + gdouble *target; + + target = (gdouble *) (((gchar *) pd->user_struct) + pd->offset); + + 
if (!ucl_object_todouble_safe(obj, target)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to double in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + + return TRUE; +} + /* Converts obj to a double with ucl_object_todouble_safe and stores it at pd->offset inside pd->user_struct in the representation selected by comparing pd->flags for equality: RSPAMD_CL_FLAG_TIME_TIMEVAL fills a struct timeval (whole seconds plus microseconds), RSPAMD_CL_FLAG_TIME_TIMESPEC fills a struct timespec (whole seconds plus nanoseconds), RSPAMD_CL_FLAG_TIME_FLOAT stores the double unchanged, RSPAMD_CL_FLAG_TIME_INTEGER and RSPAMD_CL_FLAG_TIME_UINT_32 store the value multiplied by 1000 into a gint or guint32. Returns FALSE with err set when the conversion fails or when pd->flags matches none of these values. */ +gboolean +rspamd_rcl_parse_struct_time(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + union { + gint *psec; + guint32 *pu32; + gdouble *pdv; + struct timeval *ptv; + struct timespec *pts; + } target; + gdouble val; + + if (!ucl_object_todouble_safe(obj, &val)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to double in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + + if (pd->flags == RSPAMD_CL_FLAG_TIME_TIMEVAL) { + target.ptv = + (struct timeval *) (((gchar *) pd->user_struct) + pd->offset); + target.ptv->tv_sec = (glong) val; + /* Fractional part of the second expressed in microseconds */ target.ptv->tv_usec = (val - (glong) val) * 1000000; + } + else if (pd->flags == RSPAMD_CL_FLAG_TIME_TIMESPEC) { + target.pts = + (struct timespec *) (((gchar *) pd->user_struct) + pd->offset); + target.pts->tv_sec = (glong) val; + /* tv_nsec holds nanoseconds, so the fractional second is scaled by 1e9 (the previous 1e12 factor produced values outside the valid range) */ target.pts->tv_nsec = (val - (glong) val) * 1000000000LL; + } + else if (pd->flags == RSPAMD_CL_FLAG_TIME_FLOAT) { + target.pdv = (double *) (((gchar *) pd->user_struct) + pd->offset); + *target.pdv = val; + } + else if (pd->flags == RSPAMD_CL_FLAG_TIME_INTEGER) { + target.psec = (gint *) (((gchar *) pd->user_struct) + pd->offset); + /* Stored as val * 1000 - presumably milliseconds; confirm against the users of this flag */ *target.psec = val * 1000; + } + else if (pd->flags == RSPAMD_CL_FLAG_TIME_UINT_32) { + target.pu32 = (guint32 *) (((gchar *) pd->user_struct) + pd->offset); + *target.pu32 = val * 1000; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to time in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + + return TRUE; +} + +gboolean 
+rspamd_rcl_parse_struct_keypair(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + /* Builds a keypair from a UCL object with rspamd_keypair_from_ucl, stores it at pd->offset inside pd->user_struct and registers rspamd_keypair_unref as a pool destructor for it. Sets err and returns FALSE when obj is not a UCL object or when the keypair cannot be loaded. */ auto *pd = (struct rspamd_rcl_struct_parser *) ud; + struct rspamd_cryptobox_keypair **target, *kp; + + target = (struct rspamd_cryptobox_keypair **) (((gchar *) pd->user_struct) + + pd->offset); + if (obj->type == UCL_OBJECT) { + kp = rspamd_keypair_from_ucl(obj); + + if (kp != nullptr) { + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) rspamd_keypair_unref, kp); + *target = kp; + } + else { + /* The emitted JSON dump is only used for the error text and is released with free() right after */ gchar *dump = (char *) ucl_object_emit(obj, UCL_EMIT_JSON_COMPACT); + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot load the keypair specified: %s; section: %s; value: %s", + ucl_object_key(obj), section->name.c_str(), dump); + free(dump); + + return FALSE; + } + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "no sane pubkey or privkey found in the keypair: %s", + ucl_object_key(obj)); + return FALSE; + } + + return TRUE; +} + /* Parses a UCL string as a base32 public key with rspamd_pubkey_from_base32 and stores it at pd->offset inside pd->user_struct. RSPAMD_CL_FLAG_SIGNKEY selects RSPAMD_KEYPAIR_SIGN instead of RSPAMD_KEYPAIR_KEX, and RSPAMD_CL_FLAG_NISTKEY selects RSPAMD_CRYPTOBOX_MODE_NIST instead of RSPAMD_CRYPTOBOX_MODE_25519. On success rspamd_pubkey_unref is registered as a pool destructor for the key. A non-string value or a key that cannot be loaded sets err and returns FALSE. */ +gboolean +rspamd_rcl_parse_struct_pubkey(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + struct rspamd_cryptobox_pubkey **target, *pk; + gsize len; + const gchar *str; + rspamd_cryptobox_keypair_type keypair_type = RSPAMD_KEYPAIR_KEX; + rspamd_cryptobox_mode keypair_mode = RSPAMD_CRYPTOBOX_MODE_25519; + + if (pd->flags & RSPAMD_CL_FLAG_SIGNKEY) { + keypair_type = RSPAMD_KEYPAIR_SIGN; + } + if (pd->flags & RSPAMD_CL_FLAG_NISTKEY) { + keypair_mode = RSPAMD_CRYPTOBOX_MODE_NIST; + } + + target = (struct rspamd_cryptobox_pubkey **) (((gchar *) pd->user_struct) + + pd->offset); + if (obj->type == UCL_STRING) { + str = ucl_object_tolstring(obj, &len); + pk = rspamd_pubkey_from_base32(str, len, keypair_type, + keypair_mode); + + if (pk != nullptr) { + *target = pk; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot load 
the pubkey specified: %s", + ucl_object_key(obj)); + return FALSE; + } + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "no sane pubkey found in the element: %s", + ucl_object_key(obj)); + return FALSE; + } + + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) rspamd_pubkey_unref, pk); + + return TRUE; +} + +static void +rspamd_rcl_insert_string_list_item(gpointer *target, rspamd_mempool_t *pool, + std::string_view elt, gboolean is_hash) +{ + union { + GHashTable *hv; + GList *lv; + gpointer p; + } d; + gchar *val; + + d.p = *target; + + if (is_hash) { + if (d.hv == nullptr) { + d.hv = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, d.hv); + } + + val = rspamd_mempool_strdup_len(pool, elt.data(), elt.size()); + g_hash_table_insert(d.hv, val, val); + } + else { + val = rspamd_mempool_strdup_len(pool, elt.data(), elt.size()); + d.lv = g_list_prepend(d.lv, val); + } + + *target = d.p; +} + +gboolean +rspamd_rcl_parse_struct_string_list(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + constexpr const auto num_str_len = 32; + auto need_destructor = true; + + + auto is_hash = pd->flags & RSPAMD_CL_FLAG_STRING_LIST_HASH; + auto *target = (gpointer *) (((gchar *) pd->user_struct) + pd->offset); + + if (!is_hash && *target != nullptr) { + need_destructor = FALSE; + } + + auto iter = ucl_object_iterate_new(obj); + const auto *cur = obj; + + while ((cur = ucl_object_iterate_safe(iter, true)) != nullptr) { + switch (cur->type) { + case UCL_STRING: { + rspamd::string_foreach_delim(ucl_object_tostring(cur), ", ", [&](const auto &elt) { + rspamd_rcl_insert_string_list_item(target, pool, elt, is_hash); + }); + + /* Go to the next object */ + continue; + } + case UCL_INT: { + auto *val = (gchar *) rspamd_mempool_alloc(pool, 
num_str_len); + rspamd_snprintf(val, num_str_len, "%L", cur->value.iv); + rspamd_rcl_insert_string_list_item(target, pool, val, is_hash); + break; + } + case UCL_FLOAT: { + auto *val = (gchar *) rspamd_mempool_alloc(pool, num_str_len); + rspamd_snprintf(val, num_str_len, "%f", cur->value.dv); + rspamd_rcl_insert_string_list_item(target, pool, val, is_hash); + break; + } + case UCL_BOOLEAN: { + auto *val = (gchar *) rspamd_mempool_alloc(pool, num_str_len); + rspamd_snprintf(val, num_str_len, "%s", + ((gboolean) cur->value.iv) ? "true" : "false"); + rspamd_rcl_insert_string_list_item(target, pool, val, is_hash); + break; + } + default: + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to a string list in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + ucl_object_iterate_free(iter); + + return FALSE; + } + } + + ucl_object_iterate_free(iter); + +#if 0 + /* WTF: why don't we allow empty list here?? */ + if (*target == nullptr) { + g_set_error (err, + CFG_RCL_ERROR, + EINVAL, + "non-empty array of strings is expected: %s, " + "got: %s, of length: %d", + ucl_object_key (obj), ucl_object_type_to_string (obj->type), + obj->len); + return FALSE; + } +#endif + + if (!is_hash && *target != nullptr) { + *target = g_list_reverse(*(GList **) target); + + if (need_destructor) { + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) g_list_free, + *target); + } + } + + return TRUE; +} + +gboolean +rspamd_rcl_parse_struct_ucl(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + const ucl_object_t **target; + + target = (const ucl_object_t **) (((gchar *) pd->user_struct) + pd->offset); + + *target = obj; + + return TRUE; +} + + +gboolean +rspamd_rcl_parse_struct_boolean(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) 
+{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + gboolean *target; + + target = (gboolean *) (((gchar *) pd->user_struct) + pd->offset); + + if (obj->type == UCL_BOOLEAN) { + *target = obj->value.iv; + } + else if (obj->type == UCL_INT) { + *target = obj->value.iv; + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to boolean in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + + if (pd->flags & RSPAMD_CL_FLAG_BOOLEAN_INVERSE) { + *target = !*target; + } + + return TRUE; +} + +gboolean +rspamd_rcl_parse_struct_addr(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + rspamd_inet_addr_t **target; + const gchar *val; + gsize size; + + target = (rspamd_inet_addr_t **) (((gchar *) pd->user_struct) + pd->offset); + + if (ucl_object_type(obj) == UCL_STRING) { + val = ucl_object_tolstring(obj, &size); + + if (!rspamd_parse_inet_address(target, val, size, + RSPAMD_INET_ADDRESS_PARSE_DEFAULT)) { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot parse inet address: %s", val); + return FALSE; + } + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot convert %s to inet address in option %s", + ucl_object_type_to_string(ucl_object_type(obj)), + ucl_object_key(obj)); + return FALSE; + } + + return TRUE; +} + +gboolean +rspamd_rcl_parse_struct_mime_addr(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + auto *pd = (struct rspamd_rcl_struct_parser *) ud; + GPtrArray **target, *tmp_addr = nullptr; + const gchar *val; + ucl_object_iter_t it; + const ucl_object_t *cur; + + target = (GPtrArray **) (((gchar *) pd->user_struct) + pd->offset); + it = ucl_object_iterate_new(obj); + + while ((cur = ucl_object_iterate_safe(it, true)) != nullptr) { + if (ucl_object_type(cur) == 
UCL_STRING) { + val = ucl_object_tostring(obj); + tmp_addr = rspamd_email_address_from_mime(pool, val, + strlen(val), tmp_addr, -1); + } + else { + g_set_error(err, + CFG_RCL_ERROR, + EINVAL, + "cannot get inet address from ucl object in %s", + ucl_object_key(obj)); + ucl_object_iterate_free(it); + + return FALSE; + } + } + + ucl_object_iterate_free(it); + *target = tmp_addr; + + return TRUE; +} + +void rspamd_rcl_register_worker_option(struct rspamd_config *cfg, + GQuark type, + const gchar *name, + rspamd_rcl_default_handler_t handler, + gpointer target, + glong offset, + gint flags, + const gchar *doc_string) +{ + auto parser_it = cfg->rcl_top_section->workers_parser.try_emplace(type, rspamd_worker_cfg_parser{}); + auto &parser = parser_it.first->second; + auto handler_it = parser.parsers.try_emplace(std::make_pair(std::string{name}, target), rspamd_worker_param_parser{}); + + if (!handler_it.second) { + msg_warn_config( + "handler for parameter %s is already registered for worker type %s", + name, + g_quark_to_string(type)); + return; + } + + auto &nhandler = handler_it.first->second; + nhandler.parser.flags = flags; + nhandler.parser.offset = offset; + nhandler.parser.user_struct = target; + nhandler.handler = handler; + + const auto *doc_workers = ucl_object_lookup(cfg->doc_strings, "workers"); + + if (doc_workers == nullptr) { + auto *doc_obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(cfg->doc_strings, doc_obj, "workers", 0, false); + doc_workers = doc_obj; + } + + const auto *doc_target = ucl_object_lookup(doc_workers, g_quark_to_string(type)); + + if (doc_target == nullptr) { + auto *doc_obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key((ucl_object_t *) doc_workers, doc_obj, + g_quark_to_string(type), 0, true); + doc_target = doc_obj; + } + + rspamd_rcl_add_doc_obj((ucl_object_t *) doc_target, + doc_string, + name, + UCL_NULL, + handler, + flags, + nullptr, + 0); +} + +/* Checksum functions */ +static int 
+rspamd_rcl_emitter_append_c(unsigned char c, size_t nchars, void *ud) +{ + auto *hs = (rspamd_cryptobox_hash_state_t *) ud; + guint64 d[2]; + + d[0] = nchars; + d[1] = c; + + rspamd_cryptobox_hash_update(hs, (const guchar *) d, sizeof(d)); + + return 0; +} + +static int +rspamd_rcl_emitter_append_len(unsigned const char *str, size_t len, void *ud) +{ + auto *hs = (rspamd_cryptobox_hash_state_t *) ud; + + rspamd_cryptobox_hash_update(hs, str, len); + + return 0; +} +static int +rspamd_rcl_emitter_append_int(int64_t elt, void *ud) +{ + auto *hs = (rspamd_cryptobox_hash_state_t *) ud; + + rspamd_cryptobox_hash_update(hs, (const guchar *) &elt, sizeof(elt)); + + return 0; +} + +static int +rspamd_rcl_emitter_append_double(double elt, void *ud) +{ + auto *hs = (rspamd_cryptobox_hash_state_t *) ud; + + rspamd_cryptobox_hash_update(hs, (const guchar *) &elt, sizeof(elt)); + + return 0; +} + +void rspamd_rcl_sections_free(struct rspamd_rcl_sections_map *sections) +{ + delete sections; +} + +/** + * Calls for an external lua function to apply potential config transformations + * if needed. This function can change the cfg->rcl_obj. + * + * Example of transformation function: + * + * function(obj) + * if obj.something == 'foo' then + * obj.something = "bla" + * return true, obj + * end + * + * return false, nil + * end + * + * If function returns 'false' then rcl_obj is not touched. Otherwise, + * it is changed, then rcl_obj is imported from lua. Old config is dereferenced. 
+ * @param cfg + */ +void rspamd_rcl_maybe_apply_lua_transform(struct rspamd_config *cfg) +{ + auto *L = RSPAMD_LUA_CFG_STATE(cfg); + static const char *transform_script = "lua_cfg_transform"; + + g_assert(L != nullptr); + + if (!rspamd_lua_require_function(L, transform_script, nullptr)) { + /* No function defined */ + msg_warn_config("cannot execute lua script %s: %s", + transform_script, lua_tostring(L, -1)); + + return; + } + + lua_pushcfunction(L, &rspamd_lua_traceback); + auto err_idx = lua_gettop(L); + + /* Push function */ + lua_pushvalue(L, -2); + + /* Push the existing config */ + ucl_object_push_lua(L, cfg->cfg_ucl_obj, true); + + if (auto ret = lua_pcall(L, 1, 2, err_idx); ret != 0) { + msg_err("call to rspamadm lua script failed (%d): %s", ret, + lua_tostring(L, -1)); + lua_settop(L, 0); + + return; + } + + if (lua_toboolean(L, -2) && lua_type(L, -1) == LUA_TTABLE) { + ucl_object_t *old_cfg = cfg->cfg_ucl_obj; + + msg_info_config("configuration has been transformed in Lua"); + cfg->cfg_ucl_obj = ucl_object_lua_import(L, -1); + ucl_object_unref(old_cfg); + } + + /* error function */ + lua_settop(L, 0); +} + +static bool +rspamd_rcl_decrypt_handler(struct ucl_parser *parser, + const unsigned char *source, size_t source_len, + unsigned char **destination, size_t *dest_len, + void *user_data) +{ + GError *err = nullptr; + auto *kp = (struct rspamd_cryptobox_keypair *) user_data; + + if (!rspamd_keypair_decrypt(kp, source, source_len, + destination, dest_len, &err)) { + msg_err("cannot decrypt file: %e", err); + g_error_free(err); + + return false; + } + + return true; +} + +static bool +rspamd_rcl_jinja_handler(struct ucl_parser *parser, + const unsigned char *source, size_t source_len, + unsigned char **destination, size_t *dest_len, + void *user_data) +{ + auto *cfg = (struct rspamd_config *) user_data; + auto *L = RSPAMD_LUA_CFG_STATE(cfg); + + lua_pushcfunction(L, &rspamd_lua_traceback); + auto err_idx = lua_gettop(L); + + /* Obtain function */ + if 
(!rspamd_lua_require_function(L, "lua_util", "jinja_template")) { + msg_err_config("cannot require lua_util.jinja_template"); + lua_settop(L, err_idx - 1); + + return false; + } + + lua_pushlstring(L, (const char *) source, source_len); + lua_getglobal(L, "rspamd_env"); + lua_pushboolean(L, false); + + if (lua_pcall(L, 3, 1, err_idx) != 0) { + msg_err_config("cannot call lua jinja_template script: %s", + lua_tostring(L, -1)); + lua_settop(L, err_idx - 1); + + return false; + } + + if (lua_type(L, -1) == LUA_TSTRING) { + const char *ndata; + gsize nsize; + + ndata = lua_tolstring(L, -1, &nsize); + *destination = (unsigned char *) UCL_ALLOC(nsize); + memcpy(*destination, ndata, nsize); + *dest_len = nsize; + } + else { + msg_err_config("invalid return type when templating jinja %s", + lua_typename(L, lua_type(L, -1))); + lua_settop(L, err_idx - 1); + + return false; + } + + lua_settop(L, err_idx - 1); + + return true; +} + +static void +rspamd_rcl_decrypt_free(unsigned char *data, size_t len, void *user_data) +{ + g_free(data); +} + +void rspamd_config_calculate_cksum(struct rspamd_config *cfg) +{ + rspamd_cryptobox_hash_state_t hs; + unsigned char cksumbuf[rspamd_cryptobox_HASHBYTES]; + struct ucl_emitter_functions f; + + /* Calculate checksum */ + rspamd_cryptobox_hash_init(&hs, nullptr, 0); + f.ucl_emitter_append_character = rspamd_rcl_emitter_append_c; + f.ucl_emitter_append_double = rspamd_rcl_emitter_append_double; + f.ucl_emitter_append_int = rspamd_rcl_emitter_append_int; + f.ucl_emitter_append_len = rspamd_rcl_emitter_append_len; + f.ucl_emitter_free_func = nullptr; + f.ud = &hs; + ucl_object_emit_full(cfg->cfg_ucl_obj, UCL_EMIT_MSGPACK, + &f, cfg->config_comments); + rspamd_cryptobox_hash_final(&hs, cksumbuf); + cfg->checksum = rspamd_encode_base32(cksumbuf, sizeof(cksumbuf), RSPAMD_BASE32_DEFAULT); + /* Also change the tag of cfg pool to be equal to the checksum */ + rspamd_strlcpy(cfg->cfg_pool->tag.uid, cfg->checksum, + 
MIN(sizeof(cfg->cfg_pool->tag.uid), strlen(cfg->checksum))); +} + +gboolean +rspamd_config_parse_ucl(struct rspamd_config *cfg, + const gchar *filename, + GHashTable *vars, + ucl_include_trace_func_t inc_trace, + void *trace_data, + gboolean skip_jinja, + GError **err) +{ + struct rspamd_cryptobox_keypair *decrypt_keypair = nullptr; + auto cfg_file_maybe = rspamd::util::raii_mmaped_file::mmap_shared(filename, O_RDONLY, PROT_READ, 0); + + if (!cfg_file_maybe) { + g_set_error(err, cfg_rcl_error_quark(), errno, + "cannot open %s: %*s", filename, (int) cfg_file_maybe.error().error_message.size(), + cfg_file_maybe.error().error_message.data()); + return FALSE; + } + + auto &cfg_file = cfg_file_maybe.value(); + + /* Try to load keyfile if available */ + rspamd::util::raii_file::open(fmt::format("{}.key", filename), O_RDONLY).map([&](const auto &keyfile) { + auto *kp_parser = ucl_parser_new(0); + if (ucl_parser_add_fd(kp_parser, keyfile.get_fd())) { + auto *kp_obj = ucl_parser_get_object(kp_parser); + + g_assert(kp_obj != nullptr); + decrypt_keypair = rspamd_keypair_from_ucl(kp_obj); + + if (decrypt_keypair == nullptr) { + msg_err_config_forced("cannot load keypair from %s.key: invalid keypair", + filename); + } + else { + /* Add decryption support to UCL */ + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) rspamd_keypair_unref, + decrypt_keypair); + } + + ucl_object_unref(kp_obj); + } + else { + msg_err_config_forced("cannot load keypair from %s.key: %s", + filename, ucl_parser_get_error(kp_parser)); + } + ucl_parser_free(kp_parser); + }); + + auto parser = std::shared_ptr<ucl_parser>(ucl_parser_new(UCL_PARSER_SAVE_COMMENTS), ucl_parser_free); + rspamd_ucl_add_conf_variables(parser.get(), vars); + rspamd_ucl_add_conf_macros(parser.get(), cfg); + ucl_parser_set_filevars(parser.get(), filename, true); + + if (inc_trace) { + ucl_parser_set_include_tracer(parser.get(), inc_trace, trace_data); + } + + if (decrypt_keypair) { + auto *decrypt_handler = 
rspamd_mempool_alloc0_type(cfg->cfg_pool, + struct ucl_parser_special_handler); + decrypt_handler->user_data = decrypt_keypair; + decrypt_handler->magic = encrypted_magic; + decrypt_handler->magic_len = sizeof(encrypted_magic); + decrypt_handler->handler = rspamd_rcl_decrypt_handler; + decrypt_handler->free_function = rspamd_rcl_decrypt_free; + + ucl_parser_add_special_handler(parser.get(), decrypt_handler); + } + + if (!skip_jinja) { + auto *jinja_handler = rspamd_mempool_alloc0_type(cfg->cfg_pool, + struct ucl_parser_special_handler); + jinja_handler->user_data = cfg; + jinja_handler->flags = UCL_SPECIAL_HANDLER_PREPROCESS_ALL; + jinja_handler->handler = rspamd_rcl_jinja_handler; + + ucl_parser_add_special_handler(parser.get(), jinja_handler); + } + + if (!ucl_parser_add_chunk(parser.get(), (unsigned char *) cfg_file.get_map(), cfg_file.get_size())) { + g_set_error(err, cfg_rcl_error_quark(), errno, + "ucl parser error: %s", ucl_parser_get_error(parser.get())); + + return FALSE; + } + + cfg->cfg_ucl_obj = ucl_parser_get_object(parser.get()); + cfg->config_comments = ucl_object_ref(ucl_parser_get_comments(parser.get())); + + return TRUE; +} + +gboolean +rspamd_config_read(struct rspamd_config *cfg, + const gchar *filename, + rspamd_rcl_section_fin_t logger_fin, + gpointer logger_ud, + GHashTable *vars, + gboolean skip_jinja, + gchar **lua_env) +{ + GError *err = nullptr; + + rspamd_lua_set_path(RSPAMD_LUA_CFG_STATE(cfg), nullptr, vars); + + if (!rspamd_lua_set_env(RSPAMD_LUA_CFG_STATE(cfg), vars, lua_env, &err)) { + msg_err_config_forced("failed to set up environment: %e", err); + g_error_free(err); + + return FALSE; + } + + if (!rspamd_config_parse_ucl(cfg, filename, vars, nullptr, nullptr, skip_jinja, &err)) { + msg_err_config_forced("failed to load config: %e", err); + g_error_free(err); + + return FALSE; + } + + auto *top = rspamd_rcl_config_init(cfg, nullptr); + cfg->rcl_top_section = top; + /* Add new paths if defined in options */ + 
rspamd_lua_set_path(RSPAMD_LUA_CFG_STATE(cfg), cfg->cfg_ucl_obj, vars); + rspamd_lua_set_globals(cfg, RSPAMD_LUA_CFG_STATE(cfg)); + rspamd_mempool_add_destructor(cfg->cfg_pool, (rspamd_mempool_destruct_t) rspamd_rcl_sections_free, top); + err = nullptr; + + /* Pre-init logging if possible */ + if (logger_fin != nullptr) { + auto logging_section_maybe = rspamd::find_map(top->sections, "logging"); + + if (logging_section_maybe) { + const auto *logger_obj = ucl_object_lookup_any(cfg->cfg_ucl_obj, "logging", + "logger", nullptr); + + if (logger_obj == nullptr) { + logger_fin(cfg->cfg_pool, logger_ud); + } + else { + if (!rspamd_rcl_process_section(cfg, *logging_section_maybe.value().get().get(), cfg, + logger_obj, cfg->cfg_pool, &err)) { + msg_err_config_forced("cannot init logger: %e", err); + g_error_free(err); + + return FALSE; + } + else { + logger_fin(cfg->cfg_pool, logger_ud); + } + + /* Init lua logging */ + lua_pushcfunction(RSPAMD_LUA_CFG_STATE(cfg), &rspamd_lua_traceback); + auto err_idx = lua_gettop(RSPAMD_LUA_CFG_STATE(cfg)); + + /* Obtain function */ + if (!rspamd_lua_require_function(RSPAMD_LUA_CFG_STATE(cfg), "lua_util", + "init_debug_logging")) { + msg_err_config("cannot require lua_util.init_debug_logging"); + lua_settop(RSPAMD_LUA_CFG_STATE(cfg), err_idx - 1); + + return FALSE; + } + + void *pcfg = lua_newuserdata(RSPAMD_LUA_CFG_STATE(cfg), sizeof(void *)); + memcpy(pcfg, &cfg, sizeof(void *)); + rspamd_lua_setclass(RSPAMD_LUA_CFG_STATE(cfg), "rspamd{config}", -1); + + if (lua_pcall(RSPAMD_LUA_CFG_STATE(cfg), 1, 0, err_idx) != 0) { + msg_err_config("cannot call lua init_debug_logging script: %s", + lua_tostring(RSPAMD_LUA_CFG_STATE(cfg), -1)); + lua_settop(RSPAMD_LUA_CFG_STATE(cfg), err_idx - 1); + + return FALSE; + } + + lua_settop(RSPAMD_LUA_CFG_STATE(cfg), err_idx - 1); + } + } + } + + /* Transform config if needed */ + rspamd_rcl_maybe_apply_lua_transform(cfg); + rspamd_config_calculate_cksum(cfg); + + if (!rspamd_rcl_parse(top, cfg, cfg, 
cfg->cfg_pool, cfg->cfg_ucl_obj, &err)) { + msg_err_config("rcl parse error: %e", err); + + if (err) { + g_error_free(err); + } + + return FALSE; + } + + cfg->lang_det = rspamd_language_detector_init(cfg); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) rspamd_language_detector_unref, + cfg->lang_det); + + return TRUE; +} + +static void +rspamd_rcl_doc_obj_from_handler(ucl_object_t *doc_obj, + rspamd_rcl_default_handler_t handler, + gint flags) +{ + auto has_example = ucl_object_lookup(doc_obj, "example") != nullptr; + auto has_type = ucl_object_lookup(doc_obj, "type") != nullptr; + + if (handler == rspamd_rcl_parse_struct_string) { + if (!has_type) { + ucl_object_insert_key(doc_obj, ucl_object_fromstring("string"), + "type", 0, false); + } + } + else if (handler == rspamd_rcl_parse_struct_integer) { + auto *type = "int"; + + if (flags & RSPAMD_CL_FLAG_INT_16) { + type = "int16"; + } + else if (flags & RSPAMD_CL_FLAG_INT_32) { + type = "int32"; + } + else if (flags & RSPAMD_CL_FLAG_INT_64) { + type = "int64"; + } + else if (flags & RSPAMD_CL_FLAG_INT_SIZE) { + type = "size"; + } + else if (flags & RSPAMD_CL_FLAG_UINT) { + type = "uint"; + } + + if (!has_type) { + ucl_object_insert_key(doc_obj, ucl_object_fromstring(type), + "type", 0, false); + } + } + else if (handler == rspamd_rcl_parse_struct_double) { + if (!has_type) { + ucl_object_insert_key(doc_obj, ucl_object_fromstring("double"), + "type", 0, false); + } + } + else if (handler == rspamd_rcl_parse_struct_time) { + auto *type = "time"; + + if (!has_type) { + ucl_object_insert_key(doc_obj, ucl_object_fromstring(type), + "type", 0, false); + } + } + else if (handler == rspamd_rcl_parse_struct_string_list) { + if (!has_type) { + ucl_object_insert_key(doc_obj, ucl_object_fromstring("string list"), + "type", 0, false); + } + if (!has_example) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring_common("param = \"str1, str2, str3\" OR " + "param = [\"str1\", \"str2\", \"str3\"]", 
+ 0, static_cast<ucl_string_flags>(0)), + "example", + 0, + false); + } + } + else if (handler == rspamd_rcl_parse_struct_boolean) { + if (!has_type) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring("bool"), + "type", + 0, + false); + } + } + else if (handler == rspamd_rcl_parse_struct_keypair) { + if (!has_type) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring("keypair"), + "type", + 0, + false); + } + if (!has_example) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring("keypair { " + "pubkey = <base32_string>;" + " privkey = <base32_string>; " + "}"), + "example", + 0, + false); + } + } + else if (handler == rspamd_rcl_parse_struct_addr) { + if (!has_type) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring("socket address"), + "type", + 0, + false); + } + } + else if (handler == rspamd_rcl_parse_struct_mime_addr) { + if (!has_type) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring("email address"), + "type", + 0, + false); + } + } +} + +ucl_object_t * +rspamd_rcl_add_doc_obj(ucl_object_t *doc_target, + const char *doc_string, + const char *doc_name, + ucl_type_t type, + rspamd_rcl_default_handler_t handler, + gint flags, + const char *default_value, + gboolean required) +{ + ucl_object_t *doc_obj; + + if (doc_target == nullptr || doc_name == nullptr) { + return nullptr; + } + + doc_obj = ucl_object_typed_new(UCL_OBJECT); + + /* Insert doc string itself */ + if (doc_string) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring_common(doc_string, 0, static_cast<ucl_string_flags>(0)), + "data", 0, false); + } + else { + ucl_object_insert_key(doc_obj, ucl_object_fromstring("undocumented"), + "data", 0, false); + } + + if (type != UCL_NULL) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring(ucl_object_type_to_string(type)), + "type", 0, false); + } + + rspamd_rcl_doc_obj_from_handler(doc_obj, handler, flags); + + ucl_object_insert_key(doc_obj, + ucl_object_frombool(required), + "required", 0, false); + + if 
(default_value) { + ucl_object_insert_key(doc_obj, + ucl_object_fromstring_common(default_value, 0, static_cast<ucl_string_flags>(0)), + "default", 0, false); + } + + ucl_object_insert_key(doc_target, doc_obj, doc_name, 0, true); + + return doc_obj; +} + +ucl_object_t * +rspamd_rcl_add_doc_by_path(struct rspamd_config *cfg, + const gchar *doc_path, + const char *doc_string, + const char *doc_name, + ucl_type_t type, + rspamd_rcl_default_handler_t handler, + gint flags, + const char *default_value, + gboolean required) +{ + const auto *cur = cfg->doc_strings; + + if (doc_path == nullptr) { + /* Assume top object */ + return rspamd_rcl_add_doc_obj(cfg->doc_strings, + doc_string, + doc_name, + type, + handler, + flags, + default_value, + required); + } + else { + const auto *found = ucl_object_lookup_path(cfg->doc_strings, doc_path); + + if (found != nullptr) { + return rspamd_rcl_add_doc_obj((ucl_object_t *) found, + doc_string, + doc_name, + type, + handler, + flags, + default_value, + required); + } + + /* Otherwise we need to insert all components of the path */ + rspamd::string_foreach_delim(doc_path, ".", [&](const std::string_view &elt) { + if (ucl_object_type(cur) != UCL_OBJECT) { + msg_err_config("Bad path while lookup for '%s' at %*s", + doc_path, (int) elt.size(), elt.data()); + } + const auto *found = ucl_object_lookup_len(cur, elt.data(), elt.size()); + if (found == nullptr) { + auto *obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key((ucl_object_t *) cur, + obj, + elt.data(), + elt.size(), + true); + cur = obj; + } + else { + cur = found; + } + }); + } + + return rspamd_rcl_add_doc_obj(ucl_object_ref(cur), + doc_string, + doc_name, + type, + handler, + flags, + default_value, + required); +} + +static void +rspamd_rcl_add_doc_from_comments(struct rspamd_config *cfg, + ucl_object_t *top_doc, const ucl_object_t *obj, + const ucl_object_t *comments, gboolean is_top) +{ + ucl_object_iter_t it = nullptr; + const ucl_object_t *cur, *cmt; + 
ucl_object_t *cur_doc; + + if (ucl_object_type(obj) == UCL_OBJECT) { + while ((cur = ucl_object_iterate(obj, &it, true)) != nullptr) { + cur_doc = nullptr; + + if ((cmt = ucl_comments_find(comments, cur)) != nullptr) { + cur_doc = rspamd_rcl_add_doc_obj(top_doc, + ucl_object_tostring(cmt), ucl_object_key(cur), + ucl_object_type(cur), nullptr, 0, nullptr, FALSE); + } + + if (ucl_object_type(cur) == UCL_OBJECT) { + if (cur_doc) { + rspamd_rcl_add_doc_from_comments(cfg, cur_doc, cur, + comments, + FALSE); + } + else { + rspamd_rcl_add_doc_from_comments(cfg, top_doc, cur, + comments, + FALSE); + } + } + } + } + else if (!is_top) { + if ((cmt = ucl_comments_find(comments, obj)) != nullptr) { + rspamd_rcl_add_doc_obj(top_doc, + ucl_object_tostring(cmt), ucl_object_key(obj), + ucl_object_type(obj), nullptr, 0, nullptr, FALSE); + } + } +} + +ucl_object_t * +rspamd_rcl_add_doc_by_example(struct rspamd_config *cfg, + const gchar *root_path, + const gchar *doc_string, + const gchar *doc_name, + const gchar *example_data, gsize example_len) +{ + auto parser = std::shared_ptr<ucl_parser>(ucl_parser_new(UCL_PARSER_NO_FILEVARS | UCL_PARSER_SAVE_COMMENTS), ucl_parser_free); + + if (!ucl_parser_add_chunk(parser.get(), reinterpret_cast<const unsigned char *>(example_data), example_len)) { + msg_err_config("cannot parse example: %s", + ucl_parser_get_error(parser.get())); + + return nullptr; + } + + auto *top = ucl_parser_get_object(parser.get()); + const auto *comments = ucl_parser_get_comments(parser.get()); + + /* Add top object */ + auto *top_doc = rspamd_rcl_add_doc_by_path(cfg, root_path, doc_string, + doc_name, ucl_object_type(top), nullptr, 0, nullptr, FALSE); + ucl_object_insert_key(top_doc, + ucl_object_fromstring_common(example_data, example_len, static_cast<ucl_string_flags>(0)), + "example", 0, false); + + rspamd_rcl_add_doc_from_comments(cfg, top_doc, top, comments, TRUE); + + return top_doc; +} diff --git a/src/libserver/cfg_rcl.h b/src/libserver/cfg_rcl.h new file 
mode 100644 index 0000000..766c55e --- /dev/null +++ b/src/libserver/cfg_rcl.h @@ -0,0 +1,476 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef CFG_RCL_H_ +#define CFG_RCL_H_ + +#include "config.h" +#include "cfg_file.h" +#include "ucl.h" +#include "mem_pool.h" + +#define CFG_RCL_ERROR cfg_rcl_error_quark() +static inline GQuark +cfg_rcl_error_quark(void) +{ + return g_quark_from_static_string("cfg-rcl-error-quark"); +} + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_rcl_section; +struct rspamd_rcl_sections_map; +struct rspamd_config; +struct rspamd_rcl_default_handler_data; + +enum rspamd_rcl_flag { + RSPAMD_CL_FLAG_TIME_FLOAT = 0x1 << 0, + RSPAMD_CL_FLAG_TIME_TIMEVAL = 0x1 << 1, + RSPAMD_CL_FLAG_TIME_TIMESPEC = 0x1 << 2, + RSPAMD_CL_FLAG_TIME_INTEGER = 0x1 << 3, + RSPAMD_CL_FLAG_TIME_UINT_32 = 0x1 << 4, + RSPAMD_CL_FLAG_INT_16 = 0x1 << 5, + RSPAMD_CL_FLAG_INT_32 = 0x1 << 6, + RSPAMD_CL_FLAG_INT_64 = 0x1 << 7, + RSPAMD_CL_FLAG_UINT = 0x1 << 8, + RSPAMD_CL_FLAG_INT_SIZE = 0x1 << 9, + RSPAMD_CL_FLAG_STRING_PATH = 0x1 << 10, + RSPAMD_CL_FLAG_BOOLEAN_INVERSE = 0x1 << 11, + RSPAMD_CL_FLAG_STRING_LIST_HASH = 0x1 << 12, + RSPAMD_CL_FLAG_MULTIPLE = 0x1 << 13, + RSPAMD_CL_FLAG_SIGNKEY = 0x1 << 14, + RSPAMD_CL_FLAG_NISTKEY = 0x1 << 15, +}; + +struct rspamd_rcl_struct_parser { + struct rspamd_config *cfg; + gpointer user_struct; + goffset offset; + int flags; /* enum rspamd_rcl_flag */ +}; + + +/** + 
* Common handler type + * @param cfg configuration + * @param obj object to parse + * @param ud user data (depends on section) + * @param err error object + * @return TRUE if a section has been parsed + */ +typedef gboolean (*rspamd_rcl_handler_t)(rspamd_mempool_t *pool, + const ucl_object_t *obj, + const gchar *key, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +typedef gboolean (*rspamd_rcl_default_handler_t)(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * A handler type that is called at the end of section parsing + * @param cfg configuration + * @param ud user data + */ +typedef void (*rspamd_rcl_section_fin_t)(rspamd_mempool_t *pool, gpointer ud); + +/** + * Add a default handler for a section + * @param section section pointer + * @param name name of param + * @param handler handler of param + * @param offset offset in a structure + * @param flags flags for the parser + * @return newly created structure + */ +struct rspamd_rcl_default_handler_data *rspamd_rcl_add_default_handler( + struct rspamd_rcl_section *section, + const gchar *name, + rspamd_rcl_default_handler_t handler, + goffset offset, + gint flags, + const gchar *doc_string); + +/** + * Add new section to the configuration + * @param top top section + * @param name the name of the section + * @param key_attr name of the attribute that should be used as key attribute + * @param handler handler function for all attributes + * @param type type of object handled by a handler + * @param required whether at least one of these sections is required + * @param strict_type turn on strict check for types for this section + * @return newly created structure + */ +struct rspamd_rcl_section *rspamd_rcl_add_section( + struct rspamd_rcl_sections_map **top, + struct rspamd_rcl_section *parent_section, + const gchar *name, + const gchar *key_attr, + rspamd_rcl_handler_t handler, + enum ucl_type type, + 
gboolean required, + gboolean strict_type); + +struct rspamd_rcl_section *rspamd_rcl_add_section_doc( + struct rspamd_rcl_sections_map **top, + struct rspamd_rcl_section *parent_section, + const gchar *name, const gchar *key_attr, + rspamd_rcl_handler_t handler, + enum ucl_type type, gboolean required, + gboolean strict_type, + ucl_object_t *doc_target, + const gchar *doc_string); + +/** + * Init common sections known to rspamd + * @return top section + */ +struct rspamd_rcl_sections_map *rspamd_rcl_config_init(struct rspamd_config *cfg, + GHashTable *skip_sections); + +/** + * Parse configuration + * @param top top section + * @param cfg rspamd configuration + * @param ptr pointer to the target + * @param pool pool object + * @param obj ucl object to parse + * @param err error pointer + * @return + */ +gboolean rspamd_rcl_parse(struct rspamd_rcl_sections_map *top, + struct rspamd_config *cfg, + gpointer ptr, rspamd_mempool_t *pool, + const ucl_object_t *obj, GError **err); + +/** + * Here is a section of common handlers that accepts rcl_struct_parser + * which itself contains a struct pointer and the offset of a member in a + * specific structure + */ + +/** + * Parse a string field of a structure + * @param cfg config pointer + * @param obj object to parse + * @param ud struct_parser structure + * @param section the current section + * @param err error pointer + * @return TRUE if a string value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_string(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse an integer field of a structure + * @param cfg config pointer + * @param obj object to parse + * @param ud struct_parser structure + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_integer(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer 
ud, + struct rspamd_rcl_section *section, + GError **err); + + +/** + * Parse a floating point (double) field of a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_double(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a time field of a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_time(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a string list field of a structure presented by a GList* object + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_string_list(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a boolean field of a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_boolean(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a keypair field of 
a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_keypair(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a pubkey field of a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_pubkey(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse an inet address field of a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_addr(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a gmime inet address field of a structure + * @param pool memory pool + * @param obj object to parse + * @param ud struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_mime_addr(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + +/** + * Parse a raw ucl object + * @param pool memory pool + * @param obj object to parse + * @param ud 
struct_parser structure (flags mean the exact structure used) + * @param section the current section + * @param err error pointer + * @return TRUE if a value has been successfully parsed + */ +gboolean rspamd_rcl_parse_struct_ucl(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err); + + +/** + * Utility functions + */ + +/** + * Register new parser for a worker type of an option with the specified name + * @param cfg config structure + * @param type type of worker (GQuark) + * @param name name of option + * @param handler handler of option + * @param target opaque target structure, note it **MUST** be worker ctx due to some reasons I don't really remember + * @param offset offset inside a structure + */ +void rspamd_rcl_register_worker_option(struct rspamd_config *cfg, + GQuark type, + const gchar *name, + rspamd_rcl_default_handler_t handler, + gpointer target, + glong offset, + gint flags, + const gchar *doc_string); + +/** + * Adds new documentation object to the configuration + * @param doc_target target object where to insert documentation (top object is used if this is NULL) + * @param doc_object documentation object to insert + */ +ucl_object_t *rspamd_rcl_add_doc_obj(ucl_object_t *doc_target, + const char *doc_string, + const char *doc_name, + ucl_type_t type, + rspamd_rcl_default_handler_t handler, + gint flags, + const char *default_value, + gboolean required); + +/** + * Adds new documentation option specified by path `doc_path` that should be + * split by dots + */ +ucl_object_t *rspamd_rcl_add_doc_by_path(struct rspamd_config *cfg, + const gchar *doc_path, + const char *doc_string, + const char *doc_name, + ucl_type_t type, + rspamd_rcl_default_handler_t handler, + gint flags, + const char *default_value, + gboolean required); + + +/** + * Parses example and adds documentation according to the example: + * + * ``` + * section { + * param1 = value; # explanation + * param2 = value; # 
explanation + * } + * ``` + * + * will produce the following documentation strings: + * section -> + * section.param1 : explanation + * section.param2 : explanation + * + * @param cfg + * @param root_path + * @param example_data + * @param example_len + * @return + */ +ucl_object_t *rspamd_rcl_add_doc_by_example(struct rspamd_config *cfg, + const gchar *root_path, + const gchar *doc_string, + const gchar *doc_name, + const gchar *example_data, gsize example_len); + +/** + * Add lua modules path + * @param cfg + * @param path + * @param err + * @return + */ +gboolean rspamd_rcl_add_lua_plugins_path(struct rspamd_rcl_sections_map *sections, + struct rspamd_config *cfg, + const gchar *path, + gboolean main_path, + GError **err); + + +/** + * Calls for an external lua function to apply potential config transformations + * if needed. This function can change the cfg->rcl_obj. + * + * Example of transformation function: + * + * function(obj) + * if obj.something == 'foo' then + * obj.something = "bla" + * return true, obj + * end + * + * return false, nil + * end + * + * If function returns 'false' then rcl_obj is not touched. Otherwise, + * it is changed, then rcl_obj is imported from lua. Old config is dereferenced. 
+ * @param cfg + */ +void rspamd_rcl_maybe_apply_lua_transform(struct rspamd_config *cfg); +void rspamd_rcl_sections_free(struct rspamd_rcl_sections_map *sections); + +void rspamd_config_calculate_cksum(struct rspamd_config *cfg); + +/* + * Read configuration file + */ +gboolean rspamd_config_parse_ucl(struct rspamd_config *cfg, + const gchar *filename, + GHashTable *vars, + ucl_include_trace_func_t inc_trace, + void *trace_data, + gboolean skip_jinja, + GError **err); +gboolean rspamd_config_read(struct rspamd_config *cfg, + const gchar *filename, + rspamd_rcl_section_fin_t logger_fin, + gpointer logger_ud, + GHashTable *vars, + gboolean skip_jinja, + gchar **lua_env); + +#ifdef __cplusplus +} +#endif + +#endif /* CFG_RCL_H_ */ diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx new file mode 100644 index 0000000..3a94b47 --- /dev/null +++ b/src/libserver/cfg_utils.cxx @@ -0,0 +1,2955 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "config.h" + +#include "lua/lua_common.h" +#include "lua/lua_thread_pool.h" + +#include "cfg_file.h" +#include "rspamd.h" +#include "cfg_file_private.h" + +#include "maps/map.h" +#include "maps/map_helpers.h" +#include "maps/map_private.h" +#include "dynamic_cfg.h" +#include "utlist.h" +#include "stat_api.h" +#include "unix-std.h" +#include "libutil/multipattern.h" +#include "monitored.h" +#include "ref.h" +#include "cryptobox.h" +#include "ssl_util.h" +#include "contrib/libottery/ottery.h" +#include "contrib/fastutf8/fastutf8.h" + +#ifdef SYS_ZSTD +#include "zstd.h" +#else +#define ZSTD_STATIC_LINKING_ONLY +#include "contrib/zstd/zstd.h" +#endif + +#ifdef HAVE_OPENSSL +#include <openssl/rand.h> +#include <openssl/err.h> +#include <openssl/evp.h> +#include <openssl/ssl.h> +#include <openssl/conf.h> +#endif +#ifdef HAVE_LOCALE_H +#include <locale.h> +#endif +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/resource.h> +#endif +#include <math.h> +#include "libserver/composites/composites.h" + +#include "blas-config.h" + +#include <string> +#include <string_view> +#include <vector> +#include "fmt/core.h" +#include "cxx/util.hxx" +#include "frozen/unordered_map.h" +#include "frozen/string.h" +#include "contrib/ankerl/unordered_dense.h" + +/* + * Default values used by rspamd_config_new(). + * Macros that expand to an expression are fully parenthesized so that they + * stay correct next to operators of higher precedence (e.g. `x / MACRO`). + */ +#define DEFAULT_SCORE 10.0 + +#define DEFAULT_RLIMIT_NOFILE 2048 +#define DEFAULT_RLIMIT_MAXCORE 0 +#define DEFAULT_MAP_TIMEOUT (60.0 * 5) +#define DEFAULT_MAP_FILE_WATCH_MULTIPLIER 1 +#define DEFAULT_MIN_WORD 0 +#define DEFAULT_MAX_WORD 40 +#define DEFAULT_WORDS_DECAY 600 +#define DEFAULT_MAX_MESSAGE (50 * 1024 * 1024) +#define DEFAULT_MAX_PIC (1 * 1024 * 1024) +#define DEFAULT_MAX_SHOTS 100 +#define DEFAULT_MAX_SESSIONS 100 +#define DEFAULT_MAX_WORKERS 4 +#define DEFAULT_MAX_HTML_SIZE (DEFAULT_MAX_MESSAGE / 5) /* 10 Mb */ +/* Timeout for task processing */ +#define DEFAULT_TASK_TIMEOUT 8.0 +#define DEFAULT_LUA_GC_STEP 200 +#define DEFAULT_LUA_GC_PAUSE 200 +#define DEFAULT_GC_MAXITERS 0 + +struct rspamd_ucl_map_cbdata { + struct 
rspamd_config *cfg; + std::string buf; + + explicit rspamd_ucl_map_cbdata(struct rspamd_config *cfg) + : cfg(cfg) + { + } +}; +static gchar *rspamd_ucl_read_cb(gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); +static void rspamd_ucl_fin_cb(struct map_cb_data *data, void **target); +static void rspamd_ucl_dtor_cb(struct map_cb_data *data); + +guint rspamd_config_log_id = (guint) -1; +RSPAMD_CONSTRUCTOR(rspamd_config_log_init) +{ + rspamd_config_log_id = rspamd_logger_add_debug_module("config"); +} + +struct rspamd_actions_list { + using action_ptr = std::shared_ptr<rspamd_action>; + std::vector<action_ptr> actions; + ankerl::unordered_dense::map<std::string_view, action_ptr> actions_by_name; + + explicit rspamd_actions_list() + { + actions.reserve(METRIC_ACTION_MAX + 2); + actions_by_name.reserve(METRIC_ACTION_MAX + 2); + } + + void add_action(action_ptr action) + { + actions.push_back(action); + actions_by_name[action->name] = action; + sort(); + } + + void sort() + { + std::sort(actions.begin(), actions.end(), [](const action_ptr &a1, const action_ptr &a2) -> bool { + if (!isnan(a1->threshold) && !isnan(a2->threshold)) { + return a1->threshold < a2->threshold; + } + + if (isnan(a1->threshold) && isnan(a2->threshold)) { + return false; + } + else if (isnan(a1->threshold)) { + return true; + } + + return false; + }); + } + + void clear() + { + actions.clear(); + actions_by_name.clear(); + } +}; + +#define RSPAMD_CFG_ACTIONS(cfg) (reinterpret_cast<rspamd_actions_list *>((cfg)->actions)) + +gboolean +rspamd_parse_bind_line(struct rspamd_config *cfg, + struct rspamd_worker_conf *cf, + const gchar *str) +{ + struct rspamd_worker_bind_conf *cnf; + const gchar *fdname; + gboolean ret = TRUE; + + if (str == nullptr) { + return FALSE; + } + + cnf = rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_worker_bind_conf); + + cnf->cnt = 1024; + cnf->bind_line = rspamd_mempool_strdup(cfg->cfg_pool, str); + + auto bind_line = 
std::string_view{cnf->bind_line}; + + if (bind_line.starts_with("systemd:")) { + /* The actual socket will be passed by systemd environment */ + fdname = str + sizeof("systemd:") - 1; + cnf->is_systemd = TRUE; + cnf->addrs = g_ptr_array_new_full(1, nullptr); + rspamd_mempool_add_destructor(cfg->cfg_pool, + rspamd_ptr_array_free_hard, cnf->addrs); + + if (fdname[0]) { + g_ptr_array_add(cnf->addrs, rspamd_mempool_strdup(cfg->cfg_pool, fdname)); + cnf->cnt = cnf->addrs->len; + cnf->name = rspamd_mempool_strdup(cfg->cfg_pool, str); + LL_PREPEND(cf->bind_conf, cnf); + } + else { + msg_err_config("cannot parse bind line: %s", str); + ret = FALSE; + } + } + else { + if (rspamd_parse_host_port_priority(str, &cnf->addrs, + nullptr, &cnf->name, DEFAULT_BIND_PORT, TRUE, cfg->cfg_pool) == RSPAMD_PARSE_ADDR_FAIL) { + msg_err_config("cannot parse bind line: %s", str); + ret = FALSE; + } + else { + cnf->cnt = cnf->addrs->len; + LL_PREPEND(cf->bind_conf, cnf); + } + } + + return ret; +} + +struct rspamd_config * +rspamd_config_new(enum rspamd_config_init_flags flags) +{ + struct rspamd_config *cfg; + rspamd_mempool_t *pool; + + pool = rspamd_mempool_new(8 * 1024 * 1024, "cfg", 0); + cfg = rspamd_mempool_alloc0_type(pool, struct rspamd_config); + /* Allocate larger pool for cfg */ + cfg->cfg_pool = pool; + cfg->dns_timeout = 1.0; + cfg->dns_retransmits = 5; + /* 16 sockets per DNS server */ + cfg->dns_io_per_server = 16; + cfg->unknown_weight = NAN; + + cfg->actions = (void *) new rspamd_actions_list(); + + /* Add all internal actions to keep compatibility */ + for (int i = METRIC_ACTION_REJECT; i < METRIC_ACTION_MAX; i++) { + + auto &&action = std::make_shared<rspamd_action>(); + action->threshold = NAN; + action->name = rspamd_mempool_strdup(cfg->cfg_pool, + rspamd_action_to_str(static_cast<rspamd_action_type>(i))); + action->action_type = static_cast<rspamd_action_type>(i); + + if (i == METRIC_ACTION_SOFT_REJECT) { + action->flags |= RSPAMD_ACTION_NO_THRESHOLD | 
RSPAMD_ACTION_HAM; + } + else if (i == METRIC_ACTION_GREYLIST) { + action->flags |= RSPAMD_ACTION_THRESHOLD_ONLY | RSPAMD_ACTION_HAM; + } + else if (i == METRIC_ACTION_NOACTION) { + action->flags |= RSPAMD_ACTION_HAM; + } + + RSPAMD_CFG_ACTIONS(cfg)->add_action(std::move(action)); + } + + /* Default timeout for task processing */ + cfg->task_timeout = DEFAULT_TASK_TIMEOUT; + + + rspamd_config_init_metric(cfg); + cfg->composites_manager = rspamd_composites_manager_create(cfg); + /* String-keyed hash tables; they are released in rspamd_config_free */ + cfg->classifiers_symbols = g_hash_table_new(rspamd_str_hash, + rspamd_str_equal); + cfg->cfg_params = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + cfg->debug_modules = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + cfg->explicit_modules = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + cfg->trusted_keys = g_hash_table_new(rspamd_str_hash, + rspamd_str_equal); + + cfg->map_timeout = DEFAULT_MAP_TIMEOUT; + cfg->map_file_watch_multiplier = DEFAULT_MAP_FILE_WATCH_MULTIPLIER; + + cfg->log_level = G_LOG_LEVEL_WARNING; + cfg->log_flags = RSPAMD_LOG_FLAG_DEFAULT; + + cfg->check_text_attachements = TRUE; + + cfg->dns_max_requests = 64; + cfg->history_rows = 200; + cfg->log_error_elts = 10; + cfg->log_error_elt_maxlen = 1000; + cfg->log_task_max_elts = 7; + cfg->cache_reload_time = 30.0; + cfg->max_lua_urls = 1024; + /* The overall URL limit is derived from the Lua URL limit */ + cfg->max_urls = cfg->max_lua_urls * 10; + cfg->max_recipients = 1024; + cfg->max_blas_threads = 1; + cfg->max_opts_len = 4096; + cfg->gtube_patterns_policy = RSPAMD_GTUBE_REJECT; + + /* Default log line */ + cfg->log_format_str = rspamd_mempool_strdup(cfg->cfg_pool, + "id: <$mid>,$if_qid{ qid: <$>,}$if_ip{ ip: $,}" + "$if_user{ user: $,}$if_smtp_from{ from: <$>,} (default: $is_spam " + "($action): [$scores] [$symbols_scores_params]), len: $len, time: $time_real, " + "dns req: $dns_req, digest: <$digest>" + "$if_smtp_rcpts{ rcpts: <$>, }$if_mime_rcpt{ mime_rcpt: <$>, }"); + /* Allow non-mime input by default */ + cfg->allow_raw_input = TRUE; + /* Default maximum words processed */ + 
cfg->words_decay = DEFAULT_WORDS_DECAY; + cfg->min_word_len = DEFAULT_MIN_WORD; + cfg->max_word_len = DEFAULT_MAX_WORD; + cfg->max_html_len = DEFAULT_MAX_HTML_SIZE; + + /* GC limits */ + cfg->lua_gc_pause = DEFAULT_LUA_GC_PAUSE; + cfg->lua_gc_step = DEFAULT_LUA_GC_STEP; + cfg->full_gc_iters = DEFAULT_GC_MAXITERS; + + /* Default hyperscan cache */ + cfg->hs_cache_dir = rspamd_mempool_strdup(cfg->cfg_pool, RSPAMD_DBDIR "/"); + + if (!(flags & RSPAMD_CONFIG_INIT_SKIP_LUA)) { + cfg->lua_state = (void *) rspamd_lua_init(flags & RSPAMD_CONFIG_INIT_WIPE_LUA_MEM); + cfg->own_lua_state = TRUE; + cfg->lua_thread_pool = (void *) lua_thread_pool_new(RSPAMD_LUA_CFG_STATE(cfg)); + } + + cfg->cache = rspamd_symcache_new(cfg); + cfg->ups_ctx = rspamd_upstreams_library_init(); + cfg->re_cache = rspamd_re_cache_new(); + cfg->doc_strings = ucl_object_typed_new(UCL_OBJECT); + /* + * Unless exim is fixed + */ + cfg->enable_shutdown_workaround = TRUE; + + cfg->ssl_ciphers = rspamd_mempool_strdup(cfg->cfg_pool, "HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4"); + cfg->max_message = DEFAULT_MAX_MESSAGE; + cfg->max_pic_size = DEFAULT_MAX_PIC; + cfg->images_cache_size = 256; + cfg->monitored_ctx = rspamd_monitored_ctx_init(); + cfg->neighbours = ucl_object_typed_new(UCL_OBJECT); + cfg->redis_pool = rspamd_redis_pool_init(); + cfg->default_max_shots = DEFAULT_MAX_SHOTS; + cfg->max_sessions_cache = DEFAULT_MAX_SESSIONS; + cfg->maps_cache_dir = rspamd_mempool_strdup(cfg->cfg_pool, RSPAMD_DBDIR); + cfg->c_modules = g_ptr_array_new(); + cfg->heartbeat_interval = 10.0; + + cfg->enable_css_parser = true; + cfg->script_modules = g_ptr_array_new(); + + REF_INIT_RETAIN(cfg, rspamd_config_free); + + return cfg; +} + +void rspamd_config_free(struct rspamd_config *cfg) +{ + struct rspamd_config_cfg_lua_script *sc, *sctmp; + struct rspamd_config_settings_elt *set, *stmp; + struct rspamd_worker_log_pipe *lp, *ltmp; + + rspamd_lua_run_config_unload(RSPAMD_LUA_CFG_STATE(cfg), cfg); + + /* Scripts part */ + 
DL_FOREACH_SAFE(cfg->on_term_scripts, sc, sctmp) + { + luaL_unref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX, sc->cbref); + } + + DL_FOREACH_SAFE(cfg->on_load_scripts, sc, sctmp) + { + luaL_unref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX, sc->cbref); + } + + DL_FOREACH_SAFE(cfg->post_init_scripts, sc, sctmp) + { + luaL_unref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX, sc->cbref); + } + + DL_FOREACH_SAFE(cfg->config_unload_scripts, sc, sctmp) + { + luaL_unref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX, sc->cbref); + } + + DL_FOREACH_SAFE(cfg->setting_ids, set, stmp) + { + REF_RELEASE(set); + } + + rspamd_map_remove_all(cfg); + rspamd_mempool_destructors_enforce(cfg->cfg_pool); + + g_list_free(cfg->classifiers); + g_list_free(cfg->workers); + rspamd_symcache_destroy(cfg->cache); + ucl_object_unref(cfg->cfg_ucl_obj); + ucl_object_unref(cfg->config_comments); + ucl_object_unref(cfg->doc_strings); + ucl_object_unref(cfg->neighbours); + g_hash_table_remove_all(cfg->cfg_params); + g_hash_table_unref(cfg->cfg_params); + g_hash_table_unref(cfg->classifiers_symbols); + g_hash_table_unref(cfg->debug_modules); + g_hash_table_unref(cfg->explicit_modules); + g_hash_table_unref(cfg->trusted_keys); + + rspamd_re_cache_unref(cfg->re_cache); + g_ptr_array_free(cfg->c_modules, TRUE); + g_ptr_array_free(cfg->script_modules, TRUE); + + if (cfg->monitored_ctx) { + rspamd_monitored_ctx_destroy(cfg->monitored_ctx); + } + + if (RSPAMD_LUA_CFG_STATE(cfg) && cfg->own_lua_state) { + lua_thread_pool_free((struct lua_thread_pool *) cfg->lua_thread_pool); + rspamd_lua_close(RSPAMD_LUA_CFG_STATE(cfg)); + } + + if (cfg->redis_pool) { + rspamd_redis_pool_destroy(cfg->redis_pool); + } + + rspamd_upstreams_library_unref(cfg->ups_ctx); + delete RSPAMD_CFG_ACTIONS(cfg); + + rspamd_mempool_destructors_enforce(cfg->cfg_pool); + + if (cfg->checksum) { + g_free(cfg->checksum); + } + + REF_RELEASE(cfg->libs_ctx); + + DL_FOREACH_SAFE(cfg->log_pipes, lp, ltmp) + { + close(lp->fd); + g_free(lp); + } + 
+ rspamd_mempool_delete(cfg->cfg_pool); +} + +const ucl_object_t * +rspamd_config_get_module_opt(struct rspamd_config *cfg, + const gchar *module_name, + const gchar *opt_name) +{ + const ucl_object_t *res = nullptr, *sec; + + sec = ucl_obj_get_key(cfg->cfg_ucl_obj, module_name); + if (sec != nullptr) { + res = ucl_obj_get_key(sec, opt_name); + } + + return res; +} + +gint rspamd_config_parse_flag(const gchar *str, guint len) +{ + gint c; + + if (!str || !*str) { + return -1; + } + + if (len == 0) { + len = strlen(str); + } + + switch (len) { + case 1: + c = g_ascii_tolower(*str); + if (c == 'y' || c == '1') { + return 1; + } + else if (c == 'n' || c == '0') { + return 0; + } + break; + case 2: + if (g_ascii_strncasecmp(str, "no", len) == 0) { + return 0; + } + else if (g_ascii_strncasecmp(str, "on", len) == 0) { + return 1; + } + break; + case 3: + if (g_ascii_strncasecmp(str, "yes", len) == 0) { + return 1; + } + else if (g_ascii_strncasecmp(str, "off", len) == 0) { + return 0; + } + break; + case 4: + if (g_ascii_strncasecmp(str, "true", len) == 0) { + return 1; + } + break; + case 5: + if (g_ascii_strncasecmp(str, "false", len) == 0) { + return 0; + } + break; + } + + return -1; +} + +// A mapping between names and log format types + flags +constexpr const auto config_vars = frozen::make_unordered_map<frozen::string, std::pair<rspamd_log_format_type, int>>({ + {"mid", {RSPAMD_LOG_MID, 0}}, + {"qid", {RSPAMD_LOG_QID, 0}}, + {"user", {RSPAMD_LOG_USER, 0}}, + {"ip", {RSPAMD_LOG_IP, 0}}, + {"len", {RSPAMD_LOG_LEN, 0}}, + {"dns_req", {RSPAMD_LOG_DNS_REQ, 0}}, + {"smtp_from", {RSPAMD_LOG_SMTP_FROM, 0}}, + {"mime_from", {RSPAMD_LOG_MIME_FROM, 0}}, + {"smtp_rcpt", {RSPAMD_LOG_SMTP_RCPT, 0}}, + {"mime_rcpt", {RSPAMD_LOG_MIME_RCPT, 0}}, + {"smtp_rcpts", {RSPAMD_LOG_SMTP_RCPTS, 0}}, + {"mime_rcpts", {RSPAMD_LOG_MIME_RCPTS, 0}}, + {"time_real", {RSPAMD_LOG_TIME_REAL, 0}}, + {"time_virtual", {RSPAMD_LOG_TIME_VIRTUAL, 0}}, + {"lua", {RSPAMD_LOG_LUA, 0}}, + {"digest", 
{RSPAMD_LOG_DIGEST, 0}}, + {"checksum", {RSPAMD_LOG_DIGEST, 0}}, + {"filename", {RSPAMD_LOG_FILENAME, 0}}, + {"forced_action", {RSPAMD_LOG_FORCED_ACTION, 0}}, + {"settings_id", {RSPAMD_LOG_SETTINGS_ID, 0}}, + {"mempool_size", {RSPAMD_LOG_MEMPOOL_SIZE, 0}}, + {"mempool_waste", {RSPAMD_LOG_MEMPOOL_WASTE, 0}}, + {"action", {RSPAMD_LOG_ACTION, 0}}, + {"scores", {RSPAMD_LOG_SCORES, 0}}, + {"symbols", {RSPAMD_LOG_SYMBOLS, 0}}, + {"symbols_scores", {RSPAMD_LOG_SYMBOLS, RSPAMD_LOG_FMT_FLAG_SYMBOLS_SCORES}}, + {"symbols_params", {RSPAMD_LOG_SYMBOLS, RSPAMD_LOG_FMT_FLAG_SYMBOLS_PARAMS}}, + {"symbols_scores_params", {RSPAMD_LOG_SYMBOLS, RSPAMD_LOG_FMT_FLAG_SYMBOLS_PARAMS | RSPAMD_LOG_FMT_FLAG_SYMBOLS_SCORES}}, + {"groups", {RSPAMD_LOG_GROUPS, 0}}, + {"public_groups", {RSPAMD_LOG_PUBLIC_GROUPS, 0}}, + {"is_spam", {RSPAMD_LOG_ISSPAM, 0}}, +}); + +static gboolean +rspamd_config_process_var(struct rspamd_config *cfg, const rspamd_ftok_t *var, + const rspamd_ftok_t *content) +{ + g_assert(var != nullptr); + + auto flags = 0; + auto lc_var = std::string{var->begin, var->len}; + std::transform(lc_var.begin(), lc_var.end(), lc_var.begin(), g_ascii_tolower); + auto tok = std::string_view{lc_var}; + + if (var->len > 3 && tok.starts_with("if_")) { + flags |= RSPAMD_LOG_FMT_FLAG_CONDITION; + tok = tok.substr(3); + } + + auto maybe_fmt_var = rspamd::find_map(config_vars, tok); + + if (maybe_fmt_var) { + auto &fmt_var = maybe_fmt_var.value().get(); + auto *log_format = rspamd_mempool_alloc0_type(cfg->cfg_pool, rspamd_log_format); + + log_format->type = fmt_var.first; + log_format->flags = fmt_var.second | flags; + + if (log_format->type != RSPAMD_LOG_LUA) { + if (content && content->len > 0) { + log_format->data = rspamd_mempool_alloc0(cfg->cfg_pool, + sizeof(rspamd_ftok_t)); + memcpy(log_format->data, content, sizeof(*content)); + log_format->len = sizeof(*content); + } + } + else { + /* Load lua code and ensure that we have function ref returned */ + if (!content || content->len == 0) { 
+ msg_err_config("lua variable needs content: %T", &tok); + return FALSE; + } + + if (luaL_loadbuffer(RSPAMD_LUA_CFG_STATE(cfg), content->begin, content->len, + "lua log variable") != 0) { + msg_err_config("error loading lua code: '%T': %s", content, + lua_tostring(RSPAMD_LUA_CFG_STATE(cfg), -1)); + return FALSE; + } + if (lua_pcall(RSPAMD_LUA_CFG_STATE(cfg), 0, 1, 0) != 0) { + msg_err_config("error executing lua code: '%T': %s", content, + lua_tostring(RSPAMD_LUA_CFG_STATE(cfg), -1)); + lua_pop(RSPAMD_LUA_CFG_STATE(cfg), 1); + + return FALSE; + } + + if (lua_type(RSPAMD_LUA_CFG_STATE(cfg), -1) != LUA_TFUNCTION) { + msg_err_config("lua variable should return function: %T", content); + lua_pop(RSPAMD_LUA_CFG_STATE(cfg), 1); + return FALSE; + } + + auto id = luaL_ref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX); + log_format->data = GINT_TO_POINTER(id); + log_format->len = 0; + } + + DL_APPEND(cfg->log_format, log_format); + } + else { + std::string known_formats; + + for (const auto &v: config_vars) { + known_formats += std::string_view{v.first.data(), v.first.size()}; + known_formats += ", "; + } + + if (known_formats.size() > 2) { + // Remove last comma + known_formats.resize(known_formats.size() - 2); + } + msg_err_config("unknown log variable: %T, known vars are: \"%s\"", var, known_formats.c_str()); + return FALSE; + } + + return TRUE; +} + +static gboolean +rspamd_config_parse_log_format(struct rspamd_config *cfg) +{ + const gchar *p, *c, *end, *s; + gchar *d; + struct rspamd_log_format *lf = nullptr; + rspamd_ftok_t var, var_content; + enum { + parse_str, + parse_dollar, + parse_var_name, + parse_var_content, + } state = parse_str; + gint braces = 0; + + g_assert(cfg != nullptr); + c = cfg->log_format_str; + + if (c == nullptr) { + return FALSE; + } + + p = c; + end = p + strlen(p); + + while (p < end) { + switch (state) { + case parse_str: + if (*p == '$') { + state = parse_dollar; + } + else { + p++; + } + break; + case parse_dollar: + if (p > c) { + /* We 
have string element that we need to store */ + lf = rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_log_format); + lf->type = RSPAMD_LOG_STRING; + lf->data = rspamd_mempool_alloc(cfg->cfg_pool, p - c + 1); + /* Filter \r\n from the destination */ + s = c; + d = (char *) lf->data; + + while (s < p) { + if (*s != '\r' && *s != '\n') { + *d++ = *s++; + } + else { + *d++ = ' '; + s++; + } + } + *d = '\0'; + + lf->len = d - (char *) lf->data; + DL_APPEND(cfg->log_format, lf); + lf = nullptr; + } + p++; + c = p; + state = parse_var_name; + break; + case parse_var_name: + if (*p == '{') { + var.begin = c; + var.len = p - c; + p++; + c = p; + state = parse_var_content; + braces = 1; + } + else if (*p != '_' && *p != '-' && !g_ascii_isalnum(*p)) { + /* Variable with no content */ + var.begin = c; + var.len = p - c; + c = p; + + if (!rspamd_config_process_var(cfg, &var, nullptr)) { + return FALSE; + } + + state = parse_str; + } + else { + p++; + } + break; + case parse_var_content: + if (*p == '}' && --braces == 0) { + var_content.begin = c; + var_content.len = p - c; + p++; + c = p; + + if (!rspamd_config_process_var(cfg, &var, &var_content)) { + return FALSE; + } + + state = parse_str; + } + else if (*p == '{') { + braces++; + p++; + } + else { + p++; + } + break; + } + } + + /* Last state */ + switch (state) { + case parse_str: + if (p > c) { + /* We have string element that we need to store */ + lf = rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_log_format); + lf->type = RSPAMD_LOG_STRING; + lf->data = rspamd_mempool_alloc(cfg->cfg_pool, p - c + 1); + /* Filter \r\n from the destination */ + s = c; + d = (char *) lf->data; + + while (s < p) { + if (*s != '\r' && *s != '\n') { + *d++ = *s++; + } + else { + *d++ = ' '; + s++; + } + } + *d = '\0'; + + lf->len = d - (char *) lf->data; + DL_APPEND(cfg->log_format, lf); + lf = nullptr; + } + break; + + case parse_var_name: + var.begin = c; + var.len = p - c; + + if (!rspamd_config_process_var(cfg, &var, 
nullptr)) { + return FALSE; + } + break; + case parse_dollar: + case parse_var_content: + msg_err_config("cannot parse log format %s: incomplete string", + cfg->log_format_str); + return FALSE; + break; + } + + return TRUE; +} + +static void +rspamd_urls_config_dtor(gpointer _unused) +{ + rspamd_url_deinit(); +} + +/* + * Set cfg->clock_res from the resolution reported by clock_getres(), + * limited to the [0, 3] range. If the resolution cannot be obtained + * (or clock_gettime is not available) clock_res is set to 1. + */ +static void +rspamd_adjust_clocks_resolution(struct rspamd_config *cfg) +{ +#ifdef HAVE_CLOCK_GETTIME + struct timespec ts; + int rc; + +#ifdef HAVE_CLOCK_PROCESS_CPUTIME_ID + rc = clock_getres(CLOCK_PROCESS_CPUTIME_ID, &ts); +#elif defined(HAVE_CLOCK_VIRTUAL) + rc = clock_getres(CLOCK_VIRTUAL, &ts); +#else + rc = clock_getres(CLOCK_REALTIME, &ts); +#endif + + if (rc == -1 || ts.tv_nsec <= 0) { + /* + * ts is not usable: either clock_getres has failed (ts is not filled) + * or the division below would be a division by zero. + * Use the same value as for gettimeofday. + */ + cfg->clock_res = 1; + return; + } + + cfg->clock_res = log10(1000000. / ts.tv_nsec); + if (cfg->clock_res < 0) { + cfg->clock_res = 0; + } + if (cfg->clock_res > 3) { + cfg->clock_res = 3; + } +#else + /* For gettimeofday */ + cfg->clock_res = 1; +#endif +} + +/* + * Perform post load actions + */ +gboolean +rspamd_config_post_load(struct rspamd_config *cfg, + enum rspamd_post_load_options opts) +{ + + auto ret = TRUE; + + rspamd_adjust_clocks_resolution(cfg); + rspamd_logger_configure_modules(cfg->debug_modules); + + if (cfg->one_shot_mode) { + msg_info_config("enabling one shot mode (was %d max shots)", + cfg->default_max_shots); + cfg->default_max_shots = 1; + } + +#if defined(WITH_HYPERSCAN) && !defined(__aarch64__) && !defined(__powerpc64__) + if (!cfg->disable_hyperscan) { + if (!(cfg->libs_ctx->crypto_ctx->cpu_config & CPUID_SSSE3)) { + msg_warn_config("CPU doesn't have SSSE3 instructions set " + "required for hyperscan, disable it"); + cfg->disable_hyperscan = TRUE; + } + } +#endif + + rspamd_regexp_library_init(cfg); + rspamd_multipattern_library_init(cfg->hs_cache_dir); + + if (opts & RSPAMD_CONFIG_INIT_URL) { + if (cfg->tld_file == nullptr) { + /* Try to guess tld file */ + auto fpath = fmt::format("{0}{1}{2}", RSPAMD_SHAREDIR, + G_DIR_SEPARATOR, "effective_tld_names.dat"); + + if (access(fpath.c_str(), R_OK) != -1) 
{ + msg_debug_config("url_tld option is not specified but %s is available," + " therefore this file is assumed as TLD file for URL" + " extraction", + fpath.c_str()); + cfg->tld_file = rspamd_mempool_strdup(cfg->cfg_pool, fpath.c_str()); + } + else { + if (opts & RSPAMD_CONFIG_INIT_VALIDATE) { + msg_err_config("no url_tld option has been specified"); + ret = FALSE; + } + } + } + else { + /* TLD file is set explicitly: it is an error only when validating */ + if (access(cfg->tld_file, R_OK) == -1) { + if (opts & RSPAMD_CONFIG_INIT_VALIDATE) { + ret = FALSE; + msg_err_config("cannot access tld file %s: %s", cfg->tld_file, + strerror(errno)); + } + else { + msg_debug_config("cannot access tld file %s: %s", cfg->tld_file, + strerror(errno)); + /* Forget the unreadable file, so URL library is initialised without it */ + cfg->tld_file = nullptr; + } + } + } + + if (opts & RSPAMD_CONFIG_INIT_NO_TLD) { + rspamd_url_init(nullptr); + } + else { + rspamd_url_init(cfg->tld_file); + } + + /* rspamd_url_deinit is called when the config pool destructors are run */ + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_urls_config_dtor, + nullptr); + } + + init_dynamic_config(cfg); + /* Insert classifiers symbols */ + rspamd_config_insert_classify_symbols(cfg); + + /* Parse format string that we have */ + if (!rspamd_config_parse_log_format(cfg)) { + msg_err_config("cannot parse log format, task logging will not be available"); + if (opts & RSPAMD_CONFIG_INIT_VALIDATE) { + ret = FALSE; + } + } + + if (opts & RSPAMD_CONFIG_INIT_SYMCACHE) { + /* Init config cache */ + ret = rspamd_symcache_init(cfg->cache) && ret; + + /* Init re cache */ + rspamd_re_cache_init(cfg->re_cache, cfg); + + /* Try to load Hyperscan */ + auto hs_ret = rspamd_re_cache_load_hyperscan(cfg->re_cache, + cfg->hs_cache_dir ? 
cfg->hs_cache_dir : RSPAMD_DBDIR "/", + true); + + if (hs_ret == RSPAMD_HYPERSCAN_LOAD_ERROR) { + msg_debug_config("cannot load hyperscan database, disable it"); + } + } + + if (opts & RSPAMD_CONFIG_INIT_LIBS) { + /* Config other libraries */ + ret = rspamd_config_libs(cfg->libs_ctx, cfg) && ret; + + if (!ret) { + msg_err_config("cannot configure libraries, fatal error"); + return FALSE; + } + } + + /* Validate cache */ + if (opts & RSPAMD_CONFIG_INIT_VALIDATE) { + /* Check for actions sanity */ + auto seen_controller = FALSE; + + auto *cur = cfg->workers; + while (cur) { + auto *wcf = (struct rspamd_worker_conf *) cur->data; + + if (wcf->type == g_quark_from_static_string("controller")) { + seen_controller = TRUE; + break; + } + + cur = g_list_next(cur); + } + + if (!seen_controller) { + msg_warn_config("controller worker is unconfigured: learning," + " periodic scripts, maps watching and many other" + " Rspamd features will be broken"); + } + + ret = rspamd_symcache_validate(cfg->cache, cfg, FALSE) && ret; + } + + if (opts & RSPAMD_CONFIG_INIT_POST_LOAD_LUA) { + rspamd_lua_run_config_post_init(RSPAMD_LUA_CFG_STATE(cfg), cfg); + } + + if (opts & RSPAMD_CONFIG_INIT_PRELOAD_MAPS) { + rspamd_map_preload(cfg); + } + + return ret; +} + +struct rspamd_classifier_config * +rspamd_config_new_classifier(struct rspamd_config *cfg, + struct rspamd_classifier_config *c) +{ + if (c == nullptr) { + c = + rspamd_mempool_alloc0_type(cfg->cfg_pool, + struct rspamd_classifier_config); + c->min_prob_strength = 0.05; + c->min_token_hits = 2; + } + + if (c->labels == nullptr) { + c->labels = g_hash_table_new_full(rspamd_str_hash, + rspamd_str_equal, + nullptr, + (GDestroyNotify) g_list_free); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) g_hash_table_destroy, + c->labels); + } + + return c; +} + +struct rspamd_statfile_config * +rspamd_config_new_statfile(struct rspamd_config *cfg, + struct rspamd_statfile_config *c) +{ + if (c == nullptr) { + c = + 
rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_statfile_config); + } + + return c; +} + +void rspamd_config_init_metric(struct rspamd_config *cfg) +{ + cfg->grow_factor = 1.0; + cfg->symbols = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + cfg->groups = g_hash_table_new(rspamd_strcase_hash, rspamd_strcase_equal); + + cfg->subject = SPAM_SUBJECT; + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, + cfg->symbols); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, + cfg->groups); +} + +struct rspamd_symbols_group * +rspamd_config_new_group(struct rspamd_config *cfg, const gchar *name) +{ + struct rspamd_symbols_group *gr; + + gr = rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_symbols_group); + gr->symbols = g_hash_table_new(rspamd_strcase_hash, + rspamd_strcase_equal); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, gr->symbols); + gr->name = rspamd_mempool_strdup(cfg->cfg_pool, name); + + if (strcmp(gr->name, "ungrouped") == 0) { + gr->flags |= RSPAMD_SYMBOL_GROUP_UNGROUPED; + } + + g_hash_table_insert(cfg->groups, gr->name, gr); + + return gr; +} + +static void +rspamd_worker_conf_dtor(struct rspamd_worker_conf *wcf) +{ + if (wcf) { + ucl_object_unref(wcf->options); + g_queue_free(wcf->active_workers); + g_hash_table_unref(wcf->params); + g_free(wcf); + } +} + +static void +rspamd_worker_conf_cfg_fin(gpointer d) +{ + auto *wcf = (struct rspamd_worker_conf *) d; + + REF_RELEASE(wcf); +} + +struct rspamd_worker_conf * +rspamd_config_new_worker(struct rspamd_config *cfg, + struct rspamd_worker_conf *c) +{ + if (c == nullptr) { + c = g_new0(struct rspamd_worker_conf, 1); + c->params = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + c->active_workers = g_queue_new(); +#ifdef HAVE_SC_NPROCESSORS_ONLN + auto nproc = sysconf(_SC_NPROCESSORS_ONLN); + c->count = MIN(DEFAULT_MAX_WORKERS, MAX(1, nproc - 
2)); +#else + c->count = DEFAULT_MAX_WORKERS; +#endif + c->rlimit_nofile = 0; + c->rlimit_maxcore = 0; + c->enabled = TRUE; + + REF_INIT_RETAIN(c, rspamd_worker_conf_dtor); + rspamd_mempool_add_destructor(cfg->cfg_pool, + rspamd_worker_conf_cfg_fin, c); + } + + return c; +} + + +static bool +rspamd_include_map_handler(const guchar *data, gsize len, + const ucl_object_t *args, void *ud) +{ + auto *cfg = (struct rspamd_config *) ud; + + auto ftok = rspamd_ftok_t{.len = len + 1, .begin = (char *) data}; + auto *map_line = rspamd_mempool_ftokdup(cfg->cfg_pool, &ftok); + + auto *cbdata = new rspamd_ucl_map_cbdata{cfg}; + auto **pcbdata = new rspamd_ucl_map_cbdata *(cbdata); + + return rspamd_map_add(cfg, + map_line, + "ucl include", + rspamd_ucl_read_cb, + rspamd_ucl_fin_cb, + rspamd_ucl_dtor_cb, + (void **) pcbdata, + nullptr, RSPAMD_MAP_DEFAULT) != nullptr; +} + +/* + * Variables: + * $CONFDIR - configuration directory + * $LOCAL_CONFDIR - local configuration directory + * $RUNDIR - local states directory + * $DBDIR - databases dir + * $LOGDIR - logs dir + * $PLUGINSDIR - plugins dir + * $PREFIX - installation prefix + * $VERSION - rspamd version + */ + +#define RSPAMD_CONFDIR_MACRO "CONFDIR" +#define RSPAMD_LOCAL_CONFDIR_MACRO "LOCAL_CONFDIR" +#define RSPAMD_RUNDIR_MACRO "RUNDIR" +#define RSPAMD_DBDIR_MACRO "DBDIR" +#define RSPAMD_LOGDIR_MACRO "LOGDIR" +#define RSPAMD_PLUGINSDIR_MACRO "PLUGINSDIR" +#define RSPAMD_SHAREDIR_MACRO "SHAREDIR" +#define RSPAMD_RULESDIR_MACRO "RULESDIR" +#define RSPAMD_WWWDIR_MACRO "WWWDIR" +#define RSPAMD_PREFIX_MACRO "PREFIX" +#define RSPAMD_VERSION_MACRO "VERSION" +#define RSPAMD_VERSION_MAJOR_MACRO "VERSION_MAJOR" +#define RSPAMD_VERSION_MINOR_MACRO "VERSION_MINOR" +#define RSPAMD_BRANCH_VERSION_MACRO "BRANCH_VERSION" +#define RSPAMD_HOSTNAME_MACRO "HOSTNAME" + +void rspamd_ucl_add_conf_variables(struct ucl_parser *parser, GHashTable *vars) +{ + GHashTableIter it; + gpointer k, v; + + ucl_parser_register_variable(parser, + 
RSPAMD_CONFDIR_MACRO, + RSPAMD_CONFDIR); + ucl_parser_register_variable(parser, + RSPAMD_LOCAL_CONFDIR_MACRO, + RSPAMD_LOCAL_CONFDIR); + ucl_parser_register_variable(parser, RSPAMD_RUNDIR_MACRO, + RSPAMD_RUNDIR); + ucl_parser_register_variable(parser, RSPAMD_DBDIR_MACRO, + RSPAMD_DBDIR); + ucl_parser_register_variable(parser, RSPAMD_LOGDIR_MACRO, + RSPAMD_LOGDIR); + ucl_parser_register_variable(parser, + RSPAMD_PLUGINSDIR_MACRO, + RSPAMD_PLUGINSDIR); + ucl_parser_register_variable(parser, + RSPAMD_SHAREDIR_MACRO, + RSPAMD_SHAREDIR); + ucl_parser_register_variable(parser, + RSPAMD_RULESDIR_MACRO, + RSPAMD_RULESDIR); + ucl_parser_register_variable(parser, RSPAMD_WWWDIR_MACRO, + RSPAMD_WWWDIR); + ucl_parser_register_variable(parser, RSPAMD_PREFIX_MACRO, + RSPAMD_PREFIX); + ucl_parser_register_variable(parser, RSPAMD_VERSION_MACRO, RVERSION); + ucl_parser_register_variable(parser, RSPAMD_VERSION_MAJOR_MACRO, + RSPAMD_VERSION_MAJOR); + ucl_parser_register_variable(parser, RSPAMD_VERSION_MINOR_MACRO, + RSPAMD_VERSION_MINOR); + ucl_parser_register_variable(parser, RSPAMD_BRANCH_VERSION_MACRO, + RSPAMD_VERSION_BRANCH); + + auto hostlen = sysconf(_SC_HOST_NAME_MAX); + + if (hostlen <= 0) { + hostlen = 256; + } + else { + hostlen++; + } + + auto hostbuf = std::string{}; + hostbuf.resize(hostlen); + + if (gethostname(hostbuf.data(), hostlen) != 0) { + hostbuf = "unknown"; + } + + /* UCL copies variables, so it is safe to pass an ephemeral buffer here */ + ucl_parser_register_variable(parser, RSPAMD_HOSTNAME_MACRO, + hostbuf.c_str()); + + if (vars != nullptr) { + g_hash_table_iter_init(&it, vars); + + while (g_hash_table_iter_next(&it, &k, &v)) { + ucl_parser_register_variable(parser, (const char *) k, (const char *) v); + } + } +} + +void rspamd_ucl_add_conf_macros(struct ucl_parser *parser, + struct rspamd_config *cfg) +{ + ucl_parser_register_macro(parser, + "include_map", + rspamd_include_map_handler, + cfg); +} + +static void +symbols_classifiers_callback(gpointer key, 
gpointer value, gpointer ud) +{ + auto *cfg = (struct rspamd_config *) ud; + + /* Actually, statistics should act like any ordinary symbol */ + rspamd_symcache_add_symbol(cfg->cache, (const char *) key, 0, nullptr, nullptr, + SYMBOL_TYPE_CLASSIFIER | SYMBOL_TYPE_NOSTAT, -1); +} + +void rspamd_config_insert_classify_symbols(struct rspamd_config *cfg) +{ + g_hash_table_foreach(cfg->classifiers_symbols, + symbols_classifiers_callback, + cfg); +} + +struct rspamd_classifier_config * +rspamd_config_find_classifier(struct rspamd_config *cfg, const gchar *name) +{ + if (name == nullptr) { + return nullptr; + } + + auto *cur = cfg->classifiers; + while (cur) { + auto *cf = (struct rspamd_classifier_config *) cur->data; + + if (g_ascii_strcasecmp(cf->name, name) == 0) { + return cf; + } + + cur = g_list_next(cur); + } + + return nullptr; +} + +gboolean +rspamd_config_check_statfiles(struct rspamd_classifier_config *cf) +{ + gboolean has_other = FALSE, res = FALSE, cur_class = FALSE; + + /* First check classes directly */ + auto *cur = cf->statfiles; + while (cur) { + auto *st = (struct rspamd_statfile_config *) cur->data; + if (!has_other) { + cur_class = st->is_spam; + has_other = TRUE; + } + else { + if (cur_class != st->is_spam) { + return TRUE; + } + } + + cur = g_list_next(cur); + } + + if (!has_other) { + /* We have only one statfile */ + return FALSE; + } + /* We have not detected any statfile that has different class, so turn on heuristic based on symbol's name */ + has_other = FALSE; + cur = cf->statfiles; + while (cur) { + auto *st = (struct rspamd_statfile_config *) cur->data; + if (rspamd_substring_search_caseless(st->symbol, + strlen(st->symbol), "spam", 4) != -1) { + st->is_spam = TRUE; + } + else if (rspamd_substring_search_caseless(st->symbol, + strlen(st->symbol), "ham", 3) != -1) { + st->is_spam = FALSE; + } + + if (!has_other) { + cur_class = st->is_spam; + has_other = TRUE; + } + else { + if (cur_class != st->is_spam) { + res = TRUE; + } + } + + cur = 
g_list_next(cur); + } + + return res; +} + +static gchar * +rspamd_ucl_read_cb(gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + auto *cbdata = (struct rspamd_ucl_map_cbdata *) data->cur_data; + auto *prev = (struct rspamd_ucl_map_cbdata *) data->prev_data; + + if (cbdata == nullptr) { + cbdata = new rspamd_ucl_map_cbdata{prev->cfg}; + data->cur_data = cbdata; + } + cbdata->buf.append(chunk, len); + + /* Say not to copy any part of this buffer */ + return nullptr; +} + +static void +rspamd_ucl_fin_cb(struct map_cb_data *data, void **target) +{ + auto *cbdata = (struct rspamd_ucl_map_cbdata *) data->cur_data; + auto *prev = (struct rspamd_ucl_map_cbdata *) data->prev_data; + auto *cfg = data->map->cfg; + + if (cbdata == nullptr) { + msg_err_config("map fin error: new data is nullptr"); + return; + } + + /* New data available */ + auto *parser = ucl_parser_new(0); + if (!ucl_parser_add_chunk(parser, (unsigned char *) cbdata->buf.data(), + cbdata->buf.size())) { + msg_err_config("cannot parse map %s: %s", + data->map->name, + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + } + else { + auto *obj = ucl_parser_get_object(parser); + ucl_object_iter_t it = nullptr; + + for (auto *cur = ucl_object_iterate(obj, &it, true); cur != nullptr; cur = ucl_object_iterate(obj, &it, true)) { + ucl_object_replace_key(cbdata->cfg->cfg_ucl_obj, (ucl_object_t *) cur, + cur->key, cur->keylen, false); + } + + ucl_parser_free(parser); + ucl_object_unref(obj); + } + + if (target) { + *target = data->cur_data; + } + + delete prev; +} + +static void +rspamd_ucl_dtor_cb(struct map_cb_data *data) +{ + auto *cbdata = (struct rspamd_ucl_map_cbdata *) data->cur_data; + + delete cbdata; +} + +gboolean +rspamd_check_module(struct rspamd_config *cfg, module_t *mod) +{ + gboolean ret = TRUE; + + if (mod != nullptr) { + if (mod->module_version != RSPAMD_CUR_MODULE_VERSION) { + msg_err_config("module %s has incorrect version %xd (%xd expected)", + mod->name, (gint) 
mod->module_version, RSPAMD_CUR_MODULE_VERSION); + ret = FALSE; + } + if (ret && mod->rspamd_version != RSPAMD_VERSION_NUM) { + msg_err_config("module %s has incorrect rspamd version %xL (%xL expected)", + mod->name, mod->rspamd_version, RSPAMD_VERSION_NUM); + ret = FALSE; + } + if (ret && strcmp(mod->rspamd_features, RSPAMD_FEATURES) != 0) { + msg_err_config("module %s has incorrect rspamd features '%s' ('%s' expected)", + mod->name, mod->rspamd_features, RSPAMD_FEATURES); + ret = FALSE; + } + } + else { + ret = FALSE; + } + + return ret; +} + +gboolean +rspamd_check_worker(struct rspamd_config *cfg, worker_t *wrk) +{ + gboolean ret = TRUE; + + if (wrk != nullptr) { + if (wrk->worker_version != RSPAMD_CUR_WORKER_VERSION) { + msg_err_config("worker %s has incorrect version %xd (%xd expected)", + wrk->name, wrk->worker_version, RSPAMD_CUR_WORKER_VERSION); + ret = FALSE; + } + if (ret && wrk->rspamd_version != RSPAMD_VERSION_NUM) { + msg_err_config("worker %s has incorrect rspamd version %xL (%xL expected)", + wrk->name, wrk->rspamd_version, RSPAMD_VERSION_NUM); + ret = FALSE; + } + if (ret && strcmp(wrk->rspamd_features, RSPAMD_FEATURES) != 0) { + msg_err_config("worker %s has incorrect rspamd features '%s' ('%s' expected)", + wrk->name, wrk->rspamd_features, RSPAMD_FEATURES); + ret = FALSE; + } + } + else { + ret = FALSE; + } + + return ret; +} + +gboolean +rspamd_init_filters(struct rspamd_config *cfg, bool reconfig, bool strict) +{ + GList *cur; + module_t *mod, **pmod; + guint i = 0; + struct module_ctx *mod_ctx, *cur_ctx; + gboolean ret = TRUE; + + /* Init all compiled modules */ + + for (pmod = cfg->compiled_modules; pmod != nullptr && *pmod != nullptr; pmod++) { + mod = *pmod; + if (rspamd_check_module(cfg, mod)) { + if (mod->module_init_func(cfg, &mod_ctx) == 0) { + g_assert(mod_ctx != nullptr); + g_ptr_array_add(cfg->c_modules, mod_ctx); + mod_ctx->mod = mod; + mod->ctx_offset = i++; + } + } + } + + /* Now check what's enabled */ + cur = 
g_list_first(cfg->filters); + + while (cur) { + /* Perform modules configuring */ + mod_ctx = nullptr; + PTR_ARRAY_FOREACH(cfg->c_modules, i, cur_ctx) + { + if (g_ascii_strcasecmp(cur_ctx->mod->name, + (const gchar *) cur->data) == 0) { + mod_ctx = cur_ctx; + break; + } + } + + if (mod_ctx) { + mod = mod_ctx->mod; + mod_ctx->enabled = rspamd_config_is_module_enabled(cfg, mod->name); + + if (reconfig) { + if (!mod->module_reconfig_func(cfg)) { + msg_err_config("reconfig of %s failed!", mod->name); + } + else { + msg_info_config("reconfig of %s", mod->name); + } + } + else { + if (!mod->module_config_func(cfg, strict)) { + msg_err_config("config of %s failed", mod->name); + ret = FALSE; + + if (strict) { + return FALSE; + } + } + } + } + + if (mod_ctx == nullptr) { + msg_warn_config("requested unknown module %s", cur->data); + } + + cur = g_list_next(cur); + } + + ret = rspamd_init_lua_filters(cfg, 0, strict) && ret; + + return ret; +} + +static void +rspamd_config_new_symbol(struct rspamd_config *cfg, const gchar *symbol, + gdouble score, const gchar *description, const gchar *group, + guint flags, guint priority, gint nshots) +{ + struct rspamd_symbols_group *sym_group; + struct rspamd_symbol *sym_def; + double *score_ptr; + + sym_def = + rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_symbol); + score_ptr = rspamd_mempool_alloc_type(cfg->cfg_pool, double); + + if (isnan(score)) { + /* In fact, it could be defined later */ + msg_debug_config("score is not defined for symbol %s, set it to zero", + symbol); + score = 0.0; + /* Also set priority to 0 to allow override by anything */ + sym_def->priority = 0; + flags |= RSPAMD_SYMBOL_FLAG_UNSCORED; + } + else { + sym_def->priority = priority; + } + + *score_ptr = score; + sym_def->score = score; + sym_def->weight_ptr = score_ptr; + sym_def->name = rspamd_mempool_strdup(cfg->cfg_pool, symbol); + sym_def->flags = flags; + sym_def->nshots = nshots != 0 ? 
nshots : cfg->default_max_shots; + sym_def->groups = g_ptr_array_sized_new(1); + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, + sym_def->groups); + + if (description) { + sym_def->description = rspamd_mempool_strdup(cfg->cfg_pool, description); + } + + msg_debug_config("registered symbol %s with weight %.2f in and group %s", + sym_def->name, score, group); + + g_hash_table_insert(cfg->symbols, sym_def->name, sym_def); + + /* Search for symbol group */ + if (group == nullptr) { + group = "ungrouped"; + sym_def->flags |= RSPAMD_SYMBOL_FLAG_UNGROUPED; + } + else { + if (strcmp(group, "ungrouped") == 0) { + sym_def->flags |= RSPAMD_SYMBOL_FLAG_UNGROUPED; + } + } + + sym_group = reinterpret_cast<rspamd_symbols_group *>(g_hash_table_lookup(cfg->groups, group)); + if (sym_group == nullptr) { + /* Create new group */ + sym_group = rspamd_config_new_group(cfg, group); + } + + sym_def->gr = sym_group; + g_hash_table_insert(sym_group->symbols, sym_def->name, sym_def); + + if (!(sym_def->flags & RSPAMD_SYMBOL_FLAG_UNGROUPED)) { + g_ptr_array_add(sym_def->groups, sym_group); + } +} + + +gboolean +rspamd_config_add_symbol(struct rspamd_config *cfg, + const gchar *symbol, + gdouble score, + const gchar *description, + const gchar *group, + guint flags, + guint priority, + gint nshots) +{ + struct rspamd_symbol *sym_def; + struct rspamd_symbols_group *sym_group; + guint i; + + g_assert(cfg != nullptr); + g_assert(symbol != nullptr); + + sym_def = reinterpret_cast<rspamd_symbol *>(g_hash_table_lookup(cfg->symbols, symbol)); + + if (sym_def != nullptr) { + if (group != nullptr) { + gboolean has_group = FALSE; + + PTR_ARRAY_FOREACH(sym_def->groups, i, sym_group) + { + if (g_ascii_strcasecmp(sym_group->name, group) == 0) { + /* Group is already here */ + has_group = TRUE; + break; + } + } + + if (!has_group) { + /* Non-empty group has a priority over non-grouped one */ + sym_group = reinterpret_cast<rspamd_symbols_group *>(g_hash_table_lookup(cfg->groups, 
group)); + + if (sym_group == nullptr) { + /* Create new group */ + sym_group = rspamd_config_new_group(cfg, group); + } + + if ((!sym_def->gr) || (sym_def->flags & RSPAMD_SYMBOL_FLAG_UNGROUPED)) { + sym_def->gr = sym_group; + sym_def->flags &= ~RSPAMD_SYMBOL_FLAG_UNGROUPED; + } + + g_hash_table_insert(sym_group->symbols, sym_def->name, sym_def); + sym_def->flags &= ~(RSPAMD_SYMBOL_FLAG_UNGROUPED); + g_ptr_array_add(sym_def->groups, sym_group); + } + } + + if (sym_def->priority > priority && + (isnan(score) || !(sym_def->flags & RSPAMD_SYMBOL_FLAG_UNSCORED))) { + msg_debug_config("symbol %s has been already registered with " + "priority %ud, do not override (new priority: %ud)", + symbol, + sym_def->priority, + priority); + /* But we can still add description */ + if (!sym_def->description && description) { + sym_def->description = rspamd_mempool_strdup(cfg->cfg_pool, + description); + } + + /* Or nshots in case of non-default setting */ + if (nshots != 0 && sym_def->nshots == cfg->default_max_shots) { + sym_def->nshots = nshots; + } + + return FALSE; + } + else { + + if (!isnan(score)) { + msg_debug_config("symbol %s has been already registered with " + "priority %ud, override it with new priority: %ud, " + "old score: %.2f, new score: %.2f", + symbol, + sym_def->priority, + priority, + sym_def->score, + score); + + *sym_def->weight_ptr = score; + sym_def->score = score; + sym_def->priority = priority; + sym_def->flags &= ~RSPAMD_SYMBOL_FLAG_UNSCORED; + } + + sym_def->flags = flags; + + if (nshots != 0) { + sym_def->nshots = nshots; + } + else { + /* Do not reset unless we have exactly lower priority */ + if (sym_def->priority < priority) { + sym_def->nshots = cfg->default_max_shots; + } + } + + if (description) { + sym_def->description = rspamd_mempool_strdup(cfg->cfg_pool, + description); + } + + + /* We also check group information in this case */ + if (group != nullptr && sym_def->gr != nullptr && + strcmp(group, sym_def->gr->name) != 0) { + + sym_group = 
reinterpret_cast<rspamd_symbols_group *>(g_hash_table_lookup(cfg->groups, group)); + + if (sym_group == nullptr) { + /* Create new group */ + sym_group = rspamd_config_new_group(cfg, group); + } + + if (!(sym_group->flags & RSPAMD_SYMBOL_GROUP_UNGROUPED)) { + msg_debug_config("move symbol %s from group %s to %s", + sym_def->name, sym_def->gr->name, group); + g_hash_table_remove(sym_def->gr->symbols, sym_def->name); + sym_def->gr = sym_group; + g_hash_table_insert(sym_group->symbols, sym_def->name, sym_def); + } + } + + return TRUE; + } + } + + /* This is called merely when we have an undefined symbol */ + rspamd_config_new_symbol(cfg, symbol, score, description, + group, flags, priority, nshots); + + return TRUE; +} + +gboolean +rspamd_config_add_symbol_group(struct rspamd_config *cfg, + const gchar *symbol, + const gchar *group) +{ + struct rspamd_symbol *sym_def; + struct rspamd_symbols_group *sym_group; + guint i; + + g_assert(cfg != nullptr); + g_assert(symbol != nullptr); + g_assert(group != nullptr); + + sym_def = reinterpret_cast<rspamd_symbol *>(g_hash_table_lookup(cfg->symbols, symbol)); + + if (sym_def != nullptr) { + gboolean has_group = FALSE; + + PTR_ARRAY_FOREACH(sym_def->groups, i, sym_group) + { + if (g_ascii_strcasecmp(sym_group->name, group) == 0) { + /* Group is already here */ + has_group = TRUE; + break; + } + } + + if (!has_group) { + /* Non-empty group has a priority over non-grouped one */ + sym_group = reinterpret_cast<rspamd_symbols_group *>(g_hash_table_lookup(cfg->groups, group)); + + if (sym_group == nullptr) { + /* Create new group */ + sym_group = rspamd_config_new_group(cfg, group); + } + + if (!sym_def->gr) { + sym_def->gr = sym_group; + } + + g_hash_table_insert(sym_group->symbols, sym_def->name, sym_def); + sym_def->flags &= ~(RSPAMD_SYMBOL_FLAG_UNGROUPED); + g_ptr_array_add(sym_def->groups, sym_group); + + return TRUE; + } + } + + return FALSE; +} + +gboolean +rspamd_config_is_enabled_from_ucl(rspamd_mempool_t *pool, + const 
ucl_object_t *obj) +{ + + const ucl_object_t *enabled; + + enabled = ucl_object_lookup(obj, "enabled"); + + if (enabled) { + if (ucl_object_type(enabled) == UCL_BOOLEAN) { + return ucl_object_toboolean(enabled); + } + else if (ucl_object_type(enabled) == UCL_STRING) { + gint ret = rspamd_config_parse_flag(ucl_object_tostring(enabled), 0); + + if (ret == 0) { + return FALSE; + } + else if (ret == -1) { + + msg_info_pool_check("wrong value for the `enabled` key"); + return FALSE; + } + /* Default return is TRUE here */ + } + } + + + const ucl_object_t *disabled; + + disabled = ucl_object_lookup(obj, "disabled"); + + if (disabled) { + if (ucl_object_type(disabled) == UCL_BOOLEAN) { + return !ucl_object_toboolean(disabled); + } + else if (ucl_object_type(disabled) == UCL_STRING) { + gint ret = rspamd_config_parse_flag(ucl_object_tostring(disabled), 0); + + if (ret == 0) { + return TRUE; + } + else if (ret == -1) { + + msg_info_pool_check("wrong value for the `disabled` key"); + return FALSE; + } + + return FALSE; + } + } + + return TRUE; +} + +gboolean +rspamd_config_is_module_enabled(struct rspamd_config *cfg, + const gchar *module_name) +{ + gboolean is_c = FALSE, enabled; + const ucl_object_t *conf; + GList *cur; + struct rspamd_symbols_group *gr; + lua_State *L = RSPAMD_LUA_CFG_STATE(cfg); + struct module_ctx *cur_ctx; + guint i; + + PTR_ARRAY_FOREACH(cfg->c_modules, i, cur_ctx) + { + if (g_ascii_strcasecmp(cur_ctx->mod->name, module_name) == 0) { + is_c = TRUE; + break; + } + } + + if (g_hash_table_lookup(cfg->explicit_modules, module_name) != nullptr) { + /* Always load module */ + rspamd_plugins_table_push_elt(L, "enabled", module_name); + + return TRUE; + } + + if (is_c) { + gboolean found = FALSE; + + cur = g_list_first(cfg->filters); + + while (cur) { + if (strcmp((char *) cur->data, module_name) == 0) { + found = TRUE; + break; + } + + cur = g_list_next(cur); + } + + if (!found) { + msg_info_config("internal module %s is disable in `filters` line", + 
module_name); + rspamd_plugins_table_push_elt(L, + "disabled_explicitly", module_name); + + return FALSE; + } + } + + conf = ucl_object_lookup(cfg->cfg_ucl_obj, module_name); + + if (conf == nullptr) { + rspamd_plugins_table_push_elt(L, "disabled_unconfigured", module_name); + + msg_info_config("%s module %s is enabled but has not been configured", + is_c ? "internal" : "lua", module_name); + + if (!is_c) { + msg_info_config("%s disabling unconfigured lua module", module_name); + return FALSE; + } + } + else { + enabled = rspamd_config_is_enabled_from_ucl(cfg->cfg_pool, conf); + + if (!enabled) { + rspamd_plugins_table_push_elt(L, + "disabled_explicitly", module_name); + + msg_info_config( + "%s module %s is disabled in the configuration", + is_c ? "internal" : "lua", module_name); + return FALSE; + } + } + + /* Now we check symbols group */ + gr = reinterpret_cast<rspamd_symbols_group *>(g_hash_table_lookup(cfg->groups, module_name)); + + if (gr) { + if (gr->flags & RSPAMD_SYMBOL_GROUP_DISABLED) { + rspamd_plugins_table_push_elt(L, + "disabled_explicitly", module_name); + msg_info_config("%s module %s is disabled in the configuration as " + "its group has been disabled", + is_c ? 
"internal" : "lua", module_name); + + return FALSE; + } + } + + rspamd_plugins_table_push_elt(L, "enabled", module_name); + + return TRUE; +} + +static gboolean +rspamd_config_action_from_ucl(struct rspamd_config *cfg, + struct rspamd_action *act, + const ucl_object_t *obj, + guint priority) +{ + auto threshold = NAN; + int flags = 0; + + auto obj_type = ucl_object_type(obj); + + if (obj_type == UCL_OBJECT) { + obj_type = ucl_object_type(obj); + + const auto *elt = ucl_object_lookup_any(obj, "score", "threshold", nullptr); + + if (elt) { + threshold = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(obj, "flags"); + + if (elt && ucl_object_type(elt) == UCL_ARRAY) { + const ucl_object_t *cur; + ucl_object_iter_t it = nullptr; + + while ((cur = ucl_object_iterate(elt, &it, true)) != nullptr) { + if (ucl_object_type(cur) == UCL_STRING) { + const gchar *fl_str = ucl_object_tostring(cur); + + if (g_ascii_strcasecmp(fl_str, "no_threshold") == 0) { + flags |= RSPAMD_ACTION_NO_THRESHOLD; + } + else if (g_ascii_strcasecmp(fl_str, "threshold_only") == 0) { + flags |= RSPAMD_ACTION_THRESHOLD_ONLY; + } + else if (g_ascii_strcasecmp(fl_str, "ham") == 0) { + flags |= RSPAMD_ACTION_HAM; + } + else { + msg_warn_config("unknown action flag: %s", fl_str); + } + } + } + } + + elt = ucl_object_lookup(obj, "milter"); + + if (elt) { + const gchar *milter_action = ucl_object_tostring(elt); + + if (strcmp(milter_action, "discard") == 0) { + flags |= RSPAMD_ACTION_MILTER; + act->action_type = METRIC_ACTION_DISCARD; + } + else if (strcmp(milter_action, "quarantine") == 0) { + flags |= RSPAMD_ACTION_MILTER; + act->action_type = METRIC_ACTION_QUARANTINE; + } + else { + msg_warn_config("unknown milter action: %s", milter_action); + } + } + } + else if (obj_type == UCL_FLOAT || obj_type == UCL_INT) { + threshold = ucl_object_todouble(obj); + } + + /* TODO: add lua references support */ + + if (isnan(threshold) && !(flags & RSPAMD_ACTION_NO_THRESHOLD)) { + msg_err_config("action %s has 
no threshold being set and it is not" + " a no threshold action", + act->name); + + return FALSE; + } + + act->threshold = threshold; + act->flags = flags; + + enum rspamd_action_type std_act; + + if (!(flags & RSPAMD_ACTION_MILTER)) { + if (rspamd_action_from_str(act->name, &std_act)) { + act->action_type = std_act; + } + else { + act->action_type = METRIC_ACTION_CUSTOM; + } + } + + return TRUE; +} + +gboolean +rspamd_config_set_action_score(struct rspamd_config *cfg, + const gchar *action_name, + const ucl_object_t *obj) +{ + enum rspamd_action_type std_act; + const ucl_object_t *elt; + guint priority = ucl_object_get_priority(obj), obj_type; + + g_assert(cfg != nullptr); + g_assert(action_name != nullptr); + + obj_type = ucl_object_type(obj); + + if (obj_type == UCL_OBJECT) { + elt = ucl_object_lookup(obj, "priority"); + + if (elt) { + priority = ucl_object_toint(elt); + } + } + + /* Here are dragons: + * We have `canonical` name for actions, such as `soft reject` and + * configuration names for actions (used to be more convenient), such + * as `soft_reject`. Unfortunately, we must have heuristic for this + * variance of names. 
+ */ + + if (rspamd_action_from_str(action_name, &std_act)) { + action_name = rspamd_action_to_str(std_act); + } + + auto actions = RSPAMD_CFG_ACTIONS(cfg); + auto existing_act_it = actions->actions_by_name.find(action_name); + + if (existing_act_it != actions->actions_by_name.end()) { + auto *act = existing_act_it->second.get(); + /* Existing element */ + if (act->priority <= priority) { + /* We can replace data */ + auto old_pri = act->priority; + auto old_thr = act->threshold; + + if (rspamd_config_action_from_ucl(cfg, act, obj, priority)) { + msg_info_config("action %s has been already registered with " + "priority %ud, override it with new priority: %ud, " + "old threshold: %.2f, new threshold: %.2f", + action_name, + old_pri, + priority, + old_thr, + act->threshold); + actions->sort(); + } + else { + return FALSE; + } + } + else { + msg_info_config("action %s has been already registered with " + "priority %ud, do not override (new priority: %ud)", + action_name, + act->priority, + priority); + } + } + else { + /* Add new element */ + auto act = std::make_shared<rspamd_action>(); + act->name = rspamd_mempool_strdup(cfg->cfg_pool, action_name); + + if (rspamd_config_action_from_ucl(cfg, act.get(), obj, priority)) { + actions->add_action(std::move(act)); + } + else { + return FALSE; + } + } + + return TRUE; +} + +gboolean +rspamd_config_maybe_disable_action(struct rspamd_config *cfg, + const gchar *action_name, + guint priority) +{ + auto actions = RSPAMD_CFG_ACTIONS(cfg); + auto maybe_act = rspamd::find_map(actions->actions_by_name, action_name); + + if (maybe_act) { + auto *act = maybe_act.value().get().get(); + if (priority >= act->priority) { + msg_info_config("disable action %s; old priority: %ud, new priority: %ud", + action_name, + act->priority, + priority); + + act->threshold = NAN; + act->priority = priority; + act->flags |= RSPAMD_ACTION_NO_THRESHOLD; + + return TRUE; + } + else { + msg_info_config("action %s has been already registered with " + 
"priority %ud, cannot disable it with new priority: %ud", + action_name, + act->priority, + priority); + } + } + + return FALSE; +} + +struct rspamd_action * +rspamd_config_get_action(struct rspamd_config *cfg, const gchar *name) +{ + auto actions = RSPAMD_CFG_ACTIONS(cfg); + auto maybe_act = rspamd::find_map(actions->actions_by_name, name); + + if (maybe_act) { + return maybe_act.value().get().get(); + } + + return nullptr; +} + +struct rspamd_action * +rspamd_config_get_action_by_type(struct rspamd_config *cfg, + enum rspamd_action_type type) +{ + for (const auto &act: RSPAMD_CFG_ACTIONS(cfg)->actions) { + if (act->action_type == type) { + return act.get(); + } + } + + return nullptr; +} + +void rspamd_config_actions_foreach(struct rspamd_config *cfg, + void (*func)(struct rspamd_action *act, void *d), + void *data) +{ + for (const auto &act: RSPAMD_CFG_ACTIONS(cfg)->actions) { + func(act.get(), data); + } +} + +void rspamd_config_actions_foreach_enumerate(struct rspamd_config *cfg, + void (*func)(int idx, struct rspamd_action *act, void *d), + void *data) +{ + for (const auto &[idx, act]: rspamd::enumerate(RSPAMD_CFG_ACTIONS(cfg)->actions)) { + func(idx, act.get(), data); + } +} + +gsize rspamd_config_actions_size(struct rspamd_config *cfg) +{ + return RSPAMD_CFG_ACTIONS(cfg)->actions.size(); +} + +gboolean +rspamd_config_radix_from_ucl(struct rspamd_config *cfg, const ucl_object_t *obj, const gchar *description, + struct rspamd_radix_map_helper **target, GError **err, + struct rspamd_worker *worker, const gchar *map_name) +{ + ucl_type_t type; + ucl_object_iter_t it = nullptr; + const ucl_object_t *cur, *cur_elt; + const gchar *str; + + /* Cleanup */ + *target = nullptr; + + LL_FOREACH(obj, cur_elt) + { + type = ucl_object_type(cur_elt); + + switch (type) { + case UCL_STRING: + /* Either map or a list of IPs */ + str = ucl_object_tostring(cur_elt); + + if (rspamd_map_is_map(str)) { + if (rspamd_map_add_from_ucl(cfg, cur_elt, + description, + rspamd_radix_read, 
+ rspamd_radix_fin, + rspamd_radix_dtor, + (void **) target, + worker, RSPAMD_MAP_DEFAULT) == nullptr) { + g_set_error(err, + g_quark_from_static_string("rspamd-config"), + EINVAL, "bad map definition %s for %s", str, + ucl_object_key(obj)); + return FALSE; + } + + return TRUE; + } + else { + /* Just a list */ + if (!*target) { + *target = rspamd_map_helper_new_radix( + rspamd_map_add_fake(cfg, description, map_name)); + } + + rspamd_map_helper_insert_radix_resolve(*target, str, ""); + } + break; + case UCL_OBJECT: + /* Should be a map description */ + if (rspamd_map_add_from_ucl(cfg, cur_elt, + description, + rspamd_radix_read, + rspamd_radix_fin, + rspamd_radix_dtor, + (void **) target, + worker, RSPAMD_MAP_DEFAULT) == nullptr) { + g_set_error(err, + g_quark_from_static_string("rspamd-config"), + EINVAL, "bad map object for %s", ucl_object_key(obj)); + return FALSE; + } + + return TRUE; + break; + case UCL_ARRAY: + /* List of IP addresses */ + it = ucl_object_iterate_new(cur_elt); + + while ((cur = ucl_object_iterate_safe(it, true)) != nullptr) { + + + if (ucl_object_type(cur) == UCL_STRING) { + str = ucl_object_tostring(cur); + if (!*target) { + *target = rspamd_map_helper_new_radix( + rspamd_map_add_fake(cfg, description, map_name)); + } + + rspamd_map_helper_insert_radix_resolve(*target, str, ""); + } + else { + g_set_error(err, + g_quark_from_static_string("rspamd-config"), + EINVAL, "bad element inside array object for %s: expected string, got: %s", + ucl_object_key(obj), ucl_object_type_to_string(ucl_object_type(cur))); + ucl_object_iterate_free(it); + return FALSE; + } + } + + ucl_object_iterate_free(it); + break; + default: + g_set_error(err, g_quark_from_static_string("rspamd-config"), + EINVAL, "bad map type %s for %s", + ucl_object_type_to_string(type), + ucl_object_key(obj)); + return FALSE; + } + } + + /* Destroy on cfg cleanup */ + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) rspamd_map_helper_destroy_radix, + *target); 
+ + return TRUE; +} + +constexpr const auto action_types = frozen::make_unordered_map<frozen::string, enum rspamd_action_type>({ + {"reject", METRIC_ACTION_REJECT}, + {"greylist", METRIC_ACTION_GREYLIST}, + {"add header", METRIC_ACTION_ADD_HEADER}, + {"add_header", METRIC_ACTION_ADD_HEADER}, + {"rewrite subject", METRIC_ACTION_REWRITE_SUBJECT}, + {"rewrite_subject", METRIC_ACTION_REWRITE_SUBJECT}, + {"soft reject", METRIC_ACTION_SOFT_REJECT}, + {"soft_reject", METRIC_ACTION_SOFT_REJECT}, + {"no action", METRIC_ACTION_NOACTION}, + {"no_action", METRIC_ACTION_NOACTION}, + {"accept", METRIC_ACTION_NOACTION}, + {"quarantine", METRIC_ACTION_QUARANTINE}, + {"discard", METRIC_ACTION_DISCARD}, + +}); + +gboolean +rspamd_action_from_str(const gchar *data, enum rspamd_action_type *result) +{ + auto maybe_action = rspamd::find_map(action_types, std::string_view{data}); + + if (maybe_action) { + *result = maybe_action.value().get(); + return true; + } + else { + return false; + } +} + +const gchar * +rspamd_action_to_str(enum rspamd_action_type action) +{ + switch (action) { + case METRIC_ACTION_REJECT: + return "reject"; + case METRIC_ACTION_SOFT_REJECT: + return "soft reject"; + case METRIC_ACTION_REWRITE_SUBJECT: + return "rewrite subject"; + case METRIC_ACTION_ADD_HEADER: + return "add header"; + case METRIC_ACTION_GREYLIST: + return "greylist"; + case METRIC_ACTION_NOACTION: + return "no action"; + case METRIC_ACTION_MAX: + return "invalid max action"; + case METRIC_ACTION_CUSTOM: + return "custom"; + case METRIC_ACTION_DISCARD: + return "discard"; + case METRIC_ACTION_QUARANTINE: + return "quarantine"; + } + + return "unknown action"; +} + +const gchar * +rspamd_action_to_str_alt(enum rspamd_action_type action) +{ + switch (action) { + case METRIC_ACTION_REJECT: + return "reject"; + case METRIC_ACTION_SOFT_REJECT: + return "soft_reject"; + case METRIC_ACTION_REWRITE_SUBJECT: + return "rewrite_subject"; + case METRIC_ACTION_ADD_HEADER: + return "add_header"; + case 
METRIC_ACTION_GREYLIST: + return "greylist"; + case METRIC_ACTION_NOACTION: + return "no action"; + case METRIC_ACTION_MAX: + return "invalid max action"; + case METRIC_ACTION_CUSTOM: + return "custom"; + case METRIC_ACTION_DISCARD: + return "discard"; + case METRIC_ACTION_QUARANTINE: + return "quarantine"; + } + + return "unknown action"; +} + +static void +rspamd_config_settings_elt_dtor(struct rspamd_config_settings_elt *e) +{ + if (e->symbols_enabled) { + ucl_object_unref(e->symbols_enabled); + } + if (e->symbols_disabled) { + ucl_object_unref(e->symbols_disabled); + } +} + +guint32 +rspamd_config_name_to_id(const gchar *name, gsize namelen) +{ + guint64 h; + + h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + name, namelen, 0x0); + /* Take the lower part of hash as LE number */ + return ((guint32) GUINT64_TO_LE(h)); +} + +struct rspamd_config_settings_elt * +rspamd_config_find_settings_id_ref(struct rspamd_config *cfg, + guint32 id) +{ + struct rspamd_config_settings_elt *cur; + + DL_FOREACH(cfg->setting_ids, cur) + { + if (cur->id == id) { + REF_RETAIN(cur); + return cur; + } + } + + return nullptr; +} + +struct rspamd_config_settings_elt *rspamd_config_find_settings_name_ref( + struct rspamd_config *cfg, + const gchar *name, gsize namelen) +{ + guint32 id; + + id = rspamd_config_name_to_id(name, namelen); + + return rspamd_config_find_settings_id_ref(cfg, id); +} +
/**
 * Register a settings element under the id derived from `name`, replacing
 * an already registered element with the same id.
 *
 * The new element is allocated from `cfg->cfg_pool`, appended to
 * `cfg->setting_ids` and passed to rspamd_symcache_process_settings_elt().
 *
 * @param cfg config owning the settings list, the memory pool and the cache
 * @param name NUL-terminated settings name; it is hashed to get the id and
 *             copied into the config pool
 * @param symbols_enabled optional UCL object; a new reference is taken
 * @param symbols_disabled optional UCL object; a new reference is taken
 * @param policy policy stored in the new element
 */
void rspamd_config_register_settings_id(struct rspamd_config *cfg,
										const gchar *name,
										ucl_object_t *symbols_enabled,
										ucl_object_t *symbols_disabled,
										enum rspamd_config_settings_policy policy)
{
	guint32 id = rspamd_config_name_to_id(name, strlen(name));
	/* The lookup retains the element it returns; that reference is dropped below */
	struct rspamd_config_settings_elt *old_elt = rspamd_config_find_settings_id_ref(cfg, id);

	/* The element is built the same way for both registration and replacement */
	struct rspamd_config_settings_elt *elt =
		rspamd_mempool_alloc0_type(cfg->cfg_pool, struct rspamd_config_settings_elt);

	elt->id = id;
	elt->name = rspamd_mempool_strdup(cfg->cfg_pool, name);

	if (symbols_enabled) {
		elt->symbols_enabled = ucl_object_ref(symbols_enabled);
	}

	if (symbols_disabled) {
		elt->symbols_disabled = ucl_object_ref(symbols_disabled);
	}

	elt->policy = policy;
	REF_INIT_RETAIN(elt, rspamd_config_settings_elt_dtor);

	if (old_elt) {
		DL_DELETE(cfg->setting_ids, old_elt);
		msg_warn_config("replace settings id %ud (%s)", id, name);
	}
	else {
		msg_info_config("register new settings id %ud (%s)", id, name);
	}

	/*
	 * Always feed the element that is being registered to the symbols cache.
	 * The previous code passed the element being replaced here, so the symbol
	 * lists of the replacement were never processed.
	 * NOTE(review): the symcache side is not visible from this file - confirm
	 * that processing the new element is the intended behaviour.
	 */
	rspamd_symcache_process_settings_elt(cfg->cache, elt);
	DL_APPEND(cfg->setting_ids, elt);

	if (old_elt) {
		/*
		 * Need to unref old element twice as there are two reference holders:
		 * 1. Config structure as we call REF_INIT_RETAIN
		 * 2. rspamd_config_find_settings_id_ref also increases refcount
		 */
		REF_RELEASE(old_elt);
		REF_RELEASE(old_elt);
	}
}
+int rspamd_config_ev_backend_get(struct rspamd_config *cfg) +{ +#define AUTO_BACKEND (ev_supported_backends() & ~EVBACKEND_IOURING) + if (cfg == nullptr || cfg->events_backend == nullptr) { + return AUTO_BACKEND; + } + + if (strcmp(cfg->events_backend, "auto") == 0) { + return AUTO_BACKEND; + } + else if (strcmp(cfg->events_backend, "epoll") == 0) { + if (ev_supported_backends() & EVBACKEND_EPOLL) { + return EVBACKEND_EPOLL; + } + else { + msg_warn_config("unsupported events_backend: %s; defaulting to auto", + cfg->events_backend); + return AUTO_BACKEND; + } + } + else if (strcmp(cfg->events_backend, "iouring") == 0) { + if 
(ev_supported_backends() & EVBACKEND_IOURING) { + return EVBACKEND_IOURING; + } + else { + msg_warn_config("unsupported events_backend: %s; defaulting to auto", + cfg->events_backend); + return AUTO_BACKEND; + } + } + else if (strcmp(cfg->events_backend, "kqueue") == 0) { + if (ev_supported_backends() & EVBACKEND_KQUEUE) { + return EVBACKEND_KQUEUE; + } + else { + msg_warn_config("unsupported events_backend: %s; defaulting to auto", + cfg->events_backend); + return AUTO_BACKEND; + } + } + else if (strcmp(cfg->events_backend, "poll") == 0) { + return EVBACKEND_POLL; + } + else if (strcmp(cfg->events_backend, "select") == 0) { + return EVBACKEND_SELECT; + } + else { + msg_warn_config("unknown events_backend: %s; defaulting to auto", + cfg->events_backend); + } + + return AUTO_BACKEND; +} + +const gchar * +rspamd_config_ev_backend_to_string(int ev_backend, gboolean *effective) +{ +#define SET_EFFECTIVE(b) \ + do { \ + if ((effective) != nullptr) *(effective) = b; \ + } while (0) + + if ((ev_backend & EVBACKEND_ALL) == EVBACKEND_ALL) { + SET_EFFECTIVE(TRUE); + return "auto"; + } + + if (ev_backend & EVBACKEND_IOURING) { + SET_EFFECTIVE(TRUE); + return "epoll+io_uring"; + } + if (ev_backend & EVBACKEND_LINUXAIO) { + SET_EFFECTIVE(TRUE); + return "epoll+aio"; + } + if (ev_backend & EVBACKEND_IOURING) { + SET_EFFECTIVE(TRUE); + return "epoll+io_uring"; + } + if (ev_backend & EVBACKEND_LINUXAIO) { + SET_EFFECTIVE(TRUE); + return "epoll+aio"; + } + if (ev_backend & EVBACKEND_EPOLL) { + SET_EFFECTIVE(TRUE); + return "epoll"; + } + if (ev_backend & EVBACKEND_KQUEUE) { + SET_EFFECTIVE(TRUE); + return "kqueue"; + } + if (ev_backend & EVBACKEND_POLL) { + SET_EFFECTIVE(FALSE); + return "poll"; + } + if (ev_backend & EVBACKEND_SELECT) { + SET_EFFECTIVE(FALSE); + return "select"; + } + + SET_EFFECTIVE(FALSE); + return "unknown"; +#undef SET_EFFECTIVE +} + +struct rspamd_external_libs_ctx * +rspamd_init_libs(void) +{ + struct rlimit rlim; + struct ottery_config *ottery_cfg; + + auto 
*ctx = g_new0(struct rspamd_external_libs_ctx, 1); + ctx->crypto_ctx = rspamd_cryptobox_init(); + ottery_cfg = (struct ottery_config *) g_malloc0(ottery_get_sizeof_config()); + ottery_config_init(ottery_cfg); + ctx->ottery_cfg = ottery_cfg; + + rspamd_openssl_maybe_init(); + + /* Check if we have rdrand */ + if ((ctx->crypto_ctx->cpu_config & CPUID_RDRAND) == 0) { + ottery_config_disable_entropy_sources(ottery_cfg, + OTTERY_ENTROPY_SRC_RDRAND); + } + + g_assert(ottery_init(ottery_cfg) == 0); +#if OPENSSL_VERSION_NUMBER >= 0x1000104fL && OPENSSL_VERSION_NUMBER < 0x30000000L && !defined(LIBRESSL_VERSION_NUMBER) + RAND_set_rand_engine(nullptr); +#endif + + /* Configure utf8 library */ + guint utf8_flags = 0; + + if ((ctx->crypto_ctx->cpu_config & CPUID_SSE41)) { + utf8_flags |= RSPAMD_FAST_UTF8_FLAG_SSE41; + } + if ((ctx->crypto_ctx->cpu_config & CPUID_AVX2)) { + utf8_flags |= RSPAMD_FAST_UTF8_FLAG_AVX2; + } + + rspamd_fast_utf8_library_init(utf8_flags); + +#ifdef HAVE_LOCALE_H + if (getenv("LANG") == nullptr) { + setlocale(LC_ALL, "C"); + setlocale(LC_CTYPE, "C"); + setlocale(LC_MESSAGES, "C"); + setlocale(LC_TIME, "C"); + } + else { + /* Just set the default locale */ + setlocale(LC_ALL, ""); + /* But for some issues we still want C locale */ + setlocale(LC_NUMERIC, "C"); + } +#endif + + ctx->ssl_ctx = rspamd_init_ssl_ctx(); + ctx->ssl_ctx_noverify = rspamd_init_ssl_ctx_noverify(); + rspamd_random_seed_fast(); + + /* Set stack size for pcre */ + getrlimit(RLIMIT_STACK, &rlim); + rlim.rlim_cur = 100 * 1024 * 1024; + rlim.rlim_max = rlim.rlim_cur; + setrlimit(RLIMIT_STACK, &rlim); + + ctx->local_addrs = rspamd_inet_library_init(); + REF_INIT_RETAIN(ctx, rspamd_deinit_libs); + + return ctx; +} + +static struct zstd_dictionary * +rspamd_open_zstd_dictionary(const char *path) +{ + struct zstd_dictionary *dict; + + dict = g_new0(zstd_dictionary, 1); + dict->dict = rspamd_file_xmap(path, PROT_READ, &dict->size, TRUE); + + if (dict->dict == nullptr) { + g_free(dict); + + 
return nullptr; + } + + dict->id = -1; + + if (dict->id == 0) { + g_free(dict); + + return nullptr; + } + + return dict; +} + +static void +rspamd_free_zstd_dictionary(struct zstd_dictionary *dict) +{ + if (dict) { + munmap(dict->dict, dict->size); + g_free(dict); + } +} + +#ifdef HAVE_OPENBLAS_SET_NUM_THREADS +extern "C" void openblas_set_num_threads(int num_threads); +#endif +#ifdef HAVE_BLI_THREAD_SET_NUM_THREADS +extern "C" void bli_thread_set_num_threads(int num_threads); +#endif + +gboolean +rspamd_config_libs(struct rspamd_external_libs_ctx *ctx, + struct rspamd_config *cfg) +{ + size_t r; + gboolean ret = TRUE; + + g_assert(cfg != nullptr); + + if (ctx != nullptr) { + if (cfg->local_addrs) { + GError *err = nullptr; + ret = rspamd_config_radix_from_ucl(cfg, cfg->local_addrs, + "Local addresses", + (struct rspamd_radix_map_helper **) ctx->local_addrs, + &err, + nullptr, "local addresses"); + + if (!ret) { + msg_err_config("cannot load local addresses: %e", err); + g_error_free(err); + + return ret; + } + } + + rspamd_free_zstd_dictionary(ctx->in_dict); + rspamd_free_zstd_dictionary(ctx->out_dict); + + if (ctx->out_zstream) { + ZSTD_freeCStream((ZSTD_CCtx *) ctx->out_zstream); + ctx->out_zstream = nullptr; + } + + if (ctx->in_zstream) { + ZSTD_freeDStream((ZSTD_DCtx *) ctx->in_zstream); + ctx->in_zstream = nullptr; + } + + if (cfg->zstd_input_dictionary) { + ctx->in_dict = rspamd_open_zstd_dictionary( + cfg->zstd_input_dictionary); + + if (ctx->in_dict == nullptr) { + msg_err_config("cannot open zstd dictionary in %s", + cfg->zstd_input_dictionary); + } + } + if (cfg->zstd_output_dictionary) { + ctx->out_dict = rspamd_open_zstd_dictionary( + cfg->zstd_output_dictionary); + + if (ctx->out_dict == nullptr) { + msg_err_config("cannot open zstd dictionary in %s", + cfg->zstd_output_dictionary); + } + } + + if (cfg->fips_mode) { +#ifdef HAVE_FIPS_MODE + int mode = FIPS_mode(); + unsigned long err = (unsigned long) -1; + + /* Toggle FIPS mode */ + if (mode == 0) { 
+#if defined(OPENSSL_VERSION_MAJOR) && (OPENSSL_VERSION_MAJOR >= 3) + if (EVP_set_default_properties(nullptr, "fips=yes") != 1) { +#else + if (FIPS_mode_set(1) != 1) { +#endif + err = ERR_get_error(); + } + } + else { + msg_info_config("OpenSSL FIPS mode is already enabled"); + } + + if (err != (unsigned long) -1) { +#if defined(OPENSSL_VERSION_MAJOR) && (OPENSSL_VERSION_MAJOR >= 3) + msg_err_config("EVP_set_default_properties failed: %s", +#else + msg_err_config("FIPS_mode_set failed: %s", +#endif + ERR_error_string(err, nullptr)); + ret = FALSE; + } + else { + msg_info_config("OpenSSL FIPS mode is enabled"); + } +#else + msg_warn_config("SSL FIPS mode is enabled but not supported by OpenSSL library!"); +#endif + } + + rspamd_ssl_ctx_config(cfg, ctx->ssl_ctx); + rspamd_ssl_ctx_config(cfg, ctx->ssl_ctx_noverify); + + /* Init decompression */ + ctx->in_zstream = ZSTD_createDStream(); + r = ZSTD_initDStream((ZSTD_DCtx *) ctx->in_zstream); + + if (ZSTD_isError(r)) { + msg_err("cannot init decompression stream: %s", + ZSTD_getErrorName(r)); + ZSTD_freeDStream((ZSTD_DCtx *) ctx->in_zstream); + ctx->in_zstream = nullptr; + } + + /* Init compression */ + ctx->out_zstream = ZSTD_createCStream(); + r = ZSTD_initCStream((ZSTD_CCtx *) ctx->out_zstream, 1); + + if (ZSTD_isError(r)) { + msg_err("cannot init compression stream: %s", + ZSTD_getErrorName(r)); + ZSTD_freeCStream((ZSTD_CCtx *) ctx->out_zstream); + ctx->out_zstream = nullptr; + } +#ifdef HAVE_OPENBLAS_SET_NUM_THREADS + openblas_set_num_threads(cfg->max_blas_threads); +#endif +#ifdef HAVE_BLI_THREAD_SET_NUM_THREADS + bli_thread_set_num_threads(cfg->max_blas_threads); +#endif + } + + return ret; +} + +gboolean +rspamd_libs_reset_decompression(struct rspamd_external_libs_ctx *ctx) +{ + gsize r; + + if (ctx->in_zstream == nullptr) { + return FALSE; + } + else { + r = ZSTD_DCtx_reset((ZSTD_DCtx *) ctx->in_zstream, ZSTD_reset_session_only); + + if (ZSTD_isError(r)) { + msg_err("cannot init decompression stream: %s", + 
ZSTD_getErrorName(r)); + ZSTD_freeDStream((ZSTD_DCtx *) ctx->in_zstream); + ctx->in_zstream = nullptr; + + return FALSE; + } + } + + return TRUE; +} + +gboolean +rspamd_libs_reset_compression(struct rspamd_external_libs_ctx *ctx) +{ + gsize r; + + if (ctx->out_zstream == nullptr) { + return FALSE; + } + else { + /* Dictionary will be reused automatically if specified */ + r = ZSTD_CCtx_reset((ZSTD_CCtx *) ctx->out_zstream, ZSTD_reset_session_only); + if (!ZSTD_isError(r)) { + r = ZSTD_CCtx_setPledgedSrcSize((ZSTD_CCtx *) ctx->out_zstream, ZSTD_CONTENTSIZE_UNKNOWN); + } + + if (ZSTD_isError(r)) { + msg_err("cannot init compression stream: %s", + ZSTD_getErrorName(r)); + ZSTD_freeCStream((ZSTD_CCtx *) ctx->out_zstream); + ctx->out_zstream = nullptr; + + return FALSE; + } + } + + return TRUE; +} + +void rspamd_deinit_libs(struct rspamd_external_libs_ctx *ctx) +{ + if (ctx != nullptr) { + g_free(ctx->ottery_cfg); + +#ifdef HAVE_OPENSSL + EVP_cleanup(); + ERR_free_strings(); + rspamd_ssl_ctx_free(ctx->ssl_ctx); + rspamd_ssl_ctx_free(ctx->ssl_ctx_noverify); +#endif + rspamd_inet_library_destroy(); + rspamd_free_zstd_dictionary(ctx->in_dict); + rspamd_free_zstd_dictionary(ctx->out_dict); + + if (ctx->out_zstream) { + ZSTD_freeCStream((ZSTD_CCtx *) ctx->out_zstream); + } + + if (ctx->in_zstream) { + ZSTD_freeDStream((ZSTD_DCtx *) ctx->in_zstream); + } + + rspamd_cryptobox_deinit(ctx->crypto_ctx); + + g_free(ctx); + } +} + +gboolean +rspamd_ip_is_local_cfg(struct rspamd_config *cfg, + const rspamd_inet_addr_t *addr) +{ + struct rspamd_radix_map_helper *local_addrs = nullptr; + + if (cfg && cfg->libs_ctx) { + local_addrs = *(struct rspamd_radix_map_helper **) cfg->libs_ctx->local_addrs; + } + + if (rspamd_inet_address_is_local(addr)) { + return TRUE; + } + + if (local_addrs) { + if (rspamd_match_radix_map_addr(local_addrs, addr) != nullptr) { + return TRUE; + } + } + + return FALSE; +} diff --git a/src/libserver/composites/composites.cxx 
b/src/libserver/composites/composites.cxx new file mode 100644 index 0000000..aa231a3 --- /dev/null +++ b/src/libserver/composites/composites.cxx @@ -0,0 +1,989 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "logger.h" +#include "expression.h" +#include "task.h" +#include "utlist.h" +#include "scan_result.h" +#include "composites.h" + +#include <cmath> +#include <vector> +#include <variant> +#include "libutil/cxx/util.hxx" +#include "contrib/ankerl/unordered_dense.h" + +#include "composites_internal.hxx" + +#define msg_err_composites(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_composites(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_composites(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#define msg_debug_composites(...) 
rspamd_conditional_debug_fast(NULL, task->from_addr, \ + rspamd_composites_log_id, "composites", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(composites) + + +namespace rspamd::composites { +static rspamd_expression_atom_t *rspamd_composite_expr_parse(const gchar *line, gsize len, + rspamd_mempool_t *pool, + gpointer ud, GError **err); +static gdouble rspamd_composite_expr_process(void *ud, rspamd_expression_atom_t *atom); +static gint rspamd_composite_expr_priority(rspamd_expression_atom_t *atom); +static void rspamd_composite_expr_destroy(rspamd_expression_atom_t *atom); +static void composites_foreach_callback(gpointer key, gpointer value, void *data); + +const struct rspamd_atom_subr composite_expr_subr = { + .parse = rspamd::composites::rspamd_composite_expr_parse, + .process = rspamd::composites::rspamd_composite_expr_process, + .priority = rspamd::composites::rspamd_composite_expr_priority, + .destroy = rspamd::composites::rspamd_composite_expr_destroy}; +}// namespace rspamd::composites + +namespace rspamd::composites { + +static constexpr const double epsilon = 0.00001; + +struct symbol_remove_data { + const char *sym; + struct rspamd_composite *comp; + GNode *parent; + std::uint8_t action; +}; + +struct composites_data { + struct rspamd_task *task; + struct rspamd_composite *composite; + struct rspamd_scan_result *metric_res; + ankerl::unordered_dense::map<std::string_view, + std::vector<symbol_remove_data>> + symbols_to_remove; + std::vector<bool> checked; + + explicit composites_data(struct rspamd_task *task, struct rspamd_scan_result *mres) + : task(task), composite(nullptr), metric_res(mres) + { + checked.resize(rspamd_composites_manager_nelts(task->cfg->composites_manager) * 2, + false); + } +}; + +struct rspamd_composite_option_match { + rspamd_regexp_t *re; + std::string match; + + explicit rspamd_composite_option_match(const char *start, std::size_t len) noexcept + : re(nullptr), match(start, len) + { + } + + 
explicit rspamd_composite_option_match(rspamd_regexp_t *re) noexcept + : re(rspamd_regexp_ref(re)) + { + } + + rspamd_composite_option_match(const rspamd_composite_option_match &other) noexcept + { + if (other.re) { + re = rspamd_regexp_ref(other.re); + } + else { + match = other.match; + re = nullptr; + } + } + rspamd_composite_option_match &operator=(const rspamd_composite_option_match &other) noexcept + { + if (other.re) { + if (re) { + rspamd_regexp_unref(re); + } + re = rspamd_regexp_ref(other.re); + } + else { + if (re) { + rspamd_regexp_unref(re); + } + re = nullptr; + match = other.match; + } + + return *this; + } + + rspamd_composite_option_match(rspamd_composite_option_match &&other) noexcept + { + if (other.re) { + re = other.re; + other.re = nullptr; + } + else { + re = nullptr; + match = std::move(other.match); + } + } + rspamd_composite_option_match &operator=(rspamd_composite_option_match &&other) noexcept + { + if (other.re) { + if (re) { + rspamd_regexp_unref(re); + } + re = other.re; + other.re = nullptr; + } + else { + if (re) { + rspamd_regexp_unref(re); + } + re = nullptr; + match = std::move(other.match); + } + + return *this; + } + + ~rspamd_composite_option_match() + { + if (re) { + rspamd_regexp_unref(re); + } + } + + auto match_opt(const std::string_view &data) const -> bool + { + if (re) { + return rspamd_regexp_search(re, + data.data(), data.size(), + nullptr, nullptr, false, nullptr); + } + else { + return data == match; + } + } + + auto get_pat() const -> std::string_view + { + if (re) { + return std::string_view(rspamd_regexp_get_pattern(re)); + } + else { + return match; + } + } +}; + +enum class rspamd_composite_atom_type { + ATOM_UNKNOWN, + ATOM_COMPOSITE, + ATOM_PLAIN +}; + +struct rspamd_composite_atom { + std::string symbol; + std::string_view norm_symbol; + rspamd_composite_atom_type comp_type = rspamd_composite_atom_type::ATOM_UNKNOWN; + const struct rspamd_composite *ncomp; /* underlying composite */ + 
std::vector<rspamd_composite_option_match> opts; +}; + +enum rspamd_composite_action : std::uint8_t { + RSPAMD_COMPOSITE_UNTOUCH = 0, + RSPAMD_COMPOSITE_REMOVE_SYMBOL = (1u << 0), + RSPAMD_COMPOSITE_REMOVE_WEIGHT = (1u << 1), + RSPAMD_COMPOSITE_REMOVE_FORCED = (1u << 2) +}; + +static GQuark +rspamd_composites_quark(void) +{ + return g_quark_from_static_string("composites"); +} + +static auto +rspamd_composite_atom_dtor(void *ptr) +{ + auto *atom = reinterpret_cast<rspamd_composite_atom *>(ptr); + + delete atom; +} + +static rspamd_expression_atom_t * +rspamd_composite_expr_parse(const gchar *line, gsize len, + rspamd_mempool_t *pool, + gpointer ud, GError **err) +{ + gsize clen = 0; + const gchar *p, *end; + enum composite_expr_state { + comp_state_read_symbol = 0, + comp_state_read_obrace, + comp_state_read_option, + comp_state_read_regexp, + comp_state_read_regexp_end, + comp_state_read_comma, + comp_state_read_ebrace, + comp_state_read_end + } state = comp_state_read_symbol; + + end = line + len; + p = line; + + /* Find length of the atom using a reduced state machine */ + while (p < end) { + if (state == comp_state_read_end) { + break; + } + + switch (state) { + case comp_state_read_symbol: + clen = rspamd_memcspn(p, "[; \t()><!|&\n", len); + p += clen; + + if (*p == '[') { + state = comp_state_read_obrace; + } + else { + state = comp_state_read_end; + } + break; + case comp_state_read_obrace: + p++; + + if (*p == '/') { + p++; + state = comp_state_read_regexp; + } + else { + state = comp_state_read_option; + } + break; + case comp_state_read_regexp: + if (*p == '\\' && p + 1 < end) { + /* Escaping */ + p++; + } + else if (*p == '/') { + /* End of regexp, possible flags */ + state = comp_state_read_regexp_end; + } + p++; + break; + case comp_state_read_option: + case comp_state_read_regexp_end: + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + p++; + } + break; + case 
comp_state_read_comma: + if (!g_ascii_isspace(*p)) { + if (*p == '/') { + state = comp_state_read_regexp; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + state = comp_state_read_option; + } + } + else { + /* Skip spaces after comma */ + p++; + } + break; + case comp_state_read_ebrace: + p++; + state = comp_state_read_end; + break; + case comp_state_read_end: + g_assert_not_reached(); + } + } + + if (state != comp_state_read_end) { + g_set_error(err, rspamd_composites_quark(), 100, "invalid composite: %s;" + "parser stopped in state %d", + line, state); + return NULL; + } + + clen = p - line; + p = line; + state = comp_state_read_symbol; + + auto *atom = new rspamd_composite_atom; + auto *res = rspamd_mempool_alloc0_type(pool, rspamd_expression_atom_t); + res->len = clen; + res->str = line; + + /* Full state machine to fill a composite atom */ + const gchar *opt_start = nullptr; + + while (p < end) { + if (state == comp_state_read_end) { + break; + } + + switch (state) { + case comp_state_read_symbol: { + clen = rspamd_memcspn(p, "[; \t()><!|&\n", len); + p += clen; + + if (*p == '[') { + state = comp_state_read_obrace; + } + else { + state = comp_state_read_end; + } + + atom->symbol = std::string{line, clen}; + auto norm_start = std::find_if(atom->symbol.begin(), atom->symbol.end(), + [](char c) { return g_ascii_isalnum(c); }); + if (norm_start == atom->symbol.end()) { + msg_err_pool("invalid composite atom: %s", atom->symbol.c_str()); + } + atom->norm_symbol = make_string_view_from_it(norm_start, atom->symbol.end()); + break; + } + case comp_state_read_obrace: + p++; + + if (*p == '/') { + opt_start = p; + p++; /* Starting slash */ + state = comp_state_read_regexp; + } + else { + state = comp_state_read_option; + opt_start = p; + } + + break; + case comp_state_read_regexp: + if (*p == '\\' && p + 1 < end) { + /* Escaping */ + p++; + } + else if (*p == '/') { + /* End of regexp, possible flags */ + state = comp_state_read_regexp_end; + 
} + p++; + break; + case comp_state_read_option: + if (*p == ',' || *p == ']') { + /* Plain match, copy option to ensure string_view validity */ + gint opt_len = p - opt_start; + auto *opt_buf = rspamd_mempool_alloc_buffer(pool, opt_len + 1); + rspamd_strlcpy(opt_buf, opt_start, opt_len + 1); + opt_buf = g_strstrip(opt_buf); + atom->opts.emplace_back(opt_buf, strlen(opt_buf)); + + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else { + state = comp_state_read_ebrace; + } + } + else { + p++; + } + break; + case comp_state_read_regexp_end: + if (*p == ',' || *p == ']') { + auto opt_len = p - opt_start; + rspamd_regexp_t *re; + GError *re_err = nullptr; + + re = rspamd_regexp_new_len(opt_start, opt_len, nullptr, &re_err); + + if (re == nullptr) { + msg_err_pool("cannot create regexp from string %*s: %e", + opt_len, opt_start, re_err); + + g_error_free(re_err); + } + else { + atom->opts.emplace_back(re); + rspamd_regexp_unref(re); + } + + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else { + state = comp_state_read_ebrace; + } + } + else { + p++; + } + break; + case comp_state_read_comma: + if (!g_ascii_isspace(*p)) { + if (*p == '/') { + state = comp_state_read_regexp; + opt_start = p; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + opt_start = p; + state = comp_state_read_option; + } + } + else { + /* Skip spaces after comma */ + p++; + } + break; + case comp_state_read_ebrace: + p++; + state = comp_state_read_end; + break; + case comp_state_read_end: + g_assert_not_reached(); + } + } + + res->data = atom; + + return res; +} + +static auto +process_symbol_removal(rspamd_expression_atom_t *atom, + struct composites_data *cd, + struct rspamd_symbol_result *ms, + const std::string &beg) -> void +{ + struct rspamd_task *task = cd->task; + + if (ms == nullptr) { + return; + } + + /* + * At this point we know that we need to do something about this symbol, + * however, we don't know whether we need to delete it 
unfortunately, + * that depends on the later decisions when the complete expression is + * evaluated. + */ + auto rd_it = cd->symbols_to_remove.find(ms->name); + + auto fill_removal_structure = [&](symbol_remove_data &nrd) { + nrd.sym = ms->name; + + /* By default remove symbols */ + switch (cd->composite->policy) { + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL: + default: + nrd.action = (RSPAMD_COMPOSITE_REMOVE_SYMBOL | RSPAMD_COMPOSITE_REMOVE_WEIGHT); + break; + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL: + nrd.action = RSPAMD_COMPOSITE_REMOVE_SYMBOL; + break; + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT: + nrd.action = RSPAMD_COMPOSITE_REMOVE_WEIGHT; + break; + case rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_LEAVE: + nrd.action = 0; + break; + } + + for (auto t: beg) { + if (t == '~') { + nrd.action &= ~RSPAMD_COMPOSITE_REMOVE_SYMBOL; + } + else if (t == '-') { + nrd.action &= ~(RSPAMD_COMPOSITE_REMOVE_WEIGHT | + RSPAMD_COMPOSITE_REMOVE_SYMBOL); + } + else if (t == '^') { + nrd.action |= RSPAMD_COMPOSITE_REMOVE_FORCED; + } + else { + break; + } + } + + nrd.comp = cd->composite; + nrd.parent = atom->parent; + }; + + if (rd_it != cd->symbols_to_remove.end()) { + fill_removal_structure(rd_it->second.emplace_back()); + msg_debug_composites("%s: added symbol %s to removal: %d policy, from composite %s", + cd->metric_res->name, + ms->name, rd_it->second.back().action, + cd->composite->sym.c_str()); + } + else { + std::vector<symbol_remove_data> nrd; + fill_removal_structure(nrd.emplace_back()); + msg_debug_composites("%s: added symbol %s to removal: %d policy, from composite %s", + cd->metric_res->name, + ms->name, nrd.front().action, + cd->composite->sym.c_str()); + cd->symbols_to_remove[ms->name] = std::move(nrd); + } +} + +static auto +process_single_symbol(struct composites_data *cd, + std::string_view sym, + struct rspamd_symbol_result **pms, + struct rspamd_composite_atom *atom) -> 
double +{ + struct rspamd_symbol_result *ms = nullptr; + gdouble rc = 0; + struct rspamd_task *task = cd->task; + + if ((ms = rspamd_task_find_symbol_result(cd->task, sym.data(), cd->metric_res)) == nullptr) { + msg_debug_composites("not found symbol %s in composite %s", sym.data(), + cd->composite->sym.c_str()); + + if (G_UNLIKELY(atom->comp_type == rspamd_composite_atom_type::ATOM_UNKNOWN)) { + const struct rspamd_composite *ncomp; + + if ((ncomp = COMPOSITE_MANAGER_FROM_PTR(task->cfg->composites_manager)->find(sym)) != NULL) { + atom->comp_type = rspamd_composite_atom_type::ATOM_COMPOSITE; + atom->ncomp = ncomp; + } + else { + atom->comp_type = rspamd_composite_atom_type::ATOM_PLAIN; + } + } + + if (atom->comp_type == rspamd_composite_atom_type::ATOM_COMPOSITE) { + msg_debug_composites("symbol %s for composite %s is another composite", + sym.data(), cd->composite->sym.c_str()); + + if (!cd->checked[atom->ncomp->id * 2]) { + msg_debug_composites("composite dependency %s for %s is not checked", + sym.data(), cd->composite->sym.c_str()); + /* Set checked for this symbol to avoid cyclic references */ + cd->checked[cd->composite->id * 2] = true; + auto *saved = cd->composite; /* Save the current composite */ + composites_foreach_callback((gpointer) atom->ncomp->sym.c_str(), + (gpointer) atom->ncomp, (gpointer) cd); + /* Restore state */ + cd->composite = saved; + cd->checked[cd->composite->id * 2] = false; + + ms = rspamd_task_find_symbol_result(cd->task, sym.data(), + cd->metric_res); + } + else { + /* + * XXX: in case of cyclic references this would return 0 + */ + if (cd->checked[atom->ncomp->id * 2 + 1]) { + ms = rspamd_task_find_symbol_result(cd->task, sym.data(), + cd->metric_res); + } + } + } + } + + if (ms) { + msg_debug_composites("found symbol %s in composite %s, weight: %.3f", + sym.data(), cd->composite->sym.c_str(), ms->score); + + /* Now check options */ + for (const auto &cur_opt: atom->opts) { + struct rspamd_symbol_option *opt; + auto found = false; 
+ + DL_FOREACH(ms->opts_head, opt) + { + if (cur_opt.match_opt({opt->option, opt->optlen})) { + found = true; + break; + } + } + + if (!found) { + auto pat = cur_opt.get_pat(); + msg_debug_composites("symbol %s in composite %s misses required option %*s", + sym.data(), + cd->composite->sym.c_str(), + (int) pat.size(), pat.data()); + ms = nullptr; + + break; + } + } + + if (ms) { + if (ms->score == 0) { + rc = epsilon * 16.0; /* Distinguish from 0 */ + } + else { + rc = ms->score; + } + } + } + + *pms = ms; + return rc; +} + +static auto +rspamd_composite_expr_process(void *ud, rspamd_expression_atom_t *atom) -> double +{ + struct composites_data *cd = (struct composites_data *) ud; + struct rspamd_composite_atom *comp_atom = (struct rspamd_composite_atom *) atom->data; + + struct rspamd_symbol_result *ms = NULL; + struct rspamd_task *task = cd->task; + gdouble rc = 0; + + if (cd->checked[cd->composite->id * 2]) { + /* We have already checked this composite, so just return its value */ + if (cd->checked[cd->composite->id * 2 + 1]) { + ms = rspamd_task_find_symbol_result(cd->task, + comp_atom->norm_symbol.data(), + cd->metric_res); + } + + if (ms) { + if (ms->score == 0) { + rc = epsilon; /* Distinguish from 0 */ + } + else { + /* Treat negative and positive scores equally... 
*/ + rc = fabs(ms->score); + } + } + + msg_debug_composites("composite %s is already checked, result: %.2f", + cd->composite->sym.c_str(), rc); + + return rc; + } + + /* Note: sym is zero terminated as it is a view on std::string */ + auto sym = comp_atom->norm_symbol; + auto group_process_functor = [&](auto cond, int sub_start) -> double { + auto max = 0.; + GHashTableIter it; + gpointer k, v; + struct rspamd_symbols_group *gr; + + gr = (struct rspamd_symbols_group *) g_hash_table_lookup(cd->task->cfg->groups, + sym.substr(sub_start).data()); + + if (gr != nullptr) { + g_hash_table_iter_init(&it, gr->symbols); + + while (g_hash_table_iter_next(&it, &k, &v)) { + auto *sdef = (rspamd_symbol *) v; + + if (cond(sdef->score)) { + rc = process_single_symbol(cd, + std::string_view(sdef->name), + &ms, + comp_atom); + + if (fabs(rc) > epsilon) { + process_symbol_removal(atom, + cd, + ms, + comp_atom->symbol); + + if (fabs(rc) > max) { + max = fabs(rc); + } + } + } + } + } + + return max; + }; + + if (sym.size() > 2) { + if (sym.substr(0, 2) == "g:") { + rc = group_process_functor([](auto _) { return true; }, 2); + } + else if (sym.substr(0, 3) == "g+:") { + /* Group, positive symbols only */ + rc = group_process_functor([](auto sc) { return sc > 0.; }, 3); + } + else if (sym.substr(0, 3) == "g-:") { + rc = group_process_functor([](auto sc) { return sc < 0.; }, 3); + } + else { + rc = process_single_symbol(cd, sym, &ms, comp_atom); + + if (fabs(rc) > epsilon) { + process_symbol_removal(atom, + cd, + ms, + comp_atom->symbol); + } + } + } + else { + rc = process_single_symbol(cd, sym, &ms, comp_atom); + + if (fabs(rc) > epsilon) { + process_symbol_removal(atom, + cd, + ms, + comp_atom->symbol); + } + } + + msg_debug_composites("%s: result for atom %s in composite %s is %.4f", + cd->metric_res->name, + comp_atom->norm_symbol.data(), + cd->composite->sym.c_str(), rc); + + return rc; +} + +/* + * We don't have preferences for composites + */ +static gint 
+rspamd_composite_expr_priority(rspamd_expression_atom_t *atom) +{ + return 0; +} + +static void +rspamd_composite_expr_destroy(rspamd_expression_atom_t *atom) +{ + rspamd_composite_atom_dtor(atom->data); +} + +static void +composites_foreach_callback(gpointer key, gpointer value, void *data) +{ + auto *cd = (struct composites_data *) data; + auto *comp = (struct rspamd_composite *) value; + auto *str_key = (const gchar *) key; + struct rspamd_task *task; + gdouble rc; + + cd->composite = comp; + task = cd->task; + + msg_debug_composites("process composite %s", str_key); + + if (!cd->checked[cd->composite->id * 2]) { + if (rspamd_symcache_is_checked(cd->task, cd->task->cfg->cache, + str_key)) { + msg_debug_composites("composite %s is checked in symcache but not " + "in composites bitfield", + cd->composite->sym.c_str()); + cd->checked[comp->id * 2] = true; + cd->checked[comp->id * 2 + 1] = false; + } + else { + if (rspamd_task_find_symbol_result(cd->task, str_key, + cd->metric_res) != nullptr) { + /* Already set, no need to check */ + msg_debug_composites("composite %s is already in metric " + "in composites bitfield", + cd->composite->sym.c_str()); + cd->checked[comp->id * 2] = true; + cd->checked[comp->id * 2 + 1] = true; + + return; + } + + msg_debug_composites("%s: start processing composite %s", + cd->metric_res->name, + cd->composite->sym.c_str()); + + rc = rspamd_process_expression(comp->expr, RSPAMD_EXPRESSION_FLAG_NOOPT, + cd); + + /* Checked bit */ + cd->checked[comp->id * 2] = true; + + msg_debug_composites("%s: final result for composite %s is %.4f", + cd->metric_res->name, + cd->composite->sym.c_str(), rc); + + /* Result bit */ + if (fabs(rc) > epsilon) { + cd->checked[comp->id * 2 + 1] = true; + rspamd_task_insert_result_full(cd->task, str_key, 1.0, NULL, + RSPAMD_SYMBOL_INSERT_SINGLE, cd->metric_res); + } + else { + cd->checked[comp->id * 2 + 1] = false; + } + } + } +} + + +static auto +remove_symbols(const composites_data &cd, const 
std::vector<symbol_remove_data> &rd) -> void +{ + struct rspamd_task *task = cd.task; + gboolean skip = FALSE, + has_valid_op = FALSE, + want_remove_score = TRUE, + want_remove_symbol = TRUE, + want_forced = FALSE; + const gchar *disable_score_reason = "no policy", + *disable_symbol_reason = "no policy"; + + task = cd.task; + + for (const auto &cur: rd) { + if (!cd.checked[cur.comp->id * 2 + 1]) { + continue; + } + /* + * First of all exclude all elements with any parent that is negation: + * !A || B -> here we can have both !A and B matched, but we do *NOT* + * want to remove symbol in that case + */ + auto *par = cur.parent; + skip = FALSE; + + while (par) { + if (rspamd_expression_node_is_op(par, OP_NOT)) { + skip = TRUE; + break; + } + + par = par->parent; + } + + if (skip) { + continue; + } + + has_valid_op = TRUE; + /* + * Now we can try to remove symbols/scores + * + * We apply the following logic here: + * - if no composites would like to save score then we remove score + * - if no composites would like to save symbol then we remove symbol + */ + if (!want_forced) { + if (!(cur.action & RSPAMD_COMPOSITE_REMOVE_SYMBOL)) { + want_remove_symbol = FALSE; + disable_symbol_reason = cur.comp->sym.c_str(); + } + + if (!(cur.action & RSPAMD_COMPOSITE_REMOVE_WEIGHT)) { + want_remove_score = FALSE; + disable_score_reason = cur.comp->sym.c_str(); + } + + if (cur.action & RSPAMD_COMPOSITE_REMOVE_FORCED) { + want_forced = TRUE; + disable_symbol_reason = cur.comp->sym.c_str(); + disable_score_reason = cur.comp->sym.c_str(); + } + } + } + + auto *ms = rspamd_task_find_symbol_result(task, rd.front().sym, cd.metric_res); + + if (has_valid_op && ms && !(ms->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) { + + if (want_remove_score || want_forced) { + msg_debug_composites("%s: %s remove symbol weight for %s (was %.2f), " + "score removal affected by %s, symbol removal affected by %s", + cd.metric_res->name, + (want_forced ? 
"forced" : "normal"), rd.front().sym, ms->score, + disable_score_reason, disable_symbol_reason); + cd.metric_res->score -= ms->score; + ms->score = 0.0; + } + + if (want_remove_symbol || want_forced) { + ms->flags |= RSPAMD_SYMBOL_RESULT_IGNORED; + msg_debug_composites("%s: %s remove symbol %s (score %.2f), " + "score removal affected by %s, symbol removal affected by %s", + cd.metric_res->name, + (want_forced ? "forced" : "normal"), rd.front().sym, ms->score, + disable_score_reason, disable_symbol_reason); + } + } +} + +static void +composites_metric_callback(struct rspamd_task *task) +{ + std::vector<composites_data> comp_data_vec; + struct rspamd_scan_result *mres; + + comp_data_vec.reserve(1); + + DL_FOREACH(task->result, mres) + { + auto &cd = comp_data_vec.emplace_back(task, mres); + + /* Process metric result */ + rspamd_symcache_composites_foreach(task, + task->cfg->cache, + composites_foreach_callback, + &cd); + } + + for (const auto &cd: comp_data_vec) { + /* Remove symbols that are in composites */ + for (const auto &srd_it: cd.symbols_to_remove) { + remove_symbols(cd, srd_it.second); + } + } +} + +}// namespace rspamd::composites + + +void rspamd_composites_process_task(struct rspamd_task *task) +{ + if (task->result && !RSPAMD_TASK_IS_SKIPPED(task)) { + rspamd::composites::composites_metric_callback(task); + } +} diff --git a/src/libserver/composites/composites.h b/src/libserver/composites/composites.h new file mode 100644 index 0000000..5d58029 --- /dev/null +++ b/src/libserver/composites/composites.h @@ -0,0 +1,64 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_COMPOSITES_H_ +#define SRC_LIBSERVER_COMPOSITES_H_ + +#include "config.h" +#include "contrib/libucl/ucl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_config; + +/** + * Process all results and form composite metrics from existent metrics as it is defined in config + * @param task worker's task that present message from user + */ +void rspamd_composites_process_task(struct rspamd_task *task); + +/** + * Creates a composites manager + * @param cfg + * @return + */ +void *rspamd_composites_manager_create(struct rspamd_config *cfg); +/** + * Returns number of elements in a composite manager + * @return + */ +gsize rspamd_composites_manager_nelts(void *); +/** + * Adds a composite from config + * @return + */ +void *rspamd_composites_manager_add_from_ucl(void *, const char *, const ucl_object_t *); +void *rspamd_composites_manager_add_from_ucl_silent(void *, const char *, const ucl_object_t *); + +/** + * Adds a composite from config + * @return + */ +void *rspamd_composites_manager_add_from_string(void *, const char *, const char *); +void *rspamd_composites_manager_add_from_string_silent(void *, const char *, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_COMPOSITES_H_ */ diff --git a/src/libserver/composites/composites_internal.hxx b/src/libserver/composites/composites_internal.hxx new file mode 100644 index 0000000..038e217 --- /dev/null +++ b/src/libserver/composites/composites_internal.hxx @@ -0,0 +1,112 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + 
* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_COMPOSITES_INTERNAL_HXX +#define RSPAMD_COMPOSITES_INTERNAL_HXX +#pragma once + +#include <string> +#include "libutil/expression.h" +#include "libutil/cxx/hash_util.hxx" +#include "libserver/cfg_file.h" + +namespace rspamd::composites { + +/** + * Subr for composite expressions + */ +extern const struct rspamd_atom_subr composite_expr_subr; + +enum class rspamd_composite_policy { + RSPAMD_COMPOSITE_POLICY_REMOVE_ALL = 0, + RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL, + RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT, + RSPAMD_COMPOSITE_POLICY_LEAVE, + RSPAMD_COMPOSITE_POLICY_UNKNOWN +}; + +/** + * Static composites structure + */ +struct rspamd_composite { + std::string str_expr; + std::string sym; + struct rspamd_expression *expr; + gint id; + rspamd_composite_policy policy; +}; + +#define COMPOSITE_MANAGER_FROM_PTR(ptr) (reinterpret_cast<rspamd::composites::composites_manager *>(ptr)) + +class composites_manager { +public: + composites_manager(struct rspamd_config *_cfg) + : cfg(_cfg) + { + rspamd_mempool_add_destructor(_cfg->cfg_pool, composites_manager_dtor, this); + } + + auto size(void) const -> std::size_t + { + return all_composites.size(); + } + + auto find(std::string_view name) const -> const rspamd_composite * + { + auto found = composites.find(std::string(name)); + + if (found != composites.end()) { + return found->second.get(); + } + + return nullptr; + } + + auto add_composite(std::string_view, 
const ucl_object_t *, bool silent_duplicate) -> rspamd_composite *; + auto add_composite(std::string_view name, std::string_view expression, bool silent_duplicate, double score = NAN) -> rspamd_composite *; + +private: + ~composites_manager() = default; + static void composites_manager_dtor(void *ptr) + { + delete COMPOSITE_MANAGER_FROM_PTR(ptr); + } + + auto new_composite(std::string_view composite_name, rspamd_expression *expr, + std::string_view composite_expression) -> auto + { + auto &composite = all_composites.emplace_back(std::make_shared<rspamd_composite>()); + composite->expr = expr; + composite->id = all_composites.size() - 1; + composite->str_expr = composite_expression; + composite->sym = composite_name; + + composites[composite->sym] = composite; + + return composite; + } + + ankerl::unordered_dense::map<std::string, + std::shared_ptr<rspamd_composite>, rspamd::smart_str_hash, rspamd::smart_str_equal> + composites; + /* Store all composites here, even if we have duplicates */ + std::vector<std::shared_ptr<rspamd_composite>> all_composites; + struct rspamd_config *cfg; +}; + +}// namespace rspamd::composites + +#endif//RSPAMD_COMPOSITES_INTERNAL_HXX diff --git a/src/libserver/composites/composites_manager.cxx b/src/libserver/composites/composites_manager.cxx new file mode 100644 index 0000000..1ee5c40 --- /dev/null +++ b/src/libserver/composites/composites_manager.cxx @@ -0,0 +1,330 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <memory> +#include <vector> +#include <cmath> +#include "contrib/ankerl/unordered_dense.h" + +#include "composites.h" +#include "composites_internal.hxx" +#include "libserver/cfg_file.h" +#include "libserver/logger.h" +#include "libserver/maps/map.h" +#include "libutil/cxx/util.hxx" + +namespace rspamd::composites { + +static auto +composite_policy_from_str(const std::string_view &inp) -> enum rspamd_composite_policy { + const static ankerl::unordered_dense::map<std::string_view, + enum rspamd_composite_policy> + names{ + {"remove", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL}, + {"remove_all", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL}, + {"default", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_ALL}, + {"remove_symbol", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL}, + {"remove_weight", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT}, + {"leave", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_LEAVE}, + {"remove_none", rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_LEAVE}, + }; + + auto found = names.find(inp); + if (found != names.end()){ + return found->second;} + +return rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_UNKNOWN; +}// namespace rspamd::composites + +auto composites_manager::add_composite(std::string_view composite_name, const ucl_object_t *obj, bool silent_duplicate) -> rspamd_composite * +{ + + const auto *val = ucl_object_lookup(obj, "enabled"); + if (val != nullptr && !ucl_object_toboolean(val)) { + msg_info_config("composite %s is disabled", composite_name.data()); + return nullptr; + } + + if (composites.contains(composite_name)) { + if (silent_duplicate) { + msg_debug_config("composite %s is redefined", composite_name.data()); + return nullptr; + } + else { + msg_warn_config("composite %s is redefined", composite_name.data()); + } + } + + 
const char *composite_expression = nullptr; + val = ucl_object_lookup(obj, "expression"); + + if (val == NULL || !ucl_object_tostring_safe(val, &composite_expression)) { + msg_err_config("composite must have an expression defined in %s", + composite_name.data()); + return nullptr; + } + + GError *err = nullptr; + rspamd_expression *expr = nullptr; + + if (!rspamd_parse_expression(composite_expression, 0, &composite_expr_subr, + NULL, cfg->cfg_pool, &err, &expr)) { + msg_err_config("cannot parse composite expression for %s: %e", + composite_name.data(), err); + + if (err) { + g_error_free(err); + } + + return nullptr; + } + + const auto &composite = new_composite(composite_name, expr, composite_expression); + + auto score = std::isnan(cfg->unknown_weight) ? 0.0 : cfg->unknown_weight; + val = ucl_object_lookup(obj, "score"); + + if (val != nullptr) { + ucl_object_todouble_safe(val, &score); + } + + /* Also set score in the metric */ + const auto *group = "composite"; + val = ucl_object_lookup(obj, "group"); + if (val != nullptr) { + group = ucl_object_tostring(val); + } + + const auto *description = composite_expression; + val = ucl_object_lookup(obj, "description"); + if (val != nullptr) { + description = ucl_object_tostring(val); + } + + rspamd_config_add_symbol(cfg, composite_name.data(), score, + description, group, + 0, + ucl_object_get_priority(obj), /* No +1 as it is default... 
*/ + 1); + + const auto *elt = ucl_object_lookup(obj, "groups"); + if (elt && ucl_object_type(elt) == UCL_ARRAY) { + const ucl_object_t *cur_gr; + auto *gr_it = ucl_object_iterate_new(elt); + + while ((cur_gr = ucl_object_iterate_safe(gr_it, true)) != nullptr) { + rspamd_config_add_symbol_group(cfg, composite_name.data(), + ucl_object_tostring(cur_gr)); + } + + ucl_object_iterate_free(gr_it); + } + + val = ucl_object_lookup(obj, "policy"); + if (val) { + composite->policy = composite_policy_from_str(ucl_object_tostring(val)); + + if (composite->policy == rspamd_composite_policy::RSPAMD_COMPOSITE_POLICY_UNKNOWN) { + msg_err_config("composite %s has incorrect policy", composite_name.data()); + return nullptr; + } + } + + return composite.get(); +} + +auto composites_manager::add_composite(std::string_view composite_name, + std::string_view composite_expression, + bool silent_duplicate, double score) -> rspamd_composite * +{ + GError *err = nullptr; + rspamd_expression *expr = nullptr; + + if (composites.contains(composite_name)) { + /* Duplicate composite - refuse to add */ + if (silent_duplicate) { + msg_debug_config("composite %s is redefined", composite_name.data()); + return nullptr; + } + else { + msg_warn_config("composite %s is redefined", composite_name.data()); + } + } + + if (!rspamd_parse_expression(composite_expression.data(), + composite_expression.size(), &composite_expr_subr, + nullptr, cfg->cfg_pool, &err, &expr)) { + msg_err_config("cannot parse composite expression for %s: %e", + composite_name.data(), err); + + if (err) { + g_error_free(err); + } + + return nullptr; + } + + auto final_score = std::isnan(score) ? (std::isnan(cfg->unknown_weight) ? 
0.0 : cfg->unknown_weight) : score; + rspamd_config_add_symbol(cfg, composite_name.data(), final_score, + composite_name.data(), "composite", + 0, + 0, + 1); + + return new_composite(composite_name, expr, composite_expression).get(); +} + +struct map_cbdata { + composites_manager *cm; + struct rspamd_config *cfg; + std::string buf; + + explicit map_cbdata(struct rspamd_config *cfg) + : cfg(cfg) + { + cm = COMPOSITE_MANAGER_FROM_PTR(cfg->composites_manager); + } + + static char *map_read(char *chunk, int len, + struct map_cb_data *data, + gboolean _final) + { + + if (data->cur_data == nullptr) { + data->cur_data = data->prev_data; + reinterpret_cast<map_cbdata *>(data->cur_data)->buf.clear(); + } + + auto *cbd = reinterpret_cast<map_cbdata *>(data->cur_data); + + cbd->buf.append(chunk, len); + return nullptr; + } + + static void + map_fin(struct map_cb_data *data, void **target) + { + auto *cbd = reinterpret_cast<map_cbdata *>(data->cur_data); + + if (data->errored) { + if (cbd) { + cbd->buf.clear(); + } + } + else if (cbd != nullptr) { + if (target) { + *target = data->cur_data; + } + + rspamd::string_foreach_line(cbd->buf, [&](std::string_view line) { + auto [name_and_score, expr] = rspamd::string_split_on(line, ' '); + auto [name, score] = rspamd::string_split_on(name_and_score, ':'); + + if (!score.empty()) { + /* I wish it was supported properly */ + //auto conv_res = std::from_chars(value->data(), value->size(), num); + char numbuf[128], *endptr = nullptr; + rspamd_strlcpy(numbuf, score.data(), MIN(score.size() + 1, sizeof(numbuf))); /* size argument counts the terminating NUL: rspamd_strlcpy copies at most siz - 1 bytes, so passing the bare length would drop the last digit of the score */ + auto num = g_ascii_strtod(numbuf, &endptr); + + if (fabs(num) >= G_MAXFLOAT || std::isnan(num)) { + msg_err("invalid score for %*s", (int) name_and_score.size(), name_and_score.data()); + return; + } + + auto ret = cbd->cm->add_composite(name, expr, true, num); + + if (ret == nullptr) { + msg_err("cannot add composite %*s", (int) name_and_score.size(), name_and_score.data()); + return; + } + } + else { + msg_err("missing score for 
%*s", (int) name_and_score.size(), name_and_score.data()); + return; + } + }); + } + else { + msg_err("no data read for composites map"); + } + } + + static void + map_dtor(struct map_cb_data *data) + { + auto *cbd = reinterpret_cast<map_cbdata *>(data->cur_data); + delete cbd; + } +}; +} + + +void * +rspamd_composites_manager_create(struct rspamd_config *cfg) +{ + auto *cm = new rspamd::composites::composites_manager(cfg); + + return reinterpret_cast<void *>(cm); +} + + +gsize rspamd_composites_manager_nelts(void *ptr) +{ + return COMPOSITE_MANAGER_FROM_PTR(ptr)->size(); +} + +void * +rspamd_composites_manager_add_from_ucl(void *cm, const char *sym, const ucl_object_t *obj) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, obj, false)); +} + +void * +rspamd_composites_manager_add_from_string(void *cm, const char *sym, const char *expr) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, expr, false)); +} + +void * +rspamd_composites_manager_add_from_ucl_silent(void *cm, const char *sym, const ucl_object_t *obj) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, obj, true)); +} + +void * +rspamd_composites_manager_add_from_string_silent(void *cm, const char *sym, const char *expr) +{ + return reinterpret_cast<void *>(COMPOSITE_MANAGER_FROM_PTR(cm)->add_composite(sym, expr, true)); +} + + +bool rspamd_composites_add_map_handlers(const ucl_object_t *obj, struct rspamd_config *cfg) +{ + auto **pcbdata = rspamd_mempool_alloc_type(cfg->cfg_pool, rspamd::composites::map_cbdata *); + auto *cbdata = new rspamd::composites::map_cbdata{cfg}; + *pcbdata = cbdata; + + if (struct rspamd_map * m; (m = rspamd_map_add_from_ucl(cfg, obj, "composites map", + rspamd::composites::map_cbdata::map_read, rspamd::composites::map_cbdata::map_fin, + rspamd::composites::map_cbdata::map_dtor, (void **) pcbdata, + nullptr, RSPAMD_MAP_DEFAULT)) == nullptr) { + msg_err_config("cannot 
load composites map from %s", ucl_object_key(obj)); + return false; + } + + return true; +}
\ No newline at end of file diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt new file mode 100644 index 0000000..c0c9d51 --- /dev/null +++ b/src/libserver/css/CMakeLists.txt @@ -0,0 +1,9 @@ +SET(LIBCSSSRC "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_tokeniser.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_util.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_rule.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx" + PARENT_SCOPE)
\ No newline at end of file diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx new file mode 100644 index 0000000..1b369ed --- /dev/null +++ b/src/libserver/css/css.cxx @@ -0,0 +1,227 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css.hxx" +#include "contrib/ankerl/unordered_dense.h" +#include "css_parser.hxx" +#include "libserver/html/html_tag.hxx" +#include "libserver/html/html_block.hxx" + +/* Keep unit tests implementation here (it'll possibly be moved outside one day) */ +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#define DOCTEST_CONFIG_IMPLEMENT +#include "doctest/doctest.h" + +namespace rspamd::css { + +INIT_LOG_MODULE_PUBLIC(css); + +class css_style_sheet::impl { +public: + using sel_shared_hash = smart_ptr_hash<css_selector>; + using sel_shared_eq = smart_ptr_equal<css_selector>; + using selector_ptr = std::unique_ptr<css_selector>; + using selectors_hash = ankerl::unordered_dense::map<selector_ptr, css_declarations_block_ptr, + sel_shared_hash, sel_shared_eq>; + using universal_selector_t = std::pair<selector_ptr, css_declarations_block_ptr>; + selectors_hash tags_selector; + selectors_hash class_selectors; + selectors_hash id_selectors; + std::optional<universal_selector_t> universal_selector; +}; + +css_style_sheet::css_style_sheet(rspamd_mempool_t *pool) + : pool(pool), pimpl(new impl) +{ +} +css_style_sheet::~css_style_sheet() +{ +} + +auto 
css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector, + css_declarations_block_ptr decls) -> void +{ + impl::selectors_hash *target_hash = nullptr; + + switch (selector->type) { + case css_selector::selector_type::SELECTOR_ALL: + if (pimpl->universal_selector) { + /* Another universal selector */ + msg_debug_css("redefined universal selector, merging rules"); + pimpl->universal_selector->second->merge_block(*decls); + } + else { + msg_debug_css("added universal selector"); + pimpl->universal_selector = std::make_pair(std::move(selector), + decls); + } + break; + case css_selector::selector_type::SELECTOR_CLASS: + target_hash = &pimpl->class_selectors; + break; + case css_selector::selector_type::SELECTOR_ID: + target_hash = &pimpl->id_selectors; + break; + case css_selector::selector_type::SELECTOR_TAG: + target_hash = &pimpl->tags_selector; + break; + } + + if (target_hash) { + auto found_it = target_hash->find(selector); + + if (found_it == target_hash->end()) { + /* Easy case, new element */ + target_hash->insert({std::move(selector), decls}); + } + else { + /* The problem with merging is actually in how to handle selectors chains + * For example, we have 2 selectors: + * 1. class id tag -> meaning that we first match class, then we ensure that + * id is also the same and finally we check the tag + * 2. tag class id -> it means that we check first tag, then class and then id + * So we have somehow equal path in the xpath terms. + * I suppose now, that we merely check parent stuff and handle duplicates + * merging when finally resolving paths. 
+ */ + auto sel_str = selector->to_string().value_or("unknown"); + msg_debug_css("found duplicate selector: %*s", (int) sel_str.size(), + sel_str.data()); + found_it->second->merge_block(*decls); + } + } +} + +auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block * +{ /* Looks up the tag's id, each of its classes, its tag id and the universal selector, in that order; the first match becomes the result and later matches are propagated into it. Returns nullptr for a null tag or when nothing matches. */ + std::optional<std::string_view> id_comp, class_comp; + rspamd::html::html_block *res = nullptr; + + if (!tag) { + return nullptr; + } + + /* First, find id in a tag and a class */ + for (const auto &param: tag->components) { + if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) { + id_comp = param.value; + } + else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { + class_comp = param.value; + } + } + + /* ID part */ + if (id_comp && !pimpl->id_selectors.empty()) { + auto found_id_sel = pimpl->id_selectors.find(css_selector{id_comp.value()}); + + if (found_id_sel != pimpl->id_selectors.end()) { + const auto &decl = *(found_id_sel->second); + res = decl.compile_to_block(pool); + } + } + + /* Class part */ + if (class_comp && !pimpl->class_selectors.empty()) { + auto sv_split = [](auto strv, std::string_view delims = " ") -> std::vector<std::string_view> { + std::vector<decltype(strv)> ret; + std::size_t start = 0; + + while (start < strv.size()) { + const auto last = strv.find_first_of(delims, start); + if (start != last) { + ret.emplace_back(strv.substr(start, last - start)); + } + + if (last == std::string_view::npos) { + break; + } + + start = last + 1; + } + + return ret; + }; + + auto elts = sv_split(class_comp.value()); + + for (const auto &e: elts) { + auto found_class_sel = pimpl->class_selectors.find( + css_selector{e, css_selector::selector_type::SELECTOR_CLASS}); + + if (found_class_sel != pimpl->class_selectors.end()) { + const auto &decl = *(found_class_sel->second); + auto *tmp = decl.compile_to_block(pool); + + if (res == nullptr) { + res = tmp; + } + else { + res->propagate_block(*tmp); + } + } + } 
+ } + + /* Tags part */ + if (!pimpl->tags_selector.empty()) { + auto found_tag_sel = pimpl->tags_selector.find( + css_selector{static_cast<tag_id_t>(tag->id)}); + + if (found_tag_sel != pimpl->tags_selector.end()) { + const auto &decl = *(found_tag_sel->second); + auto *tmp = decl.compile_to_block(pool); + + if (res == nullptr) { + res = tmp; + } + else { + res->propagate_block(*tmp); + } + } + } + + /* Finally, universal selector */ + if (pimpl->universal_selector) { + auto *tmp = pimpl->universal_selector->second->compile_to_block(pool); + + if (res == nullptr) { + res = tmp; + } + else { + res->propagate_block(*tmp); + } + } + + return res; +} + +auto css_parse_style(rspamd_mempool_t *pool, + std::string_view input, + std::shared_ptr<css_style_sheet> &&existing) + -> css_return_pair +{ + auto parse_res = rspamd::css::parse_css(pool, input, + std::forward<std::shared_ptr<css_style_sheet>>(existing)); + + if (parse_res.has_value()) { + return std::make_pair(parse_res.value(), css_parse_error()); + } + + return std::make_pair(nullptr, parse_res.error()); +} + +}// namespace rspamd::css
\ No newline at end of file diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx new file mode 100644 index 0000000..f0f8120 --- /dev/null +++ b/src/libserver/css/css.hxx @@ -0,0 +1,68 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef RSPAMD_CSS_HXX +#define RSPAMD_CSS_HXX + +#include <string> +#include <memory> +#include "logger.h" +#include "css_rule.hxx" +#include "css_selector.hxx" + +namespace rspamd::html { +/* Forward declaration */ +struct html_tag; +struct html_block; +}// namespace rspamd::html + +namespace rspamd::css { + +extern int rspamd_css_log_id; + +#define msg_debug_css(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_css_log_id, "css", pool->tag.uid, \ + __FUNCTION__, \ + __VA_ARGS__) +#define msg_err_css(...) 
rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "css", pool->tag.uid, \ + __FUNCTION__, \ + __VA_ARGS__) +/* + * A style sheet object: rules are added with add_selector_rule() and are + * resolved for a particular html tag with check_tag_block(). + * The storage itself is hidden behind the pimpl pointer. + * Both logging macros above expand to code that reads pool->tag.uid, so a + * variable or member called pool must be visible wherever they are used. + */ +class css_style_sheet { +public: + css_style_sheet(rspamd_mempool_t *pool); + ~css_style_sheet(); /* must be declared separately due to pimpl */ + auto add_selector_rule(std::unique_ptr<css_selector> &&selector, + css_declarations_block_ptr decls) -> void; + + /* Returns nullptr if tag is null or if no stored selector matches it */ + auto check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block *; + +private: + class impl; + rspamd_mempool_t *pool; /* passed to compile_to_block() and read by the logging macros */ + std::unique_ptr<impl> pimpl; +}; + +/* Result of css_parse_style(): the style sheet (nullptr on failure) and the parse error */ +using css_return_pair = std::pair<std::shared_ptr<css_style_sheet>, css_parse_error>; +auto css_parse_style(rspamd_mempool_t *pool, + std::string_view input, + std::shared_ptr<css_style_sheet> &&existing) -> css_return_pair; + +}// namespace rspamd::css + +#endif//RSPAMD_CSS_HXX
\ No newline at end of file diff --git a/src/libserver/css/css_colors_list.hxx b/src/libserver/css/css_colors_list.hxx new file mode 100644 index 0000000..6dfe54f --- /dev/null +++ b/src/libserver/css/css_colors_list.hxx @@ -0,0 +1,738 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_CSS_COLORS_LIST_HXX +#define RSPAMD_CSS_COLORS_LIST_HXX + +#pragma once + +#include <string_view> +#include "contrib/ankerl/unordered_dense.h" +#include "css_value.hxx" + +namespace rspamd::css { + +/* + * List of all colors, intended to use with hashes/sets + * TODO: think about frozen structs when we can deal with 700 values without + * compiler limits... 
+ */ +static const ankerl::unordered_dense::map<std::string_view, css_color> css_colors_map{ + {"aliceblue", {240, 248, 255}}, + {"antiquewhite", {250, 235, 215}}, + {"antiquewhite1", {255, 239, 219}}, + {"antiquewhite2", {238, 223, 204}}, + {"antiquewhite3", {205, 192, 176}}, + {"antiquewhite4", {139, 131, 120}}, + {"aqua", {0, 255, 255}}, + {"aquamarine", {127, 255, 212}}, + {"aquamarine1", {127, 255, 212}}, + {"aquamarine2", {118, 238, 198}}, + {"aquamarine3", {102, 205, 170}}, + {"aquamarine4", {69, 139, 116}}, + {"azure", {240, 255, 255}}, + {"azure1", {240, 255, 255}}, + {"azure2", {224, 238, 238}}, + {"azure3", {193, 205, 205}}, + {"azure4", {131, 139, 139}}, + {"beige", {245, 245, 220}}, + {"bisque", {255, 228, 196}}, + {"bisque1", {255, 228, 196}}, + {"bisque2", {238, 213, 183}}, + {"bisque3", {205, 183, 158}}, + {"bisque4", {139, 125, 107}}, + {"black", {0, 0, 0}}, + {"blanchedalmond", {255, 235, 205}}, + {"blue", {0, 0, 255}}, + {"blue1", {0, 0, 255}}, + {"blue2", {0, 0, 238}}, + {"blue3", {0, 0, 205}}, + {"blue4", {0, 0, 139}}, + {"blueviolet", {138, 43, 226}}, + {"brown", {165, 42, 42}}, + {"brown1", {255, 64, 64}}, + {"brown2", {238, 59, 59}}, + {"brown3", {205, 51, 51}}, + {"brown4", {139, 35, 35}}, + {"burlywood", {222, 184, 135}}, + {"burlywood1", {255, 211, 155}}, + {"burlywood2", {238, 197, 145}}, + {"burlywood3", {205, 170, 125}}, + {"burlywood4", {139, 115, 85}}, + {"cadetblue", {95, 158, 160}}, + {"cadetblue1", {152, 245, 255}}, + {"cadetblue2", {142, 229, 238}}, + {"cadetblue3", {122, 197, 205}}, + {"cadetblue4", {83, 134, 139}}, + {"chartreuse", {127, 255, 0}}, + {"chartreuse1", {127, 255, 0}}, + {"chartreuse2", {118, 238, 0}}, + {"chartreuse3", {102, 205, 0}}, + {"chartreuse4", {69, 139, 0}}, + {"chocolate", {210, 105, 30}}, + {"chocolate1", {255, 127, 36}}, + {"chocolate2", {238, 118, 33}}, + {"chocolate3", {205, 102, 29}}, + {"chocolate4", {139, 69, 19}}, + {"coral", {255, 127, 80}}, + {"coral1", {255, 114, 86}}, + {"coral2", {238, 106, 
80}}, + {"coral3", {205, 91, 69}}, + {"coral4", {139, 62, 47}}, + {"cornflowerblue", {100, 149, 237}}, + {"cornsilk", {255, 248, 220}}, + {"cornsilk1", {255, 248, 220}}, + {"cornsilk2", {238, 232, 205}}, + {"cornsilk3", {205, 200, 177}}, + {"cornsilk4", {139, 136, 120}}, + {"crimson", {220, 20, 60}}, + {"cyan", {0, 255, 255}}, + {"cyan1", {0, 255, 255}}, + {"cyan2", {0, 238, 238}}, + {"cyan3", {0, 205, 205}}, + {"cyan4", {0, 139, 139}}, + {"darkblue", {0, 0, 139}}, + {"darkcyan", {0, 139, 139}}, + {"darkgoldenrod", {184, 134, 11}}, + {"darkgoldenrod1", {255, 185, 15}}, + {"darkgoldenrod2", {238, 173, 14}}, + {"darkgoldenrod3", {205, 149, 12}}, + {"darkgoldenrod4", {139, 101, 8}}, + {"darkgray", {169, 169, 169}}, + {"darkgreen", {0, 100, 0}}, + {"darkgrey", {169, 169, 169}}, + {"darkkhaki", {189, 183, 107}}, + {"darkmagenta", {139, 0, 139}}, + {"darkolivegreen", {85, 107, 47}}, + {"darkolivegreen1", {202, 255, 112}}, + {"darkolivegreen2", {188, 238, 104}}, + {"darkolivegreen3", {162, 205, 90}}, + {"darkolivegreen4", {110, 139, 61}}, + {"darkorange", {255, 140, 0}}, + {"darkorange1", {255, 127, 0}}, + {"darkorange2", {238, 118, 0}}, + {"darkorange3", {205, 102, 0}}, + {"darkorange4", {139, 69, 0}}, + {"darkorchid", {153, 50, 204}}, + {"darkorchid1", {191, 62, 255}}, + {"darkorchid2", {178, 58, 238}}, + {"darkorchid3", {154, 50, 205}}, + {"darkorchid4", {104, 34, 139}}, + {"darkred", {139, 0, 0}}, + {"darksalmon", {233, 150, 122}}, + {"darkseagreen", {143, 188, 143}}, + {"darkseagreen1", {193, 255, 193}}, + {"darkseagreen2", {180, 238, 180}}, + {"darkseagreen3", {155, 205, 155}}, + {"darkseagreen4", {105, 139, 105}}, + {"darkslateblue", {72, 61, 139}}, + {"darkslategray", {47, 79, 79}}, + {"darkslategray1", {151, 255, 255}}, + {"darkslategray2", {141, 238, 238}}, + {"darkslategray3", {121, 205, 205}}, + {"darkslategray4", {82, 139, 139}}, + {"darkslategrey", {47, 79, 79}}, + {"darkturquoise", {0, 206, 209}}, + {"darkviolet", {148, 0, 211}}, + {"deeppink", {255, 20, 
147}}, + {"deeppink1", {255, 20, 147}}, + {"deeppink2", {238, 18, 137}}, + {"deeppink3", {205, 16, 118}}, + {"deeppink4", {139, 10, 80}}, + {"deepskyblue", {0, 191, 255}}, + {"deepskyblue1", {0, 191, 255}}, + {"deepskyblue2", {0, 178, 238}}, + {"deepskyblue3", {0, 154, 205}}, + {"deepskyblue4", {0, 104, 139}}, + {"dimgray", {105, 105, 105}}, + {"dimgrey", {105, 105, 105}}, + {"dodgerblue", {30, 144, 255}}, + {"dodgerblue1", {30, 144, 255}}, + {"dodgerblue2", {28, 134, 238}}, + {"dodgerblue3", {24, 116, 205}}, + {"dodgerblue4", {16, 78, 139}}, + {"firebrick", {178, 34, 34}}, + {"firebrick1", {255, 48, 48}}, + {"firebrick2", {238, 44, 44}}, + {"firebrick3", {205, 38, 38}}, + {"firebrick4", {139, 26, 26}}, + {"floralwhite", {255, 250, 240}}, + {"forestgreen", {34, 139, 34}}, + {"fuchsia", {255, 0, 255}}, + {"gainsboro", {220, 220, 220}}, + {"ghostwhite", {248, 248, 255}}, + {"gold", {255, 215, 0}}, + {"gold1", {255, 215, 0}}, + {"gold2", {238, 201, 0}}, + {"gold3", {205, 173, 0}}, + {"gold4", {139, 117, 0}}, + {"goldenrod", {218, 165, 32}}, + {"goldenrod1", {255, 193, 37}}, + {"goldenrod2", {238, 180, 34}}, + {"goldenrod3", {205, 155, 29}}, + {"goldenrod4", {139, 105, 20}}, + {"gray", {190, 190, 190}}, + {"gray0", {0, 0, 0}}, + {"gray1", {3, 3, 3}}, + {"gray10", {26, 26, 26}}, + {"gray100", {255, 255, 255}}, + {"gray11", {28, 28, 28}}, + {"gray12", {31, 31, 31}}, + {"gray13", {33, 33, 33}}, + {"gray14", {36, 36, 36}}, + {"gray15", {38, 38, 38}}, + {"gray16", {41, 41, 41}}, + {"gray17", {43, 43, 43}}, + {"gray18", {46, 46, 46}}, + {"gray19", {48, 48, 48}}, + {"gray2", {5, 5, 5}}, + {"gray20", {51, 51, 51}}, + {"gray21", {54, 54, 54}}, + {"gray22", {56, 56, 56}}, + {"gray23", {59, 59, 59}}, + {"gray24", {61, 61, 61}}, + {"gray25", {64, 64, 64}}, + {"gray26", {66, 66, 66}}, + {"gray27", {69, 69, 69}}, + {"gray28", {71, 71, 71}}, + {"gray29", {74, 74, 74}}, + {"gray3", {8, 8, 8}}, + {"gray30", {77, 77, 77}}, + {"gray31", {79, 79, 79}}, + {"gray32", {82, 82, 82}}, + 
{"gray33", {84, 84, 84}}, + {"gray34", {87, 87, 87}}, + {"gray35", {89, 89, 89}}, + {"gray36", {92, 92, 92}}, + {"gray37", {94, 94, 94}}, + {"gray38", {97, 97, 97}}, + {"gray39", {99, 99, 99}}, + {"gray4", {10, 10, 10}}, + {"gray40", {102, 102, 102}}, + {"gray41", {105, 105, 105}}, + {"gray42", {107, 107, 107}}, + {"gray43", {110, 110, 110}}, + {"gray44", {112, 112, 112}}, + {"gray45", {115, 115, 115}}, + {"gray46", {117, 117, 117}}, + {"gray47", {120, 120, 120}}, + {"gray48", {122, 122, 122}}, + {"gray49", {125, 125, 125}}, + {"gray5", {13, 13, 13}}, + {"gray50", {127, 127, 127}}, + {"gray51", {130, 130, 130}}, + {"gray52", {133, 133, 133}}, + {"gray53", {135, 135, 135}}, + {"gray54", {138, 138, 138}}, + {"gray55", {140, 140, 140}}, + {"gray56", {143, 143, 143}}, + {"gray57", {145, 145, 145}}, + {"gray58", {148, 148, 148}}, + {"gray59", {150, 150, 150}}, + {"gray6", {15, 15, 15}}, + {"gray60", {153, 153, 153}}, + {"gray61", {156, 156, 156}}, + {"gray62", {158, 158, 158}}, + {"gray63", {161, 161, 161}}, + {"gray64", {163, 163, 163}}, + {"gray65", {166, 166, 166}}, + {"gray66", {168, 168, 168}}, + {"gray67", {171, 171, 171}}, + {"gray68", {173, 173, 173}}, + {"gray69", {176, 176, 176}}, + {"gray7", {18, 18, 18}}, + {"gray70", {179, 179, 179}}, + {"gray71", {181, 181, 181}}, + {"gray72", {184, 184, 184}}, + {"gray73", {186, 186, 186}}, + {"gray74", {189, 189, 189}}, + {"gray75", {191, 191, 191}}, + {"gray76", {194, 194, 194}}, + {"gray77", {196, 196, 196}}, + {"gray78", {199, 199, 199}}, + {"gray79", {201, 201, 201}}, + {"gray8", {20, 20, 20}}, + {"gray80", {204, 204, 204}}, + {"gray81", {207, 207, 207}}, + {"gray82", {209, 209, 209}}, + {"gray83", {212, 212, 212}}, + {"gray84", {214, 214, 214}}, + {"gray85", {217, 217, 217}}, + {"gray86", {219, 219, 219}}, + {"gray87", {222, 222, 222}}, + {"gray88", {224, 224, 224}}, + {"gray89", {227, 227, 227}}, + {"gray9", {23, 23, 23}}, + {"gray90", {229, 229, 229}}, + {"gray91", {232, 232, 232}}, + {"gray92", {235, 235, 235}}, 
+ {"gray93", {237, 237, 237}}, + {"gray94", {240, 240, 240}}, + {"gray95", {242, 242, 242}}, + {"gray96", {245, 245, 245}}, + {"gray97", {247, 247, 247}}, + {"gray98", {250, 250, 250}}, + {"gray99", {252, 252, 252}}, + {"green", {0, 255, 0}}, + {"green1", {0, 255, 0}}, + {"green2", {0, 238, 0}}, + {"green3", {0, 205, 0}}, + {"green4", {0, 139, 0}}, + {"greenyellow", {173, 255, 47}}, + {"grey", {190, 190, 190}}, + {"grey0", {0, 0, 0}}, + {"grey1", {3, 3, 3}}, + {"grey10", {26, 26, 26}}, + {"grey100", {255, 255, 255}}, + {"grey11", {28, 28, 28}}, + {"grey12", {31, 31, 31}}, + {"grey13", {33, 33, 33}}, + {"grey14", {36, 36, 36}}, + {"grey15", {38, 38, 38}}, + {"grey16", {41, 41, 41}}, + {"grey17", {43, 43, 43}}, + {"grey18", {46, 46, 46}}, + {"grey19", {48, 48, 48}}, + {"grey2", {5, 5, 5}}, + {"grey20", {51, 51, 51}}, + {"grey21", {54, 54, 54}}, + {"grey22", {56, 56, 56}}, + {"grey23", {59, 59, 59}}, + {"grey24", {61, 61, 61}}, + {"grey25", {64, 64, 64}}, + {"grey26", {66, 66, 66}}, + {"grey27", {69, 69, 69}}, + {"grey28", {71, 71, 71}}, + {"grey29", {74, 74, 74}}, + {"grey3", {8, 8, 8}}, + {"grey30", {77, 77, 77}}, + {"grey31", {79, 79, 79}}, + {"grey32", {82, 82, 82}}, + {"grey33", {84, 84, 84}}, + {"grey34", {87, 87, 87}}, + {"grey35", {89, 89, 89}}, + {"grey36", {92, 92, 92}}, + {"grey37", {94, 94, 94}}, + {"grey38", {97, 97, 97}}, + {"grey39", {99, 99, 99}}, + {"grey4", {10, 10, 10}}, + {"grey40", {102, 102, 102}}, + {"grey41", {105, 105, 105}}, + {"grey42", {107, 107, 107}}, + {"grey43", {110, 110, 110}}, + {"grey44", {112, 112, 112}}, + {"grey45", {115, 115, 115}}, + {"grey46", {117, 117, 117}}, + {"grey47", {120, 120, 120}}, + {"grey48", {122, 122, 122}}, + {"grey49", {125, 125, 125}}, + {"grey5", {13, 13, 13}}, + {"grey50", {127, 127, 127}}, + {"grey51", {130, 130, 130}}, + {"grey52", {133, 133, 133}}, + {"grey53", {135, 135, 135}}, + {"grey54", {138, 138, 138}}, + {"grey55", {140, 140, 140}}, + {"grey56", {143, 143, 143}}, + {"grey57", {145, 145, 145}}, + 
{"grey58", {148, 148, 148}}, + {"grey59", {150, 150, 150}}, + {"grey6", {15, 15, 15}}, + {"grey60", {153, 153, 153}}, + {"grey61", {156, 156, 156}}, + {"grey62", {158, 158, 158}}, + {"grey63", {161, 161, 161}}, + {"grey64", {163, 163, 163}}, + {"grey65", {166, 166, 166}}, + {"grey66", {168, 168, 168}}, + {"grey67", {171, 171, 171}}, + {"grey68", {173, 173, 173}}, + {"grey69", {176, 176, 176}}, + {"grey7", {18, 18, 18}}, + {"grey70", {179, 179, 179}}, + {"grey71", {181, 181, 181}}, + {"grey72", {184, 184, 184}}, + {"grey73", {186, 186, 186}}, + {"grey74", {189, 189, 189}}, + {"grey75", {191, 191, 191}}, + {"grey76", {194, 194, 194}}, + {"grey77", {196, 196, 196}}, + {"grey78", {199, 199, 199}}, + {"grey79", {201, 201, 201}}, + {"grey8", {20, 20, 20}}, + {"grey80", {204, 204, 204}}, + {"grey81", {207, 207, 207}}, + {"grey82", {209, 209, 209}}, + {"grey83", {212, 212, 212}}, + {"grey84", {214, 214, 214}}, + {"grey85", {217, 217, 217}}, + {"grey86", {219, 219, 219}}, + {"grey87", {222, 222, 222}}, + {"grey88", {224, 224, 224}}, + {"grey89", {227, 227, 227}}, + {"grey9", {23, 23, 23}}, + {"grey90", {229, 229, 229}}, + {"grey91", {232, 232, 232}}, + {"grey92", {235, 235, 235}}, + {"grey93", {237, 237, 237}}, + {"grey94", {240, 240, 240}}, + {"grey95", {242, 242, 242}}, + {"grey96", {245, 245, 245}}, + {"grey97", {247, 247, 247}}, + {"grey98", {250, 250, 250}}, + {"grey99", {252, 252, 252}}, + {"honeydew", {240, 255, 240}}, + {"honeydew1", {240, 255, 240}}, + {"honeydew2", {224, 238, 224}}, + {"honeydew3", {193, 205, 193}}, + {"honeydew4", {131, 139, 131}}, + {"hotpink", {255, 105, 180}}, + {"hotpink1", {255, 110, 180}}, + {"hotpink2", {238, 106, 167}}, + {"hotpink3", {205, 96, 144}}, + {"hotpink4", {139, 58, 98}}, + {"indianred", {205, 92, 92}}, + {"indianred1", {255, 106, 106}}, + {"indianred2", {238, 99, 99}}, + {"indianred3", {205, 85, 85}}, + {"indianred4", {139, 58, 58}}, + {"indigo", {75, 0, 130}}, + {"ivory", {255, 255, 240}}, + {"ivory1", {255, 255, 240}}, + 
{"ivory2", {238, 238, 224}}, + {"ivory3", {205, 205, 193}}, + {"ivory4", {139, 139, 131}}, + {"khaki", {240, 230, 140}}, + {"khaki1", {255, 246, 143}}, + {"khaki2", {238, 230, 133}}, + {"khaki3", {205, 198, 115}}, + {"khaki4", {139, 134, 78}}, + {"lavender", {230, 230, 250}}, + {"lavenderblush", {255, 240, 245}}, + {"lavenderblush1", {255, 240, 245}}, + {"lavenderblush2", {238, 224, 229}}, + {"lavenderblush3", {205, 193, 197}}, + {"lavenderblush4", {139, 131, 134}}, + {"lawngreen", {124, 252, 0}}, + {"lemonchiffon", {255, 250, 205}}, + {"lemonchiffon1", {255, 250, 205}}, + {"lemonchiffon2", {238, 233, 191}}, + {"lemonchiffon3", {205, 201, 165}}, + {"lemonchiffon4", {139, 137, 112}}, + {"lightblue", {173, 216, 230}}, + {"lightblue1", {191, 239, 255}}, + {"lightblue2", {178, 223, 238}}, + {"lightblue3", {154, 192, 205}}, + {"lightblue4", {104, 131, 139}}, + {"lightcoral", {240, 128, 128}}, + {"lightcyan", {224, 255, 255}}, + {"lightcyan1", {224, 255, 255}}, + {"lightcyan2", {209, 238, 238}}, + {"lightcyan3", {180, 205, 205}}, + {"lightcyan4", {122, 139, 139}}, + {"lightgoldenrod", {238, 221, 130}}, + {"lightgoldenrod1", {255, 236, 139}}, + {"lightgoldenrod2", {238, 220, 130}}, + {"lightgoldenrod3", {205, 190, 112}}, + {"lightgoldenrod4", {139, 129, 76}}, + {"lightgoldenrodyellow", {250, 250, 210}}, + {"lightgray", {211, 211, 211}}, + {"lightgreen", {144, 238, 144}}, + {"lightgrey", {211, 211, 211}}, + {"lightpink", {255, 182, 193}}, + {"lightpink1", {255, 174, 185}}, + {"lightpink2", {238, 162, 173}}, + {"lightpink3", {205, 140, 149}}, + {"lightpink4", {139, 95, 101}}, + {"lightsalmon", {255, 160, 122}}, + {"lightsalmon1", {255, 160, 122}}, + {"lightsalmon2", {238, 149, 114}}, + {"lightsalmon3", {205, 129, 98}}, + {"lightsalmon4", {139, 87, 66}}, + {"lightseagreen", {32, 178, 170}}, + {"lightskyblue", {135, 206, 250}}, + {"lightskyblue1", {176, 226, 255}}, + {"lightskyblue2", {164, 211, 238}}, + {"lightskyblue3", {141, 182, 205}}, + {"lightskyblue4", {96, 123, 139}}, 
+ {"lightslateblue", {132, 112, 255}}, + {"lightslategray", {119, 136, 153}}, + {"lightslategrey", {119, 136, 153}}, + {"lightsteelblue", {176, 196, 222}}, + {"lightsteelblue1", {202, 225, 255}}, + {"lightsteelblue2", {188, 210, 238}}, + {"lightsteelblue3", {162, 181, 205}}, + {"lightsteelblue4", {110, 123, 139}}, + {"lightyellow", {255, 255, 224}}, + {"lightyellow1", {255, 255, 224}}, + {"lightyellow2", {238, 238, 209}}, + {"lightyellow3", {205, 205, 180}}, + {"lightyellow4", {139, 139, 122}}, + {"lime", {0, 255, 0}}, + {"limegreen", {50, 205, 50}}, + {"linen", {250, 240, 230}}, + {"magenta", {255, 0, 255}}, + {"magenta1", {255, 0, 255}}, + {"magenta2", {238, 0, 238}}, + {"magenta3", {205, 0, 205}}, + {"magenta4", {139, 0, 139}}, + {"maroon", {176, 48, 96}}, + {"maroon1", {255, 52, 179}}, + {"maroon2", {238, 48, 167}}, + {"maroon3", {205, 41, 144}}, + {"maroon4", {139, 28, 98}}, + {"mediumaquamarine", {102, 205, 170}}, + {"mediumblue", {0, 0, 205}}, + {"mediumorchid", {186, 85, 211}}, + {"mediumorchid1", {224, 102, 255}}, + {"mediumorchid2", {209, 95, 238}}, + {"mediumorchid3", {180, 82, 205}}, + {"mediumorchid4", {122, 55, 139}}, + {"mediumpurple", {147, 112, 219}}, + {"mediumpurple1", {171, 130, 255}}, + {"mediumpurple2", {159, 121, 238}}, + {"mediumpurple3", {137, 104, 205}}, + {"mediumpurple4", {93, 71, 139}}, + {"mediumseagreen", {60, 179, 113}}, + {"mediumslateblue", {123, 104, 238}}, + {"mediumspringgreen", {0, 250, 154}}, + {"mediumturquoise", {72, 209, 204}}, + {"mediumvioletred", {199, 21, 133}}, + {"midnightblue", {25, 25, 112}}, + {"mintcream", {245, 255, 250}}, + {"mistyrose", {255, 228, 225}}, + {"mistyrose1", {255, 228, 225}}, + {"mistyrose2", {238, 213, 210}}, + {"mistyrose3", {205, 183, 181}}, + {"mistyrose4", {139, 125, 123}}, + {"moccasin", {255, 228, 181}}, + {"navajowhite", {255, 222, 173}}, + {"navajowhite1", {255, 222, 173}}, + {"navajowhite2", {238, 207, 161}}, + {"navajowhite3", {205, 179, 139}}, + {"navajowhite4", {139, 121, 94}}, + 
{"navy", {0, 0, 128}}, + {"navyblue", {0, 0, 128}}, + {"oldlace", {253, 245, 230}}, + {"olive", {128, 128, 0}}, + {"olivedrab", {107, 142, 35}}, + {"olivedrab1", {192, 255, 62}}, + {"olivedrab2", {179, 238, 58}}, + {"olivedrab3", {154, 205, 50}}, + {"olivedrab4", {105, 139, 34}}, + {"orange", {255, 165, 0}}, + {"orange1", {255, 165, 0}}, + {"orange2", {238, 154, 0}}, + {"orange3", {205, 133, 0}}, + {"orange4", {139, 90, 0}}, + {"orangered", {255, 69, 0}}, + {"orangered1", {255, 69, 0}}, + {"orangered2", {238, 64, 0}}, + {"orangered3", {205, 55, 0}}, + {"orangered4", {139, 37, 0}}, + {"orchid", {218, 112, 214}}, + {"orchid1", {255, 131, 250}}, + {"orchid2", {238, 122, 233}}, + {"orchid3", {205, 105, 201}}, + {"orchid4", {139, 71, 137}}, + {"palegoldenrod", {238, 232, 170}}, + {"palegreen", {152, 251, 152}}, + {"palegreen1", {154, 255, 154}}, + {"palegreen2", {144, 238, 144}}, + {"palegreen3", {124, 205, 124}}, + {"palegreen4", {84, 139, 84}}, + {"paleturquoise", {175, 238, 238}}, + {"paleturquoise1", {187, 255, 255}}, + {"paleturquoise2", {174, 238, 238}}, + {"paleturquoise3", {150, 205, 205}}, + {"paleturquoise4", {102, 139, 139}}, + {"palevioletred", {219, 112, 147}}, + {"palevioletred1", {255, 130, 171}}, + {"palevioletred2", {238, 121, 159}}, + {"palevioletred3", {205, 104, 137}}, + {"palevioletred4", {139, 71, 93}}, + {"papayawhip", {255, 239, 213}}, + {"peachpuff", {255, 218, 185}}, + {"peachpuff1", {255, 218, 185}}, + {"peachpuff2", {238, 203, 173}}, + {"peachpuff3", {205, 175, 149}}, + {"peachpuff4", {139, 119, 101}}, + {"peru", {205, 133, 63}}, + {"pink", {255, 192, 203}}, + {"pink1", {255, 181, 197}}, + {"pink2", {238, 169, 184}}, + {"pink3", {205, 145, 158}}, + {"pink4", {139, 99, 108}}, + {"plum", {221, 160, 221}}, + {"plum1", {255, 187, 255}}, + {"plum2", {238, 174, 238}}, + {"plum3", {205, 150, 205}}, + {"plum4", {139, 102, 139}}, + {"powderblue", {176, 224, 230}}, + {"purple", {160, 32, 240}}, + {"purple1", {155, 48, 255}}, + {"purple2", {145, 44, 
238}}, + {"purple3", {125, 38, 205}}, + {"purple4", {85, 26, 139}}, + {"rebeccapurple", {102, 51, 153}}, + {"red", {255, 0, 0}}, + {"red1", {255, 0, 0}}, + {"red2", {238, 0, 0}}, + {"red3", {205, 0, 0}}, + {"red4", {139, 0, 0}}, + {"rosybrown", {188, 143, 143}}, + {"rosybrown1", {255, 193, 193}}, + {"rosybrown2", {238, 180, 180}}, + {"rosybrown3", {205, 155, 155}}, + {"rosybrown4", {139, 105, 105}}, + {"royalblue", {65, 105, 225}}, + {"royalblue1", {72, 118, 255}}, + {"royalblue2", {67, 110, 238}}, + {"royalblue3", {58, 95, 205}}, + {"royalblue4", {39, 64, 139}}, + {"saddlebrown", {139, 69, 19}}, + {"salmon", {250, 128, 114}}, + {"salmon1", {255, 140, 105}}, + {"salmon2", {238, 130, 98}}, + {"salmon3", {205, 112, 84}}, + {"salmon4", {139, 76, 57}}, + {"sandybrown", {244, 164, 96}}, + {"seagreen", {46, 139, 87}}, + {"seagreen1", {84, 255, 159}}, + {"seagreen2", {78, 238, 148}}, + {"seagreen3", {67, 205, 128}}, + {"seagreen4", {46, 139, 87}}, + {"seashell", {255, 245, 238}}, + {"seashell1", {255, 245, 238}}, + {"seashell2", {238, 229, 222}}, + {"seashell3", {205, 197, 191}}, + {"seashell4", {139, 134, 130}}, + {"sienna", {160, 82, 45}}, + {"sienna1", {255, 130, 71}}, + {"sienna2", {238, 121, 66}}, + {"sienna3", {205, 104, 57}}, + {"sienna4", {139, 71, 38}}, + {"silver", {192, 192, 192}}, + {"skyblue", {135, 206, 235}}, + {"skyblue1", {135, 206, 255}}, + {"skyblue2", {126, 192, 238}}, + {"skyblue3", {108, 166, 205}}, + {"skyblue4", {74, 112, 139}}, + {"slateblue", {106, 90, 205}}, + {"slateblue1", {131, 111, 255}}, + {"slateblue2", {122, 103, 238}}, + {"slateblue3", {105, 89, 205}}, + {"slateblue4", {71, 60, 139}}, + {"slategray", {112, 128, 144}}, + {"slategray1", {198, 226, 255}}, + {"slategray2", {185, 211, 238}}, + {"slategray3", {159, 182, 205}}, + {"slategray4", {108, 123, 139}}, + {"slategrey", {112, 128, 144}}, + {"snow", {255, 250, 250}}, + {"snow1", {255, 250, 250}}, + {"snow2", {238, 233, 233}}, + {"snow3", {205, 201, 201}}, + {"snow4", {139, 137, 137}}, + 
{"springgreen", {0, 255, 127}}, + {"springgreen1", {0, 255, 127}}, + {"springgreen2", {0, 238, 118}}, + {"springgreen3", {0, 205, 102}}, + {"springgreen4", {0, 139, 69}}, + {"steelblue", {70, 130, 180}}, + {"steelblue1", {99, 184, 255}}, + {"steelblue2", {92, 172, 238}}, + {"steelblue3", {79, 148, 205}}, + {"steelblue4", {54, 100, 139}}, + {"tan", {210, 180, 140}}, + {"tan1", {255, 165, 79}}, + {"tan2", {238, 154, 73}}, + {"tan3", {205, 133, 63}}, + {"tan4", {139, 90, 43}}, + {"teal", {0, 128, 128}}, + {"thistle", {216, 191, 216}}, + {"thistle1", {255, 225, 255}}, + {"thistle2", {238, 210, 238}}, + {"thistle3", {205, 181, 205}}, + {"thistle4", {139, 123, 139}}, + {"tomato", {255, 99, 71}}, + {"tomato1", {255, 99, 71}}, + {"tomato2", {238, 92, 66}}, + {"tomato3", {205, 79, 57}}, + {"tomato4", {139, 54, 38}}, + {"turquoise", {64, 224, 208}}, + {"turquoise1", {0, 245, 255}}, + {"turquoise2", {0, 229, 238}}, + {"turquoise3", {0, 197, 205}}, + {"turquoise4", {0, 134, 139}}, + {"violet", {238, 130, 238}}, + {"violetred", {208, 32, 144}}, + {"violetred1", {255, 62, 150}}, + {"violetred2", {238, 58, 140}}, + {"violetred3", {205, 50, 120}}, + {"violetred4", {139, 34, 82}}, + {"webgray", {128, 128, 128}}, + {"webgreen", {0, 128, 0}}, + {"webgrey", {128, 128, 128}}, + {"webmaroon", {128, 0, 0}}, + {"webpurple", {128, 0, 128}}, + {"wheat", {245, 222, 179}}, + {"wheat1", {255, 231, 186}}, + {"wheat2", {238, 216, 174}}, + {"wheat3", {205, 186, 150}}, + {"wheat4", {139, 126, 102}}, + {"white", {255, 255, 255}}, + {"whitesmoke", {245, 245, 245}}, + {"x11gray", {190, 190, 190}}, + {"x11green", {0, 255, 0}}, + {"x11grey", {190, 190, 190}}, + {"x11maroon", {176, 48, 96}}, + {"x11purple", {160, 32, 240}}, + {"yellow", {255, 255, 0}}, + {"yellow1", {255, 255, 0}}, + {"yellow2", {238, 238, 0}}, + {"yellow3", {205, 205, 0}}, + {"yellow4", {139, 139, 0}}, + {"yellowgreen", {154, 205, 50}}, + {"activeborder", {180, 180, 180}}, + {"activecaption", {153, 180, 209}}, + {"appworkspace", {171, 
171, 171}}, + {"background", {0, 0, 0}}, + {"buttonhighlight", {255, 255, 255}}, + {"buttonshadow", {160, 160, 160}}, + {"captiontext", {0, 0, 0}}, + {"inactiveborder", {244, 247, 252}}, + {"inactivecaption", {191, 205, 219}}, + {"inactivecaptiontext", {0, 0, 0}}, + {"infobackground", {255, 255, 225}}, + {"infotext", {0, 0, 0}}, + {"menu", {240, 240, 240}}, + {"menutext", {0, 0, 0}}, + {"scrollbar", {200, 200, 200}}, + {"threeddarkshadow", {0, 0, 0}}, + {"threedface", {0, 0, 0}}, + {"threedhighlight", {0, 0, 0}}, + {"threedlightshadow", {0, 0, 0}}, + {"threedshadow", {0, 0, 0}}, + {"transparent", {0, 0, 0, 0}}, + {"window", {255, 255, 255}}, + {"windowframe", {100, 100, 100}}, + {"windowtext", {0, 0, 0}}, +}; + +}// namespace rspamd::css + +#endif//RSPAMD_CSS_COLORS_LIST_HXX diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx new file mode 100644 index 0000000..aed035a --- /dev/null +++ b/src/libserver/css/css_parser.cxx @@ -0,0 +1,892 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "css_parser.hxx" +#include "css_tokeniser.hxx" +#include "css_selector.hxx" +#include "css_rule.hxx" +#include "css_util.hxx" +#include "css.hxx" +#include "fmt/core.h" + +#include <vector> +#include <unicode/utf8.h> + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +namespace rspamd::css { + +const css_consumed_block css_parser_eof_block{}; +/* + * Appends a child block to this block. Empty (monostate) content is first + * turned into a vector of blocks. Returns false, leaving the content + * untouched, if the content holds anything other than a vector of blocks. + */ +auto css_consumed_block::attach_block(consumed_block_ptr &&block) -> bool +{ + if (std::holds_alternative<std::monostate>(content)) { + /* Switch from monostate */ + content = std::vector<consumed_block_ptr>(); + } + else if (!std::holds_alternative<std::vector<consumed_block_ptr>>(content)) { + /* A single component, cannot attach a block ! */ + return false; + } + + auto &value_vec = std::get<std::vector<consumed_block_ptr>>(content); + value_vec.push_back(std::move(block)); + + return true; +} +/* + * Appends an argument to a function block; returns false if the content is + * not a css_function_block. + */ +auto css_consumed_block::add_function_argument(consumed_block_ptr &&block) -> bool +{ + if (!std::holds_alternative<css_function_block>(content)) { + return false; + } + + auto &&func_bloc = std::get<css_function_block>(content); + func_bloc.args.push_back(std::move(block)); + + return true; +} +/* + * Returns a static human readable name for the tag of this block, or an + * empty string if the tag matches none of the cases below. + */ +auto css_consumed_block::token_type_str(void) const -> const char * +{ + const auto *ret = ""; + + switch (tag) { + case parser_tag_type::css_top_block: + ret = "top"; + break; + case parser_tag_type::css_qualified_rule: + ret = "qualified rule"; + break; + case parser_tag_type::css_at_rule: + ret = "at rule"; + break; + case parser_tag_type::css_simple_block: + ret = "simple block"; + break; + case parser_tag_type::css_function: + ret = "function"; + break; + case parser_tag_type::css_function_arg: + ret = "function arg"; + break; + case parser_tag_type::css_component: + ret = "component"; + break; + case parser_tag_type::css_eof_block: + ret = "eof"; + break; + } + + return ret; +} +/* + * Renders this block as a JSON-like fragment for debugging purposes, + * recursing into nested blocks and function arguments. + */ +auto css_consumed_block::debug_str(void) -> std::string +{ + std::string ret = fmt::format(R"("type": 
"{}", "value": )", token_type_str()); + + std::visit([&](auto &arg) { + using T = std::decay_t<decltype(arg)>; + + if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) { + /* Array of blocks */ + ret += "["; + for (const auto &block: arg) { + ret += "{"; + ret += block->debug_str(); + ret += "}, "; + } + + if (*(--ret.end()) == ' ') { + ret.pop_back(); + ret.pop_back(); /* Last ',' */ + } + ret += "]"; + } + else if constexpr (std::is_same_v<T, std::monostate>) { + /* Empty block */ + ret += R"("empty")"; + } + else if constexpr (std::is_same_v<T, css_function_block>) { + ret += R"({ "content": {"token": )"; + ret += "\"" + arg.function.debug_token_str() + "\", "; + ret += R"("arguments": [)"; + + for (const auto &block: arg.args) { + ret += "{"; + ret += block->debug_str(); + ret += "}, "; + } + if (*(--ret.end()) == ' ') { + ret.pop_back(); + ret.pop_back(); /* Last ',' */ + } + ret += "]}}"; + } + else { + /* Single element block */ + ret += "\"" + arg.debug_token_str() + "\""; + } + }, + content); + + return ret; +} + +class css_parser { +public: + css_parser(void) = delete; /* Require mempool to be set for logging */ + explicit css_parser(rspamd_mempool_t *pool) + : pool(pool) + { + style_object.reset(); + error.type = css_parse_error_type::PARSE_ERROR_NO_ERROR; + } + + /* + * This constructor takes an existing style sheet via shared_ptr (the + * pointer is copied, so the caller keeps its own reference); it does not + * destruct it on errors (we assume that it is owned somewhere else) + */ + explicit css_parser(std::shared_ptr<css_style_sheet> &&existing, rspamd_mempool_t *pool) + : style_object(existing), pool(pool) + { + error.type = css_parse_error_type::PARSE_ERROR_NO_ERROR; + } + + /* + * Process input css blocks + */ + std::unique_ptr<css_consumed_block> consume_css_blocks(const std::string_view &sv); + /* + * Process a single css rule + */ + std::unique_ptr<css_consumed_block> consume_css_rule(const std::string_view &sv); + std::optional<css_parse_error> consume_input(const std::string_view &sv); + + auto
get_object_maybe(void) -> tl::expected<std::shared_ptr<css_style_sheet>, css_parse_error> + { + if (style_object) { + return style_object; + } + + return tl::make_unexpected(error); + } + + /* Helper parser methods */ + static bool need_unescape(const std::string_view &sv); + +private: + std::shared_ptr<css_style_sheet> style_object; + std::unique_ptr<css_tokeniser> tokeniser; + + css_parse_error error; + rspamd_mempool_t *pool; + + int rec_level = 0; + const int max_rec = 20; + bool eof = false; + + /* Consumers */ + auto component_value_consumer(std::unique_ptr<css_consumed_block> &top) -> bool; + auto function_consumer(std::unique_ptr<css_consumed_block> &top) -> bool; + auto simple_block_consumer(std::unique_ptr<css_consumed_block> &top, + css_parser_token::token_type expected_end, + bool consume_current) -> bool; + auto qualified_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool; + auto at_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool; +}; + +/* + * Find if we need to unescape css + */ +bool css_parser::need_unescape(const std::string_view &sv) +{ + bool in_quote = false; + char quote_char, prev_c = 0; + + for (const auto c: sv) { + if (!in_quote) { + if (c == '"' || c == '\'') { + in_quote = true; + quote_char = c; + } + else if (c == '\\') { + return true; + } + } + else { + if (c == quote_char) { + if (prev_c != '\\') { + in_quote = false; + } + } + prev_c = c; + } + } + + return false; +} + +auto css_parser::function_consumer(std::unique_ptr<css_consumed_block> &top) -> bool +{ + auto ret = true, want_more = true; + + msg_debug_css("consume function block; top block: %s, recursion level %d", + top->token_type_str(), rec_level); + + if (++rec_level > max_rec) { + msg_err_css("max nesting reached, ignore style"); + error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING, + "maximum nesting has reached when parsing function value"); + return false; + } + + while (ret && want_more && !eof) { + auto next_token = 
tokeniser->next_token(); + + switch (next_token.type) { + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + case css_parser_token::token_type::ebrace_token: + ret = true; + want_more = false; + break; + case css_parser_token::token_type::comma_token: + case css_parser_token::token_type::delim_token: + case css_parser_token::token_type::obrace_token: + break; + default: + /* Attach everything to the function block */ + top->add_function_argument(std::make_unique<css_consumed_block>( + css::css_consumed_block::parser_tag_type::css_function_arg, + std::move(next_token))); + break; + } + } + + --rec_level; + + return ret; +} + +auto css_parser::simple_block_consumer(std::unique_ptr<css_consumed_block> &top, + css_parser_token::token_type expected_end, + bool consume_current) -> bool +{ + auto ret = true; + std::unique_ptr<css_consumed_block> block; + + msg_debug_css("consume simple block; top block: %s, recursion level %d", + top->token_type_str(), rec_level); + + if (!consume_current && ++rec_level > max_rec) { + msg_err_css("max nesting reached, ignore style"); + error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING, + "maximum nesting has reached when parsing simple block value"); + return false; + } + + if (!consume_current) { + block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_simple_block); + } + + + while (ret && !eof) { + auto next_token = tokeniser->next_token(); + + if (next_token.type == expected_end) { + break; + } + + switch (next_token.type) { + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + default: + tokeniser->pushback_token(next_token); + ret = component_value_consumer(consume_current ? 
top : block); + break; + } + } + + if (!consume_current && ret) { + msg_debug_css("attached node 'simple block' rule %s; length=%d", + block->token_type_str(), (int) block->size()); + top->attach_block(std::move(block)); + } + + if (!consume_current) { + --rec_level; + } + + return ret; +} + +auto css_parser::qualified_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool +{ + msg_debug_css("consume qualified block; top block: %s, recursion level %d", + top->token_type_str(), rec_level); + + if (++rec_level > max_rec) { + msg_err_css("max nesting reached, ignore style"); + error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING, + "maximum nesting has reached when parsing qualified rule value"); + return false; + } + + auto ret = true, want_more = true; + auto block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_qualified_rule); + + while (ret && want_more && !eof) { + auto next_token = tokeniser->next_token(); + switch (next_token.type) { + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::cdo_token: + case css_parser_token::token_type::cdc_token: + if (top->tag == css_consumed_block::parser_tag_type::css_top_block) { + /* Ignore */ + ret = true; + } + else { + } + break; + case css_parser_token::token_type::ocurlbrace_token: + ret = simple_block_consumer(block, + css_parser_token::token_type::ecurlbrace_token, false); + want_more = false; + break; + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + default: + tokeniser->pushback_token(next_token); + ret = component_value_consumer(block); + break; + }; + } + + if (ret) { + if (top->tag == css_consumed_block::parser_tag_type::css_top_block) { + msg_debug_css("attached node qualified rule %s; length=%d", + block->token_type_str(), (int) block->size()); + top->attach_block(std::move(block)); + } + } + + --rec_level; + + return ret; +} + +auto 
css_parser::at_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool +{ + msg_debug_css("consume at-rule block; top block: %s, recursion level %d", + top->token_type_str(), rec_level); + + if (++rec_level > max_rec) { + msg_err_css("max nesting reached, ignore style"); + error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING, + "maximum nesting has reached when parsing at keyword"); + return false; + } + + auto ret = true, want_more = true; + auto block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_at_rule); + + while (ret && want_more && !eof) { + auto next_token = tokeniser->next_token(); + switch (next_token.type) { + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::cdo_token: + case css_parser_token::token_type::cdc_token: + if (top->tag == css_consumed_block::parser_tag_type::css_top_block) { + /* Ignore */ + ret = true; + } + else { + } + break; + case css_parser_token::token_type::ocurlbrace_token: + ret = simple_block_consumer(block, + css_parser_token::token_type::ecurlbrace_token, false); + want_more = false; + break; + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + case css_parser_token::token_type::semicolon_token: + want_more = false; + break; + default: + tokeniser->pushback_token(next_token); + ret = component_value_consumer(block); + break; + }; + } + + if (ret) { + if (top->tag == css_consumed_block::parser_tag_type::css_top_block) { + msg_debug_css("attached node qualified rule %s; length=%d", + block->token_type_str(), (int) block->size()); + top->attach_block(std::move(block)); + } + } + + --rec_level; + + return ret; +} + +auto css_parser::component_value_consumer(std::unique_ptr<css_consumed_block> &top) -> bool +{ + auto ret = true, need_more = true; + std::unique_ptr<css_consumed_block> block; + + msg_debug_css("consume component block; top block: %s, recursion level %d", + 
top->token_type_str(), rec_level); + + if (++rec_level > max_rec) { + error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING, + "maximum nesting has reached when parsing component value"); + return false; + } + + while (ret && need_more && !eof) { + auto next_token = tokeniser->next_token(); + + switch (next_token.type) { + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::ocurlbrace_token: + block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_simple_block); + ret = simple_block_consumer(block, + css_parser_token::token_type::ecurlbrace_token, + true); + need_more = false; + break; + case css_parser_token::token_type::obrace_token: + block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_simple_block); + ret = simple_block_consumer(block, + css_parser_token::token_type::ebrace_token, + true); + need_more = false; + break; + case css_parser_token::token_type::osqbrace_token: + block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_simple_block); + ret = simple_block_consumer(block, + css_parser_token::token_type::esqbrace_token, + true); + need_more = false; + break; + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + case css_parser_token::token_type::function_token: { + need_more = false; + block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_function, + std::move(next_token)); + + /* Consume the rest */ + ret = function_consumer(block); + break; + } + default: + block = std::make_unique<css_consumed_block>( + css_consumed_block::parser_tag_type::css_component, + std::move(next_token)); + need_more = false; + break; + } + } + + if (ret && block) { + msg_debug_css("attached node component rule %s; length=%d", + block->token_type_str(), (int) block->size()); + top->attach_block(std::move(block)); + } + + --rec_level; 
+ + return ret; +} + +auto css_parser::consume_css_blocks(const std::string_view &sv) -> std::unique_ptr<css_consumed_block> +{ + tokeniser = std::make_unique<css_tokeniser>(pool, sv); + auto ret = true; + + auto consumed_blocks = + std::make_unique<css_consumed_block>(css_consumed_block::parser_tag_type::css_top_block); + + while (!eof && ret) { + auto next_token = tokeniser->next_token(); + + switch (next_token.type) { + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::at_keyword_token: + tokeniser->pushback_token(next_token); + ret = at_rule_consumer(consumed_blocks); + break; + default: + tokeniser->pushback_token(next_token); + ret = qualified_rule_consumer(consumed_blocks); + break; + } + } + + tokeniser.reset(nullptr); /* No longer needed */ + + return consumed_blocks; +} + +auto css_parser::consume_css_rule(const std::string_view &sv) -> std::unique_ptr<css_consumed_block> +{ + tokeniser = std::make_unique<css_tokeniser>(pool, sv); + auto ret = true; + + auto rule_block = + std::make_unique<css_consumed_block>(css_consumed_block::parser_tag_type::css_simple_block); + + while (!eof && ret) { + auto next_token = tokeniser->next_token(); + + switch (next_token.type) { + case css_parser_token::token_type::eof_token: + eof = true; + break; + case css_parser_token::token_type::whitespace_token: + /* Ignore whitespaces */ + break; + default: + tokeniser->pushback_token(next_token); + ret = component_value_consumer(rule_block); + break; + } + } + + tokeniser.reset(nullptr); /* No longer needed */ + + return rule_block; +} + +std::optional<css_parse_error> +css_parser::consume_input(const std::string_view &sv) +{ + auto &&consumed_blocks = consume_css_blocks(sv); + const auto &rules = consumed_blocks->get_blocks_or_empty(); + + if (rules.empty()) { + if (error.type == css_parse_error_type::PARSE_ERROR_NO_ERROR) { + 
return css_parse_error(css_parse_error_type::PARSE_ERROR_EMPTY, + "no css rules consumed"); + } + else { + return error; + } + } + + if (!style_object) { + style_object = std::make_shared<css_style_sheet>(pool); + } + + for (auto &&rule: rules) { + /* + * For now, we do not need any of the at rules, so we can safely ignore them + */ + auto &&children = rule->get_blocks_or_empty(); + + if (children.size() > 1 && + children[0]->tag == css_consumed_block::parser_tag_type::css_component) { + auto simple_block = std::find_if(children.begin(), children.end(), + [](auto &bl) { + return bl->tag == css_consumed_block::parser_tag_type::css_simple_block; + }); + + if (simple_block != children.end()) { + /* + * We have a component and a simple block, + * so we can parse a selector and then extract + * declarations from a simple block + */ + + /* First, tag all components as preamble */ + auto selector_it = children.cbegin(); + + auto selector_token_functor = [&selector_it, &simple_block](void) + -> const css_consumed_block & { + for (;;) { + if (selector_it == simple_block) { + return css_parser_eof_block; + } + + const auto &ret = (*selector_it); + + ++selector_it; + + return *ret; + } + }; + + auto selectors_vec = process_selector_tokens(pool, selector_token_functor); + + if (selectors_vec.size() > 0) { + msg_debug_css("processed %d selectors", (int) selectors_vec.size()); + auto decls_it = (*simple_block)->get_blocks_or_empty().cbegin(); + auto decls_end = (*simple_block)->get_blocks_or_empty().cend(); + auto declaration_token_functor = [&decls_it, &decls_end](void) + -> const css_consumed_block & { + for (;;) { + if (decls_it == decls_end) { + return css_parser_eof_block; + } + + const auto &ret = (*decls_it); + + ++decls_it; + + return *ret; + } + }; + + auto declarations_vec = process_declaration_tokens(pool, + declaration_token_functor); + + if (declarations_vec && !declarations_vec->get_rules().empty()) { + msg_debug_css("processed %d rules", + (int) 
declarations_vec->get_rules().size()); + + for (auto &&selector: selectors_vec) { + style_object->add_selector_rule(std::move(selector), + declarations_vec); + } + } + } + } + } + } + + auto debug_str = consumed_blocks->debug_str(); + msg_debug_css("consumed css: {%*s}", (int) debug_str.size(), debug_str.data()); + + return std::nullopt; +} + +auto get_selectors_parser_functor(rspamd_mempool_t *pool, + const std::string_view &st) -> blocks_gen_functor +{ + css_parser parser(pool); + + auto &&consumed_blocks = parser.consume_css_blocks(st); + const auto &rules = consumed_blocks->get_blocks_or_empty(); + + auto rules_it = rules.begin(); + auto &&children = (*rules_it)->get_blocks_or_empty(); + auto cur = children.begin(); + auto last = children.end(); + + /* + * We use move only wrapper to state the fact that the cosumed blocks + * are moved into the closure, not copied. + * It prevents us from thinking about copies of the blocks and + * functors. + * Mutable lambda is required to copy iterators inside of the closure, + * as, again, it is C++ where lifetime of the objects must be explicitly + * transferred. On the other hand, we could move all stuff inside and remove + * mutable. 
+ */ + return [cur, consumed_blocks = std::move(consumed_blocks), last](void) mutable + -> const css_consumed_block & { + if (cur != last) { + const auto &ret = (*cur); + + ++cur; + + return *ret; + } + + return css_parser_eof_block; + }; +} + +auto get_rules_parser_functor(rspamd_mempool_t *pool, + const std::string_view &st) -> blocks_gen_functor +{ + css_parser parser(pool); + + auto &&consumed_blocks = parser.consume_css_rule(st); + const auto &rules = consumed_blocks->get_blocks_or_empty(); + + auto cur = rules.begin(); + auto last = rules.end(); + + return [cur, consumed_blocks = std::move(consumed_blocks), last](void) mutable + -> const css_consumed_block & { + if (cur != last) { + const auto &ret = (*cur); + + ++cur; + + return *ret; + } + + return css_parser_eof_block; + }; +} + + +/* + * Wrapper for the parser + */ +auto parse_css(rspamd_mempool_t *pool, const std::string_view &st, + std::shared_ptr<css_style_sheet> &&other) + -> tl::expected<std::shared_ptr<css_style_sheet>, css_parse_error> +{ + css_parser parser(std::forward<std::shared_ptr<css_style_sheet>>(other), pool); + std::string_view processed_input; + + if (css_parser::need_unescape(st)) { + processed_input = rspamd::css::unescape_css(pool, st); + } + else { + /* Lowercase inplace */ + auto *nspace = rspamd_mempool_alloc_buffer(pool, st.size()); + rspamd_str_copy_lc(st.data(), nspace, st.size()); + processed_input = std::string_view{nspace, st.size()}; + } + + auto maybe_error = parser.consume_input(processed_input); + if (!maybe_error) { + return parser.get_object_maybe(); + } + + return tl::make_unexpected(maybe_error.value()); +} + +auto parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st) + -> rspamd::html::html_block * +{ + std::string_view processed_input; + + if (css_parser::need_unescape(st)) { + processed_input = rspamd::css::unescape_css(pool, st); + } + else { + auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, st.size())); + auto nlen = 
rspamd_str_copy_lc(st.data(), nspace, st.size()); + processed_input = std::string_view{nspace, nlen}; + } + auto &&res = process_declaration_tokens(pool, + get_rules_parser_functor(pool, processed_input)); + + if (res) { + return res->compile_to_block(pool); + } + + return nullptr; +} + +TEST_SUITE("css") +{ + TEST_CASE("parse colors") + { + const std::vector<const char *> cases{ + "P { CoLoR: rgb(100%, 50%, 0%); opacity: -1; width: 1em; display: none; } /* very transparent solid orange тест */", + "p { color: rgb(100%, 50%, 0%); opacity: 2; display: inline; } /* very transparent solid orange */", + "p { color: rgb(100%, 50%, 0%); opacity: 0.5; } /* very transparent solid orange */\n", + "p { color: rgb(100%, 50%, 0%); opacity: 1; width: 99%; } /* very transparent solid orange */\n", + "p { color: rgb(100%, 50%, 0%); opacity: 10%; width: 99%; } /* very transparent solid orange */\n", + "p { color: rgb(100%, 50%, 0%); opacity: 10%; width: 100px; } /* very transparent solid orange */\n", + "p { color: rgb(100%, 50%, 0%); opacity: 10% } /* very transparent solid orange */\n", + "* { color: hsl(0, 100%, 50%) !important } /* red */\n", + "* { color: hsl(120, 100%, 50%) important } /* lime */\n", + "* { color: hsl(120, 100%, 25%) } /* dark green */\n", + "* { color: hsl(120, 100%, 75%) } /* light green */\n", + "* { color: hsl(120, 75%, 75%) } /* pastel green, and so on */\n", + "em { color: #f00 } /* #rgb */\n", + "em { color: #ff0000 } /* #rrggbb */\n", + "em { color: rgb(255,0,0) }\n", + "em { color: rgb(100%, 0%, 0%) }\n", + "body {color: black; background: white }\n", + "h1 { color: maroon }\n", + "h2 { color: olive }\n", + "em { color: rgb(255,0,0) } /* integer range 0 - 255 */\n", + "em { color: rgb(300,0,0) } /* clipped to rgb(255,0,0) */\n", + "em { color: rgb(255,-10,0) } /* clipped to rgb(255,0,0) */\n", + "em { color: rgb(110%, 0%, 0%) } /* clipped to rgb(100%,0%,0%) */\n", + "em { color: rgb(255,0,0) } /* integer range 0 - 255 */\n", + "em { color: 
rgba(255,0,0,1) /* the same, with explicit opacity of 1 */\n", + "em { color: rgb(100%,0%,0%) } /* float range 0.0% - 100.0% */\n", + "em { color: rgba(100%,0%,0%,1) } /* the same, with explicit opacity of 1 */\n", + "p { color: rgba(0,0,255,0.5) } /* semi-transparent solid blue */\n", + "p { color: rgba(100%, 50%, 0%, 0.1) } /* very transparent solid orange */", + ".chat-icon[_ng-cnj-c0]::before{content:url(group-2.63e87cd21fbf8c966dd.svg);width:60px;height:60px;display:block}", + "tt{color:#1e3482}", + "tt{unicode-range: u+0049-u+004a,u+0020;}", + "@import url(https://fonts.googleapis.com/css?family=arial:300,400,7000;", + "tt{color:black;\v}", + "tt{color:black;\f}", + }; + + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "css", 0); + for (const auto &c: cases) { + SUBCASE((std::string("parse css: ") + c).c_str()) + { + CHECK(parse_css(pool, c, nullptr).value().get() != nullptr); + } + } + + /* We now merge all styles together */ + SUBCASE("merged css parse") + { + std::shared_ptr<css_style_sheet> merged; + for (const auto &c: cases) { + auto ret = parse_css(pool, c, std::move(merged)); + merged.swap(ret.value()); + } + + CHECK(merged.get() != nullptr); + } + + rspamd_mempool_delete(pool); + } +} +}// namespace rspamd::css diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx new file mode 100644 index 0000000..d5a9671 --- /dev/null +++ b/src/libserver/css/css_parser.hxx @@ -0,0 +1,244 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_CSS_PARSER_HXX +#define RSPAMD_CSS_PARSER_HXX + +#include <variant> +#include <vector> +#include <memory> +#include <string> + +#include "function2/function2.hpp" +#include "css_tokeniser.hxx" +#include "parse_error.hxx" +#include "contrib/expected/expected.hpp" +#include "logger.h" + +/* Forward declaration */ +namespace rspamd::html { +struct html_block; +} + +namespace rspamd::css { + +/* + * Represents a consumed token by a parser + */ +class css_consumed_block { +public: + enum class parser_tag_type : std::uint8_t { + css_top_block = 0, + css_qualified_rule, + css_at_rule, + css_simple_block, + css_function, + css_function_arg, + css_component, + css_eof_block, + }; + using consumed_block_ptr = std::unique_ptr<css_consumed_block>; + + struct css_function_block { + css_parser_token function; + std::vector<consumed_block_ptr> args; + + css_function_block(css_parser_token &&tok) + : function(std::forward<css_parser_token>(tok)) + { + } + + auto as_string() const -> std::string_view + { + return function.get_string_or_default(""); + } + + static auto empty_function() -> const css_function_block & + { + static const css_function_block invalid( + css_parser_token(css_parser_token::token_type::eof_token, + css_parser_token_placeholder())); + return invalid; + } + }; + + css_consumed_block() + : tag(parser_tag_type::css_eof_block) + { + } + css_consumed_block(parser_tag_type tag) + : tag(tag) + { + if (tag == parser_tag_type::css_top_block || + tag == parser_tag_type::css_qualified_rule || + tag == parser_tag_type::css_simple_block) { + /* Pre-allocate content for known vector blocks */ + std::vector<consumed_block_ptr> vec; + vec.reserve(4); + content = std::move(vec); + } + } + /* Construct a block from a single lexer token (for trivial blocks) */ + explicit css_consumed_block(parser_tag_type tag, css_parser_token &&tok) + 
: tag(tag) + { + if (tag == parser_tag_type::css_function) { + content = css_function_block{std::move(tok)}; + } + else { + content = std::move(tok); + } + } + + /* Attach a new block to the compound block, consuming block inside */ + auto attach_block(consumed_block_ptr &&block) -> bool; + /* Attach a new argument to the compound function block, consuming block inside */ + auto add_function_argument(consumed_block_ptr &&block) -> bool; + + auto assign_token(css_parser_token &&tok) -> void + { + content = std::move(tok); + } + + /* Empty blocks used to avoid type checks in loops */ + const inline static std::vector<consumed_block_ptr> empty_block_vec{}; + + auto is_blocks_vec() const -> bool + { + return (std::holds_alternative<std::vector<consumed_block_ptr>>(content)); + } + + auto get_blocks_or_empty() const -> const std::vector<consumed_block_ptr> & + { + if (is_blocks_vec()) { + return std::get<std::vector<consumed_block_ptr>>(content); + } + + return empty_block_vec; + } + + auto is_token() const -> bool + { + return (std::holds_alternative<css_parser_token>(content)); + } + + auto get_token_or_empty() const -> const css_parser_token & + { + if (is_token()) { + return std::get<css_parser_token>(content); + } + + return css_parser_eof_token(); + } + + auto is_function() const -> bool + { + return (std::holds_alternative<css_function_block>(content)); + } + + auto get_function_or_invalid() const -> const css_function_block & + { + if (is_function()) { + return std::get<css_function_block>(content); + } + + return css_function_block::empty_function(); + } + + auto size() const -> std::size_t + { + auto ret = 0; + + std::visit([&](auto &arg) { + using T = std::decay_t<decltype(arg)>; + + if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) { + /* Array of blocks */ + ret = arg.size(); + } + else if constexpr (std::is_same_v<T, std::monostate>) { + /* Empty block */ + ret = 0; + } + else { + /* Single element block */ + ret = 1; + } + }, + content); + 
+ return ret; + } + + auto is_eof() -> bool + { + return tag == parser_tag_type::css_eof_block; + } + + /* Debug methods */ + auto token_type_str(void) const -> const char *; + auto debug_str(void) -> std::string; + +public: + parser_tag_type tag; + +private: + std::variant<std::monostate, + std::vector<consumed_block_ptr>, + css_parser_token, + css_function_block> + content; +}; + +extern const css_consumed_block css_parser_eof_block; + +using blocks_gen_functor = fu2::unique_function<const css_consumed_block &(void)>; + +class css_style_sheet; +/* + * Update the existing stylesheet with another stylesheet + */ +auto parse_css(rspamd_mempool_t *pool, const std::string_view &st, + std::shared_ptr<css_style_sheet> &&other) + -> tl::expected<std::shared_ptr<css_style_sheet>, css_parse_error>; + +/* + * Creates a functor to consume css selectors sequence + */ +auto get_selectors_parser_functor(rspamd_mempool_t *pool, + const std::string_view &st) -> blocks_gen_functor; + +/* + * Creates a functor to process a rule definition (e.g. from embedded style tag for + * an element) + */ +auto get_rules_parser_functor(rspamd_mempool_t *pool, + const std::string_view &st) -> blocks_gen_functor; + +/** + * Parses a css declaration (e.g. embedded css and returns a completed html block) + * @param pool + * @param st + * @return + */ +auto parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st) + -> rspamd::html::html_block *; + +}// namespace rspamd::css + +#endif//RSPAMD_CSS_PARSER_HXX diff --git a/src/libserver/css/css_property.cxx b/src/libserver/css/css_property.cxx new file mode 100644 index 0000000..1557109 --- /dev/null +++ b/src/libserver/css/css_property.cxx @@ -0,0 +1,69 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_property.hxx" +#include "frozen/unordered_map.h" +#include "frozen/string.h" +#include "libutil/cxx/util.hxx" + +namespace rspamd::css { + +constexpr const auto prop_names_map = frozen::make_unordered_map<frozen::string, css_property_type>({ + {"font", css_property_type::PROPERTY_FONT}, + {"font-color", css_property_type::PROPERTY_FONT_COLOR}, + {"font-size", css_property_type::PROPERTY_FONT_SIZE}, + {"color", css_property_type::PROPERTY_COLOR}, + {"bgcolor", css_property_type::PROPERTY_BGCOLOR}, + {"background-color", css_property_type::PROPERTY_BGCOLOR}, + {"background", css_property_type::PROPERTY_BACKGROUND}, + {"height", css_property_type::PROPERTY_HEIGHT}, + {"width", css_property_type::PROPERTY_WIDTH}, + {"display", css_property_type::PROPERTY_DISPLAY}, + {"visibility", css_property_type::PROPERTY_VISIBILITY}, + {"opacity", css_property_type::PROPERTY_OPACITY}, +}); + +/* Ensure that we have all cases listed */ +static_assert(prop_names_map.size() >= static_cast<int>(css_property_type::PROPERTY_NYI)); + +auto token_string_to_property(const std::string_view &inp) + -> css_property_type +{ + + css_property_type ret = css_property_type::PROPERTY_NYI; + + auto known_type = find_map(prop_names_map, inp); + + if (known_type) { + ret = known_type.value().get(); + } + + return ret; +} + +auto css_property::from_token(const css_parser_token &tok) + -> tl::expected<css_property, css_parse_error> +{ + if (tok.type == css_parser_token::token_type::ident_token) { + auto sv = tok.get_string_or_default(""); + + return 
css_property{token_string_to_property(sv), css_property_flag::FLAG_NORMAL}; + } + + return tl::unexpected{css_parse_error(css_parse_error_type::PARSE_ERROR_NYI)}; +} + +}// namespace rspamd::css diff --git a/src/libserver/css/css_property.hxx b/src/libserver/css/css_property.hxx new file mode 100644 index 0000000..9661222 --- /dev/null +++ b/src/libserver/css/css_property.hxx @@ -0,0 +1,172 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#ifndef RSPAMD_CSS_PROPERTY_HXX +#define RSPAMD_CSS_PROPERTY_HXX + +#include <string> +#include "css_tokeniser.hxx" +#include "parse_error.hxx" +#include "contrib/expected/expected.hpp" + +namespace rspamd::css { + +/* + * To be extended with properties that are interesting from the email + * point of view + */ +enum class css_property_type : std::uint16_t { + PROPERTY_FONT = 0, + PROPERTY_FONT_COLOR, + PROPERTY_FONT_SIZE, + PROPERTY_COLOR, + PROPERTY_BGCOLOR, + PROPERTY_BACKGROUND, + PROPERTY_HEIGHT, + PROPERTY_WIDTH, + PROPERTY_DISPLAY, + PROPERTY_VISIBILITY, + PROPERTY_OPACITY, + PROPERTY_NYI, +}; + +enum class css_property_flag : std::uint16_t { + FLAG_NORMAL, + FLAG_IMPORTANT, + FLAG_NOT_IMPORTANT +}; + +struct alignas(int) css_property { + css_property_type type; + css_property_flag flag; + + css_property(css_property_type t, css_property_flag fl = css_property_flag::FLAG_NORMAL) + : type(t), flag(fl) + { + } + static tl::expected<css_property, css_parse_error> from_token( + const css_parser_token &tok); + + constexpr auto to_string(void) const -> const char * + { + const char *ret = "nyi"; + + switch (type) { + case css_property_type::PROPERTY_FONT: + ret = "font"; + break; + case css_property_type::PROPERTY_FONT_COLOR: + ret = "font-color"; + break; + case css_property_type::PROPERTY_FONT_SIZE: + ret = "font-size"; + break; + case css_property_type::PROPERTY_COLOR: + ret = "color"; + break; + case css_property_type::PROPERTY_BGCOLOR: + ret = "bgcolor"; + break; + case css_property_type::PROPERTY_BACKGROUND: + ret = "background"; + break; + case css_property_type::PROPERTY_HEIGHT: + ret = "height"; + break; + case css_property_type::PROPERTY_WIDTH: + ret = "width"; + break; + case css_property_type::PROPERTY_DISPLAY: + ret = "display"; + break; + case css_property_type::PROPERTY_VISIBILITY: + ret = "visibility"; + break; + case css_property_type::PROPERTY_OPACITY: + ret = "opacity"; + break; + default: + break; + } + + return ret; + } + 
+ /* Helpers to define which values are valid for which properties */ + auto is_color(void) const -> bool + { + return type == css_property_type::PROPERTY_COLOR || + type == css_property_type::PROPERTY_BACKGROUND || + type == css_property_type::PROPERTY_BGCOLOR || + type == css_property_type::PROPERTY_FONT_COLOR || + type == css_property_type::PROPERTY_FONT; + } + auto is_dimension(void) const -> bool + { + return type == css_property_type::PROPERTY_HEIGHT || + type == css_property_type::PROPERTY_WIDTH || + type == css_property_type::PROPERTY_FONT_SIZE || + type == css_property_type::PROPERTY_FONT; + } + + auto is_normal_number(void) const -> bool + { + return type == css_property_type::PROPERTY_OPACITY; + } + + auto is_display(void) const -> bool + { + return type == css_property_type::PROPERTY_DISPLAY; + } + + auto is_visibility(void) const -> bool + { + return type == css_property_type::PROPERTY_VISIBILITY; + } + + auto operator==(const css_property &other) const + { + return type == other.type; + } +}; + + +}// namespace rspamd::css + +/* Make properties hashable */ +namespace std { +template<> +class hash<rspamd::css::css_property> { +public: + using is_avalanching = void; + /* Mix bits to provide slightly better distribution but being constexpr */ + constexpr size_t operator()(const rspamd::css::css_property &prop) const + { + std::size_t key = 0xdeadbeef ^ static_cast<std::size_t>(prop.type); + key = (~key) + (key << 21); + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); + key = key ^ (key >> 28); + key = key + (key << 31); + return key; + } +}; +}// namespace std + +#endif//RSPAMD_CSS_PROPERTY_HXX
\ No newline at end of file diff --git a/src/libserver/css/css_rule.cxx b/src/libserver/css/css_rule.cxx new file mode 100644 index 0000000..4e33ac7 --- /dev/null +++ b/src/libserver/css/css_rule.cxx @@ -0,0 +1,531 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_rule.hxx" +#include "css.hxx" +#include "libserver/html/html_block.hxx" +#include <limits> + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +namespace rspamd::css { + +/* Class methods */ +void css_rule::override_values(const css_rule &other) +{ + int bits = 0; + /* Ensure that our bitset is large enough */ + static_assert(1 << std::variant_size_v<decltype(css_value::value)> < + std::numeric_limits<int>::max()); + + for (const auto &v: values) { + bits |= static_cast<int>(1 << v.value.index()); + } + + for (const auto &ov: other.values) { + if (isset(&bits, static_cast<int>(1 << ov.value.index()))) { + /* We need to override the existing value */ + /* + * The algorithm is not very efficient, + * so we need to sort the values first and have a O(N) algorithm + * On the other hand, values vectors are usually limited to the + * number of elements about less then 10, so this O(N^2) algorithm + * is probably ok here + */ + for (auto &v: values) { + if (v.value.index() == ov.value.index()) { + v = ov; + } + } + } + } + + /* Copy only not set values */ + std::copy_if(other.values.begin(), other.values.end(), 
std::back_inserter(values), + [&bits](const auto &elt) -> bool { + return (bits & (1 << static_cast<int>(elt.value.index()))) == 0; + }); +} + +void css_rule::merge_values(const css_rule &other) +{ + unsigned int bits = 0; + + for (const auto &v: values) { + bits |= 1 << v.value.index(); + } + + /* Copy only not set values */ + std::copy_if(other.values.begin(), other.values.end(), std::back_inserter(values), + [&bits](const auto &elt) -> bool { + return (bits & (1 << elt.value.index())) == 0; + }); +} + +auto css_declarations_block::add_rule(rule_shared_ptr rule) -> bool +{ + auto it = rules.find(rule); + auto &&remote_prop = rule->get_prop(); + auto ret = true; + + if (rule->get_values().size() == 0) { + /* Ignore rules with no values */ + return false; + } + + if (it != rules.end()) { + auto &&local_rule = *it; + auto &&local_prop = local_rule->get_prop(); + + if (local_prop.flag == css_property_flag::FLAG_IMPORTANT) { + if (remote_prop.flag == css_property_flag::FLAG_IMPORTANT) { + local_rule->override_values(*rule); + } + else { + /* Override remote not important over local important */ + local_rule->merge_values(*rule); + } + } + else if (local_prop.flag == css_property_flag::FLAG_NOT_IMPORTANT) { + if (remote_prop.flag == css_property_flag::FLAG_NOT_IMPORTANT) { + local_rule->override_values(*rule); + } + else { + /* Override local not important over important */ + local_rule->merge_values(*rule); + } + } + else { + if (remote_prop.flag == css_property_flag::FLAG_IMPORTANT) { + /* Override with remote */ + local_rule->override_values(*rule); + } + else if (remote_prop.flag == css_property_flag::FLAG_NOT_IMPORTANT) { + /* Ignore remote not important over local normal */ + ret = false; + } + else { + /* Merge both */ + local_rule->merge_values(*rule); + } + } + } + else { + rules.insert(std::move(rule)); + } + + return ret; +} + +}// namespace rspamd::css + +namespace rspamd::css { + +/* Static functions */ + +static auto +allowed_property_value(const 
css_property &prop, const css_consumed_block &parser_block) + -> std::optional<css_value> +{ + if (prop.is_color()) { + if (parser_block.is_token()) { + /* A single token */ + const auto &tok = parser_block.get_token_or_empty(); + + if (tok.type == css_parser_token::token_type::hash_token) { + return css_value::maybe_color_from_hex(tok.get_string_or_default("")); + } + else if (tok.type == css_parser_token::token_type::ident_token) { + auto &&ret = css_value::maybe_color_from_string(tok.get_string_or_default("")); + + return ret; + } + } + else if (parser_block.is_function()) { + const auto &func = parser_block.get_function_or_invalid(); + + auto &&ret = css_value::maybe_color_from_function(func); + return ret; + } + } + if (prop.is_dimension()) { + if (parser_block.is_token()) { + /* A single token */ + const auto &tok = parser_block.get_token_or_empty(); + + if (tok.type == css_parser_token::token_type::number_token) { + return css_value::maybe_dimension_from_number(tok); + } + } + } + if (prop.is_display()) { + if (parser_block.is_token()) { + /* A single token */ + const auto &tok = parser_block.get_token_or_empty(); + + if (tok.type == css_parser_token::token_type::ident_token) { + return css_value::maybe_display_from_string(tok.get_string_or_default("")); + } + } + } + if (prop.is_visibility()) { + if (parser_block.is_token()) { + /* A single token */ + const auto &tok = parser_block.get_token_or_empty(); + + if (tok.type == css_parser_token::token_type::ident_token) { + return css_value::maybe_display_from_string(tok.get_string_or_default("")); + } + } + } + if (prop.is_normal_number()) { + if (parser_block.is_token()) { + /* A single token */ + const auto &tok = parser_block.get_token_or_empty(); + + if (tok.type == css_parser_token::token_type::number_token) { + return css_value{tok.get_normal_number_or_default(0)}; + } + } + } + + return std::nullopt; +} + +auto process_declaration_tokens(rspamd_mempool_t *pool, + blocks_gen_functor &&next_block_functor) 
+ -> css_declarations_block_ptr +{ + css_declarations_block_ptr ret; + bool can_continue = true; + css_property cur_property{css_property_type::PROPERTY_NYI, + css_property_flag::FLAG_NORMAL}; + static const css_property bad_property{css_property_type::PROPERTY_NYI, + css_property_flag::FLAG_NORMAL}; + std::shared_ptr<css_rule> cur_rule; + + enum { + parse_property, + parse_value, + ignore_value, /* For unknown properties */ + } state = parse_property; + + auto seen_not = false; + ret = std::make_shared<css_declarations_block>(); + + while (can_continue) { + const auto &next_tok = next_block_functor(); + + switch (next_tok.tag) { + case css_consumed_block::parser_tag_type::css_component: + /* Component can be a property or a compound list of values */ + if (state == parse_property) { + cur_property = css_property::from_token(next_tok.get_token_or_empty()) + .value_or(bad_property); + + if (cur_property.type == css_property_type::PROPERTY_NYI) { + state = ignore_value; + /* Ignore everything till ; */ + continue; + } + + msg_debug_css("got css property: %s", cur_property.to_string()); + + /* We now expect colon block */ + const auto &expect_colon_block = next_block_functor(); + + if (expect_colon_block.tag != css_consumed_block::parser_tag_type::css_component) { + state = ignore_value; /* Ignore up to the next rule */ + } + else { + const auto &expect_colon_tok = expect_colon_block.get_token_or_empty(); + + if (expect_colon_tok.type != css_parser_token::token_type::colon_token) { + msg_debug_css("invalid rule, no colon after property"); + state = ignore_value; /* Ignore up to the next rule */ + } + else { + state = parse_value; + cur_rule = std::make_shared<css_rule>(cur_property); + } + } + } + else if (state == parse_value) { + /* Check semicolon */ + if (next_tok.is_token()) { + const auto &parser_tok = next_tok.get_token_or_empty(); + + if (parser_tok.type == css_parser_token::token_type::semicolon_token && cur_rule) { + ret->add_rule(std::move(cur_rule)); + 
state = parse_property; + seen_not = false; + continue; + } + else if (parser_tok.type == css_parser_token::token_type::delim_token) { + if (parser_tok.get_string_or_default("") == "!") { + /* Probably something like !important */ + seen_not = true; + } + } + else if (parser_tok.type == css_parser_token::token_type::ident_token) { + if (parser_tok.get_string_or_default("") == "important") { + if (seen_not) { + msg_debug_css("add !important flag to property %s", + cur_property.to_string()); + cur_property.flag = css_property_flag::FLAG_NOT_IMPORTANT; + } + else { + msg_debug_css("add important flag to property %s", + cur_property.to_string()); + cur_property.flag = css_property_flag::FLAG_IMPORTANT; + } + + seen_not = false; + + continue; + } + else { + seen_not = false; + } + } + } + + auto maybe_value = allowed_property_value(cur_property, next_tok); + + if (maybe_value) { + msg_debug_css("added value %s to the property %s", + maybe_value.value().debug_str().c_str(), + cur_property.to_string()); + cur_rule->add_value(maybe_value.value()); + } + } + else { + /* Ignore all till ; */ + if (next_tok.is_token()) { + const auto &parser_tok = next_tok.get_token_or_empty(); + + if (parser_tok.type == css_parser_token::token_type::semicolon_token) { + state = parse_property; + } + } + } + break; + case css_consumed_block::parser_tag_type::css_function: + if (state == parse_value) { + auto maybe_value = allowed_property_value(cur_property, next_tok); + + if (maybe_value && cur_rule) { + msg_debug_css("added value %s to the property %s", + maybe_value.value().debug_str().c_str(), + cur_property.to_string()); + cur_rule->add_value(maybe_value.value()); + } + } + break; + case css_consumed_block::parser_tag_type::css_eof_block: + if (state == parse_value) { + ret->add_rule(std::move(cur_rule)); + } + can_continue = false; + break; + default: + can_continue = false; + break; + } + } + + return ret; /* copy elision */ +} + +auto css_declarations_block::merge_block(const 
css_declarations_block &other, merge_type how) -> void +{ + const auto &other_rules = other.get_rules(); + + + for (auto &rule: other_rules) { + auto &&found_it = rules.find(rule); + + if (found_it != rules.end()) { + /* Duplicate, need to merge */ + switch (how) { + case merge_type::merge_override: + /* Override */ + (*found_it)->override_values(*rule); + break; + case merge_type::merge_duplicate: + /* Merge values */ + add_rule(rule); + break; + case merge_type::merge_parent: + /* Do not merge parent rule if more specific local one is presented */ + break; + } + } + else { + /* New property, just insert */ + rules.insert(rule); + } + } +} + +auto css_declarations_block::compile_to_block(rspamd_mempool_t *pool) const -> rspamd::html::html_block * +{ + auto *block = rspamd_mempool_alloc0_type(pool, rspamd::html::html_block); + auto opacity = -1; + const css_rule *font_rule = nullptr, *background_rule = nullptr; + + for (const auto &rule: rules) { + auto prop = rule->get_prop().type; + const auto &vals = rule->get_values(); + + if (vals.empty()) { + continue; + } + + switch (prop) { + case css_property_type::PROPERTY_VISIBILITY: + case css_property_type::PROPERTY_DISPLAY: { + auto disp = vals.back().to_display().value_or(css_display_value::DISPLAY_INLINE); + block->set_display(disp); + break; + } + case css_property_type::PROPERTY_FONT_SIZE: { + auto fs = vals.back().to_dimension(); + if (fs) { + block->set_font_size(fs.value().dim, fs.value().is_percent); + } + } + case css_property_type::PROPERTY_OPACITY: { + opacity = vals.back().to_number().value_or(opacity); + break; + } + case css_property_type::PROPERTY_FONT_COLOR: + case css_property_type::PROPERTY_COLOR: { + auto color = vals.back().to_color(); + if (color) { + block->set_fgcolor(color.value()); + } + break; + } + case css_property_type::PROPERTY_BGCOLOR: { + auto color = vals.back().to_color(); + if (color) { + block->set_bgcolor(color.value()); + } + break; + } + case css_property_type::PROPERTY_HEIGHT: { 
+ auto w = vals.back().to_dimension(); + if (w) { + block->set_width(w.value().dim, w.value().is_percent); + } + break; + } + case css_property_type::PROPERTY_WIDTH: { + auto h = vals.back().to_dimension(); + if (h) { + block->set_width(h.value().dim, h.value().is_percent); + } + break; + } + /* Optional attributes */ + case css_property_type::PROPERTY_FONT: + font_rule = rule.get(); + break; + case css_property_type::PROPERTY_BACKGROUND: + background_rule = rule.get(); + break; + default: + /* Do nothing for now */ + break; + } + } + + /* Optional properties */ + if (!(block->fg_color_mask) && font_rule) { + auto &vals = font_rule->get_values(); + + for (const auto &val: vals) { + auto maybe_color = val.to_color(); + + if (maybe_color) { + block->set_fgcolor(maybe_color.value()); + } + } + } + + if (!(block->font_mask) && font_rule) { + auto &vals = font_rule->get_values(); + + for (const auto &val: vals) { + auto maybe_dim = val.to_dimension(); + + if (maybe_dim) { + block->set_font_size(maybe_dim.value().dim, maybe_dim.value().is_percent); + } + } + } + + if (!(block->bg_color_mask) && background_rule) { + auto &vals = background_rule->get_values(); + + for (const auto &val: vals) { + auto maybe_color = val.to_color(); + + if (maybe_color) { + block->set_bgcolor(maybe_color.value()); + } + } + } + + return block; +} + +void css_rule::add_value(const css_value &value) +{ + values.push_back(value); +} + + +TEST_SUITE("css") +{ + TEST_CASE("simple css rules") + { + const std::vector<std::pair<const char *, std::vector<css_property>>> cases{ + {"font-size:12.0pt;line-height:115%", + {css_property(css_property_type::PROPERTY_FONT_SIZE)}}, + {"font-size:12.0pt;display:none", + {css_property(css_property_type::PROPERTY_FONT_SIZE), + css_property(css_property_type::PROPERTY_DISPLAY)}}}; + + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "css", 0); + + for (const auto &c: cases) { + auto res = process_declaration_tokens(pool, + 
get_rules_parser_functor(pool, c.first)); + + CHECK(res.get() != nullptr); + + for (auto i = 0; i < c.second.size(); i++) { + CHECK(res->has_property(c.second[i])); + } + } + } +} + +}// namespace rspamd::css
\ No newline at end of file diff --git a/src/libserver/css/css_rule.hxx b/src/libserver/css/css_rule.hxx new file mode 100644 index 0000000..114b83e --- /dev/null +++ b/src/libserver/css/css_rule.hxx @@ -0,0 +1,153 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef RSPAMD_CSS_RULE_HXX +#define RSPAMD_CSS_RULE_HXX + +#include "css_value.hxx" +#include "css_property.hxx" +#include "css_parser.hxx" +#include "contrib/ankerl/unordered_dense.h" +#include "libutil/cxx/util.hxx" +#include "libutil/cxx/hash_util.hxx" +#include <vector> +#include <memory> + +namespace rspamd::html { +/* Forward declaration */ +struct html_block; +}// namespace rspamd::html + +namespace rspamd::css { + +class css_rule { + css_property prop; + using css_values_vec = std::vector<css_value>; + css_values_vec values; + +public: + /* We must create css rule explicitly from a property and values */ + css_rule() = delete; + + css_rule(const css_rule &other) = delete; + + /* Constructors */ + css_rule(css_rule &&other) noexcept = default; + + explicit css_rule(css_property &&prop, css_values_vec &&values) noexcept + : prop(prop), values(std::forward<css_values_vec>(values)) + { + } + + explicit css_rule(const css_property &prop) noexcept + : prop(prop), values{} + { + } + + /* Methods */ + /* Comparison is special, as we care merely about property, not the values */ + auto operator==(const css_rule &other) const + { + 
return prop == other.prop; + } + + constexpr const css_values_vec &get_values(void) const + { + return values; + } + constexpr const css_property &get_prop(void) const + { + return prop; + } + + /* Import values from another rules according to the importance */ + void override_values(const css_rule &other); + void merge_values(const css_rule &other); + void add_value(const css_value &value); +}; + +}// namespace rspamd::css + +/* Make rules hashable by property */ +namespace std { +template<> +class hash<rspamd::css::css_rule> { +public: + using is_avalanching = void; + constexpr auto operator()(const rspamd::css::css_rule &rule) const -> auto + { + return hash<rspamd::css::css_property>()(rule.get_prop()); + } +}; + +}// namespace std + +namespace rspamd::css { + +/** + * Class that is designed to hold css declaration (a set of rules) + */ +class css_declarations_block { +public: + using rule_shared_ptr = std::shared_ptr<css_rule>; + using rule_shared_hash = smart_ptr_hash<css_rule>; + using rule_shared_eq = smart_ptr_equal<css_rule>; + enum class merge_type { + merge_duplicate, + merge_parent, + merge_override + }; + + css_declarations_block() = default; + auto add_rule(rule_shared_ptr rule) -> bool; + auto merge_block(const css_declarations_block &other, + merge_type how = merge_type::merge_duplicate) -> void; + auto get_rules(void) const -> const auto & + { + return rules; + } + + /** + * Returns if a declaration block has some property + * @param prop + * @return + */ + auto has_property(const css_property &prop) const -> bool + { + return (rules.find(css_rule{prop}) != rules.end()); + } + + /** + * Compile CSS declaration to the html block + * @param pool used to carry memory required for html_block + * @return html block structure + */ + auto compile_to_block(rspamd_mempool_t *pool) const -> rspamd::html::html_block *; + +private: + ankerl::unordered_dense::set<rule_shared_ptr, rule_shared_hash, rule_shared_eq> rules; +}; + +using css_declarations_block_ptr 
= std::shared_ptr<css_declarations_block>; + +auto process_declaration_tokens(rspamd_mempool_t *pool, + blocks_gen_functor &&next_token_functor) + -> css_declarations_block_ptr; + +}// namespace rspamd::css + +#endif//RSPAMD_CSS_RULE_HXX
\ No newline at end of file
diff --git a/src/libserver/css/css_rule_parser.rl b/src/libserver/css/css_rule_parser.rl
new file mode 100644
index 0000000..e3b1876
--- /dev/null
+++ b/src/libserver/css/css_rule_parser.rl
@@ -0,0 +1,27 @@
%%{
  machine css_parser;
  alphtype unsigned char;
  include css_syntax "css_syntax.rl";

  main := declaration;
}%%

%% write data;

#include <cstddef>

namespace rspamd::css {

/*
 * Runs the `declaration` machine over [data, data + len) and returns the
 * final state of the machine.
 */
int
foo (const unsigned char *data, std::size_t len)
{
  /*
   * The whole input is passed at once, so eof is the end of the buffer;
   * it must not be left uninitialised as the generated code may compare
   * p against it.
   */
  const unsigned char *p = data, *pe = data + len, *eof = pe;
  int cs;

  %% write init;
  %% write exec;

  return cs;
}

}
\ No newline at end of file diff --git a/src/libserver/css/css_selector.cxx b/src/libserver/css/css_selector.cxx new file mode 100644 index 0000000..a62ffff --- /dev/null +++ b/src/libserver/css/css_selector.cxx @@ -0,0 +1,226 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_selector.hxx" +#include "css.hxx" +#include "libserver/html/html.hxx" +#include "fmt/core.h" +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +namespace rspamd::css { + +auto process_selector_tokens(rspamd_mempool_t *pool, + blocks_gen_functor &&next_token_functor) + -> selectors_vec +{ + selectors_vec ret; + bool can_continue = true; + enum class selector_process_state { + selector_parse_start = 0, + selector_expect_ident, + selector_ident_consumed, + selector_ignore_attribute, + selector_ignore_function, + selector_ignore_combination + } state = selector_process_state::selector_parse_start; + std::unique_ptr<css_selector> cur_selector; + + + while (can_continue) { + const auto &next_tok = next_token_functor(); + + if (next_tok.tag == css_consumed_block::parser_tag_type::css_component) { + const auto &parser_tok = next_tok.get_token_or_empty(); + + if (state == selector_process_state::selector_parse_start) { + /* + * At the beginning of the parsing we can expect either + * delim or an ident, everything else is discarded for now + */ + msg_debug_css("start consume selector"); + + switch 
(parser_tok.type) { + case css_parser_token::token_type::delim_token: { + auto delim_c = parser_tok.get_delim(); + + if (delim_c == '.') { + cur_selector = std::make_unique<css_selector>( + css_selector::selector_type::SELECTOR_CLASS); + state = selector_process_state::selector_expect_ident; + } + else if (delim_c == '#') { + cur_selector = std::make_unique<css_selector>( + css_selector::selector_type::SELECTOR_ID); + state = selector_process_state::selector_expect_ident; + } + else if (delim_c == '*') { + cur_selector = std::make_unique<css_selector>( + css_selector::selector_type::SELECTOR_ALL); + state = selector_process_state::selector_ident_consumed; + } + break; + } + case css_parser_token::token_type::ident_token: { + auto tag_id = html::html_tag_by_name(parser_tok.get_string_or_default("")); + + if (tag_id) { + cur_selector = std::make_unique<css_selector>(tag_id.value()); + } + state = selector_process_state::selector_ident_consumed; + break; + } + case css_parser_token::token_type::hash_token: + cur_selector = std::make_unique<css_selector>( + css_selector::selector_type::SELECTOR_ID); + cur_selector->value = + parser_tok.get_string_or_default(""); + state = selector_process_state::selector_ident_consumed; + break; + default: + msg_debug_css("cannot consume more of a selector, invalid parser token: %s; expected start", + next_tok.token_type_str()); + can_continue = false; + break; + } + } + else if (state == selector_process_state::selector_expect_ident) { + /* + * We got something like a selector start, so we expect + * a plain ident + */ + if (parser_tok.type == css_parser_token::token_type::ident_token && cur_selector) { + cur_selector->value = parser_tok.get_string_or_default(""); + state = selector_process_state::selector_ident_consumed; + } + else { + msg_debug_css("cannot consume more of a selector, invalid parser token: %s; expected ident", + next_tok.token_type_str()); + can_continue = false; + } + } + else if (state == 
selector_process_state::selector_ident_consumed) { + if (parser_tok.type == css_parser_token::token_type::comma_token && cur_selector) { + /* Got full selector, attach it to the vector and go further */ + msg_debug_css("attached selector: %s", cur_selector->debug_str().c_str()); + ret.push_back(std::move(cur_selector)); + state = selector_process_state::selector_parse_start; + } + else if (parser_tok.type == css_parser_token::token_type::semicolon_token) { + /* TODO: implement adjustments */ + state = selector_process_state::selector_ignore_function; + } + else if (parser_tok.type == css_parser_token::token_type::osqbrace_token) { + /* TODO: implement attributes checks */ + state = selector_process_state::selector_ignore_attribute; + } + else { + /* TODO: implement selectors combinations */ + state = selector_process_state::selector_ignore_combination; + } + } + else { + /* Ignore state; ignore all till ',' token or eof token */ + if (parser_tok.type == css_parser_token::token_type::comma_token && cur_selector) { + /* Got full selector, attach it to the vector and go further */ + ret.push_back(std::move(cur_selector)); + state = selector_process_state::selector_parse_start; + } + else { + auto debug_str = parser_tok.get_string_or_default(""); + msg_debug_css("ignore token %*s", (int) debug_str.size(), + debug_str.data()); + } + } + } + else { + /* End of parsing */ + if (state == selector_process_state::selector_ident_consumed && cur_selector) { + msg_debug_css("attached selector: %s", cur_selector->debug_str().c_str()); + ret.push_back(std::move(cur_selector)); + } + else { + msg_debug_css("not attached selector, state: %d", static_cast<int>(state)); + } + can_continue = false; + } + } + + return ret; /* copy elision */ +} + +auto css_selector::debug_str() const -> std::string +{ + std::string ret; + + if (type == selector_type::SELECTOR_ID) { + ret += "#"; + } + else if (type == selector_type::SELECTOR_CLASS) { + ret += "."; + } + else if (type == 
selector_type::SELECTOR_ALL) { + ret = "*"; + + return ret; + } + + std::visit([&](auto arg) -> void { + using T = std::decay_t<decltype(arg)>; + + if constexpr (std::is_same_v<T, tag_id_t>) { + ret += fmt::format("tag: {}", static_cast<int>(arg)); + } + else { + ret += arg; + } + }, + value); + + return ret; +} + +TEST_SUITE("css") +{ + TEST_CASE("simple css selectors") + { + const std::vector<std::pair<const char *, std::vector<css_selector::selector_type>>> cases{ + {"em", {css_selector::selector_type::SELECTOR_TAG}}, + {"*", {css_selector::selector_type::SELECTOR_ALL}}, + {".class", {css_selector::selector_type::SELECTOR_CLASS}}, + {"#id", {css_selector::selector_type::SELECTOR_ID}}, + {"em,.class,#id", {css_selector::selector_type::SELECTOR_TAG, css_selector::selector_type::SELECTOR_CLASS, css_selector::selector_type::SELECTOR_ID}}, + }; + + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "css", 0); + + for (const auto &c: cases) { + auto res = process_selector_tokens(pool, + get_selectors_parser_functor(pool, c.first)); + + CHECK(c.second.size() == res.size()); + + for (auto i = 0; i < c.second.size(); i++) { + CHECK(res[i]->type == c.second[i]); + } + } + + rspamd_mempool_delete(pool); + } +} + +}// namespace rspamd::css diff --git a/src/libserver/css/css_selector.hxx b/src/libserver/css/css_selector.hxx new file mode 100644 index 0000000..65b185a --- /dev/null +++ b/src/libserver/css/css_selector.hxx @@ -0,0 +1,134 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_CSS_SELECTOR_HXX +#define RSPAMD_CSS_SELECTOR_HXX + +#include <variant> +#include <string> +#include <optional> +#include <vector> +#include <memory> + +#include "function2/function2.hpp" +#include "parse_error.hxx" +#include "css_parser.hxx" +#include "libserver/html/html_tags.h" +#include "libcryptobox/cryptobox.h" + +namespace rspamd::css { + +/* + * Holds a value for css selector, internal is handled by variant + */ +struct css_selector { + enum class selector_type { + SELECTOR_TAG, /* e.g. tr, for this value we use tag_id_t */ + SELECTOR_CLASS, /* generic class, e.g. .class */ + SELECTOR_ID, /* e.g. #id */ + SELECTOR_ALL /* * selector */ + }; + + selector_type type; + std::variant<tag_id_t, std::string_view> value; + + /* Conditions for the css selector */ + /* Dependency on attributes */ + struct css_attribute_condition { + std::string_view attribute; + std::string_view op = ""; + std::string_view value = ""; + }; + + /* General dependency chain */ + using css_selector_ptr = std::unique_ptr<css_selector>; + using css_selector_dep = std::variant<css_attribute_condition, css_selector_ptr>; + std::vector<css_selector_dep> dependencies; + + auto to_tag(void) const -> std::optional<tag_id_t> + { + if (type == selector_type::SELECTOR_TAG) { + return std::get<tag_id_t>(value); + } + return std::nullopt; + } + + auto to_string(void) const -> std::optional<const std::string_view> + { + if (type != selector_type::SELECTOR_TAG) { + return std::string_view(std::get<std::string_view>(value)); + } + return std::nullopt; + }; + + explicit css_selector(selector_type t) + : type(t) + { + } + explicit css_selector(tag_id_t t) + : type(selector_type::SELECTOR_TAG) + { + value = t; + } + explicit css_selector(const std::string_view &st, selector_type t = selector_type::SELECTOR_ID) + : type(t) + { + value = st; + } + + auto 
operator==(const css_selector &other) const -> bool + { + return type == other.type && value == other.value; + } + + auto debug_str(void) const -> std::string; +}; + + +using selectors_vec = std::vector<std::unique_ptr<css_selector>>; + +/* + * Consume selectors token and split them to the list of selectors + */ +auto process_selector_tokens(rspamd_mempool_t *pool, + blocks_gen_functor &&next_token_functor) + -> selectors_vec; + +}// namespace rspamd::css + +/* Selectors hashing */ +namespace std { +template<> +class hash<rspamd::css::css_selector> { +public: + using is_avalanching = void; + auto operator()(const rspamd::css::css_selector &sel) const -> std::size_t + { + if (sel.type == rspamd::css::css_selector::selector_type::SELECTOR_TAG) { + return static_cast<std::size_t>(std::get<tag_id_t>(sel.value)); + } + else { + const auto &sv = std::get<std::string_view>(sel.value); + + return rspamd_cryptobox_fast_hash(sv.data(), sv.size(), 0xdeadbabe); + } + } +}; +}// namespace std + +#endif//RSPAMD_CSS_SELECTOR_HXX diff --git a/src/libserver/css/css_selector_parser.rl b/src/libserver/css/css_selector_parser.rl new file mode 100644 index 0000000..f5ae936 --- /dev/null +++ b/src/libserver/css/css_selector_parser.rl @@ -0,0 +1,27 @@ +%%{ + machine css_parser; + alphtype unsigned char; + include css_syntax "css_syntax.rl"; + + main := selectors_group; +}%% + +%% write data; + +#include <cstddef> + +namespace rspamd::css { + +int +parse_css_selector (const unsigned char *data, std::size_t len) +{ + const unsigned char *p = data, *pe = data + len, *eof; + int cs; + + %% write init; + %% write exec; + + return cs; +} + +}
\ No newline at end of file diff --git a/src/libserver/css/css_style.hxx b/src/libserver/css/css_style.hxx new file mode 100644 index 0000000..429e58f --- /dev/null +++ b/src/libserver/css/css_style.hxx @@ -0,0 +1,66 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_CSS_STYLE_HXX +#define RSPAMD_CSS_STYLE_HXX + +#include <memory> +#include <vector> +#include "css_rule.hxx" +#include "css_selector.hxx" + +namespace rspamd::css { + +/* + * Full CSS style representation + */ +class css_style { +public: + /* Make class trivial */ + css_style(const css_style &other) = default; + + css_style(const std::shared_ptr<css_style> &_parent) + : parent(_parent) + { + propagate_from_parent(); + } + css_style(const std::shared_ptr<css_style> &_parent, + const std::vector<std::shared_ptr<css_selector>> &_selectors) + : parent(_parent) + { + selectors.reserve(_selectors.size()); + + for (const auto &sel_ptr: _selectors) { + selectors.emplace_back(sel_ptr); + } + + propagate_from_parent(); + } + +private: + std::vector<std::weak_ptr<css_selector>> selectors; + std::weak_ptr<css_style> parent; + std::vector<css_rule> rules; + +private: + void propagate_from_parent(void); /* Construct full style using parent */ +}; + +}// namespace rspamd::css + +#endif//RSPAMD_CSS_STYLE_HXX diff --git a/src/libserver/css/css_syntax.rl b/src/libserver/css/css_syntax.rl new file mode 100644 index 0000000..93da44b 
--- /dev/null
+++ b/src/libserver/css/css_syntax.rl
@@ -0,0 +1,112 @@
+%%{
+  # CSS3 EBNF derived
+  machine css_syntax;
+
+  # Primitive Atoms
+  COMMENT = (
+    '/*' ( any )* :>> '*/'
+  );
+  # A string is delimited by a matching pair of double or single quotes
+  QUOTED_STRING = ('"' ( [^"\\] | /\\./ )* '"') | ("'" ( [^'\\] | /\\./ )* "'");
+  BARE_URL_CHARS = ((0x21
+    | 0x23..0x26
+    | 0x2A..0xFF)+);
+  BARE_URL = BARE_URL_CHARS;
+  URL = 'url(' ( QUOTED_STRING | space* BARE_URL space* ) ')';
+  nonascii = [^0x00-0x7F];
+  nmstart = ([_a-zA-Z] | nonascii);
+  nmchar = ([_a-zA-Z0-9] | 0x2D | nonascii);
+  name = nmchar+;
+  num = ([0-9]+ | ([0-9]* '.' [0-9]+));
+  CRLF = "\r\n" | ("\r" [^\n]) | ([^\r] "\n");
+  IDENT = ([\-]? nmstart nmchar*);
+  ATTR = 'attr(' IDENT ')';
+
+  DIMENSION = '-'? num space? ( 'ch' | 'cm' | 'em' | 'ex' | 'fr' | 'in' | 'mm' | 'pc' | 'pt' | 'px' | 'Q' | 'rem' | 'vh' | 'vmax' | 'vmin' | 'vw' | 'dpi' );
+  NUMBER = '-'? num;
+  HASH = '#' name;
+  HEX = '#' [0-9a-fA-F]{1,6};
+  PERCENTAGE = '-'? num '%';
+  INCLUDES = '~=';
+  DASHMATCH = '|=';
+  PREFIXMATCH = '^=';
+  SUFFIXMATCH = '$=';
+  SUBSTRINGMATCH = '*=';
+  PLUS = '+';
+  GREATER = '>';
+  COMMA = ',';
+  TILDE = '~';
+  S = space;
+
+  # Property name
+  property = ( QUOTED_STRING | IDENT );
+
+  # Values
+  important = space* '!' space* 'important';
+  expression = ( ( '+' | PERCENTAGE | URL | ATTR | HEX | '-' | DIMENSION | NUMBER | QUOTED_STRING | IDENT | ',') S* )+;
+  functional_pseudo = (IDENT - ('attr'|'url')) '(' space* expression? ')';
+  value = ( URL | ATTR | PLUS | HEX | PERCENTAGE | '-' | DIMENSION | NUMBER | QUOTED_STRING | IDENT | functional_pseudo);
+  values = value (space value | '/' value )* ( space* ',' space* value (space value | '/' value )* )* important?;
+
+  # Declaration definition
+  declaration = (property space? ':' (property ':')* space? values);
+
+  # Selectors
+  class = '.' IDENT;
+  element_name = IDENT;
+  namespace_prefix = ( IDENT | '*' )? '|';
+  type_selector = namespace_prefix? element_name;
+  universal = namespace_prefix? '*';
+  attrib = '[' space* namespace_prefix? IDENT space* ( ( PREFIXMATCH | SUFFIXMATCH | SUBSTRINGMATCH | '=' | INCLUDES | DASHMATCH ) space* ( IDENT | QUOTED_STRING ) space* )? ']';
+  pseudo = ':' ':'? ( IDENT | functional_pseudo );
+  atrule = '@' IDENT;
+  mediaquery_selector = '(' declaration ')';
+  negation_arg = type_selector
+    | universal
+    | HASH
+    | class
+    | attrib
+    | pseudo;
+  # NOTE(review): union binds weaker than concatenation, so this reads as 'NOT' | ('not' ... ')') and has no '(' - confirm the intent
+  negation = 'NOT'|'not' space* negation_arg space* ')';
+  # Haha, so simple...
+  # there should be also mediaquery_selector but it makes grammar too large, so rip it off
+  simple_selector_sequence = ( type_selector | universal ) ( HASH | class | attrib | pseudo | negation | atrule )*
+    | ( HASH | class | attrib | pseudo | negation | atrule )+;
+  combinator = space* PLUS space*
+    | space* GREATER space*
+    | space* TILDE space*
+    | space+;
+  # Combine simple stuff and obtain just... an ordinary selector, bingo
+  selector = simple_selector_sequence ( combinator simple_selector_sequence )*;
+  # Multiple beasts
+  selectors_group = selector ( COMMENT? ',' space* selector )*;
+
+  # Rules
+  # This is mostly used stuff
+  rule = selectors_group space? "{" space*
+    (COMMENT? space* declaration ( space? ";" space? declaration?)* ";"? space?)* COMMENT* space* '}';
+  query_declaration = rule;
+
+  # Areas used in css
+  arearule = '@'('bottom-left'|'bottom-right'|'top-left'|'top-right');
+  areaquery = arearule space? '{' space* (COMMENT? space* declaration ( S? ';' S? declaration?)* ';'? space?)* COMMENT* space* '}';
+  # Printed media stuff, useless but we have to parse it :(
+  printcssrule = '@media print';
+  pagearea = ':'('left'|'right');
+  pagerule = '@page' space? pagearea?;
+  pagequery = pagerule space? '{' space* (areaquery| (COMMENT? space* declaration ( space? ';' space? declaration?)* ';'? S?)*) COMMENT* space* '}';
+  printcssquery = printcssrule S? '{' ( S? COMMENT* S? (pagequery| COMMENT|query_declaration) S*)* S? '}';
+  # Something that defines media
+  conditions = ('and'|'screen'|'or'|'only'|'not'|'amzn-mobi'|'amzn-kf8'|'amzn-mobi7'|',');
+  mediarule = '@media' space conditions ( space? conditions| space? mediaquery_selector )*;
+  mediaquery = mediarule space? '{' ( space? COMMENT* query_declaration)* S? '}';
+
+  simple_atrule = ("@charset"|"@namespace") space+ QUOTED_STRING space* ";";
+
+  import_rule = "@import" space+ ( QUOTED_STRING | URL ) space* ";";
+
+  # Final css definition
+  css_style = space* ( ( rule | simple_atrule | import_rule | mediaquery | printcssquery | COMMENT) space* )*;
+
+}%%
\ No newline at end of file
diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx
new file mode 100644
index 0000000..6d3f41e
--- /dev/null
+++ b/src/libserver/css/css_tokeniser.cxx
@@ -0,0 +1,862 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_tokeniser.hxx"
+#include "css_util.hxx"
+#include "css.hxx"
+#include "frozen/unordered_map.h"
+#include "frozen/string.h"
+#include <string>
+#include <cmath>
+
+namespace rspamd::css {
+
+/* Helpers to create tokens */
+
+/*
+ * This helper is intended to create tokens either with a tag and value
+ * or with just a tag.
+ */
+template<css_parser_token::token_type T, class Arg>
+auto make_token(const Arg &arg) -> css_parser_token;
+
+template<>
+auto make_token<css_parser_token::token_type::string_token, std::string_view>(const std::string_view &s)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::string_token, s};
+}
+
+template<>
+auto make_token<css_parser_token::token_type::ident_token, std::string_view>(const std::string_view &s)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::ident_token, s};
+}
+
+template<>
+auto make_token<css_parser_token::token_type::function_token, std::string_view>(const std::string_view &s)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::function_token, s};
+}
+
+template<>
+auto make_token<css_parser_token::token_type::url_token, std::string_view>(const std::string_view &s)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::url_token, s};
+}
+
+template<>
+auto make_token<css_parser_token::token_type::whitespace_token, std::string_view>(const std::string_view &s)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::whitespace_token, s};
+}
+
+template<>
+auto make_token<css_parser_token::token_type::delim_token, char>(const char &c)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::delim_token, c};
+}
+
+template<>
+auto make_token<css_parser_token::token_type::number_token, float>(const float &d)
+	-> css_parser_token
+{
+	return css_parser_token{css_parser_token::token_type::number_token, d};
+}
+
+/*
+ * Generic tokens with no value (non-terminals)
+ */
+template<css_parser_token::token_type T>
+auto make_token(void) -> css_parser_token
+{
+	return css_parser_token{T, css_parser_token_placeholder()};
+}
+
+/* True when c may start an identifier: a byte with the high bit set, an ASCII letter or `_` */
+static constexpr inline auto is_plain_ident_start(char c) -> bool
+{
+	if ((c & 0x80) || g_ascii_isalpha(c) || c == '_') {
+		return true;
+	}
+
+	return false;
+};
+
+/* True when c may continue an identifier: an ident start, `-` or an ASCII digit */
+static constexpr inline auto is_plain_ident(char c) -> bool
+{
+	if (is_plain_ident_start(c) || c == '-' || g_ascii_isdigit(c)) {
+		return true;
+	}
+
+	return false;
+};
+
+/* Dimension kind together with the multiplier applied to the numeric value */
+struct css_dimension_data {
+	css_parser_token::dim_type dtype;
+	double mult;
+};
+
+/*
+ * Maps from css dimensions to the multipliers that look reasonable in email
+ */
+constexpr const auto max_dims = static_cast<int>(css_parser_token::dim_type::dim_max);
+constexpr frozen::unordered_map<frozen::string, css_dimension_data, max_dims> dimensions_map{
+	{"px", {css_parser_token::dim_type::dim_px, 1.0}},
+	/* EM/REM are 16 px, so multiply and round */
+	{"em", {css_parser_token::dim_type::dim_em, 16.0}},
+	{"rem", {css_parser_token::dim_type::dim_rem, 16.0}},
+	/*
+	 * Represents the x-height of the element's font.
+	 * On fonts with the "x" letter, this is generally the height
+	 * of lowercase letters in the font; 1ex = 0.5em in many fonts.
+	 */
+	{"ex", {css_parser_token::dim_type::dim_ex, 8.0}},
+	/* Viewport units are spelled vw/vh in CSS (the enum members are named dim_wv/dim_wh) */
+	{"vw", {css_parser_token::dim_type::dim_wv, 8.0}},
+	{"vh", {css_parser_token::dim_type::dim_wh, 6.0}},
+	{"vmax", {css_parser_token::dim_type::dim_vmax, 8.0}},
+	{"vmin", {css_parser_token::dim_type::dim_vmin, 6.0}},
+	/* One point. 1pt = 1/72nd of 1in */
+	{"pt", {css_parser_token::dim_type::dim_pt, 96.0 / 72.0}},
+	/* 96px/2.54 */
+	{"cm", {css_parser_token::dim_type::dim_cm, 96.0 / 2.54}},
+	{"mm", {css_parser_token::dim_type::dim_mm, 9.60 / 2.54}},
+	{"in", {css_parser_token::dim_type::dim_in, 96.0}},
+	/* 1pc = 12pt = 1/6th of 1in. */
+	{"pc", {css_parser_token::dim_type::dim_pc, 96.0 / 6.0}}};
+
+/*
+ * Applies the dimension named by dim_token to this numeric token: the value is
+ * multiplied by the matching dimensions_map entry. Returns false when either
+ * token holds the wrong value type or the unit is unknown (the latter also
+ * sets flag_bad_dimension).
+ */
+auto css_parser_token::adjust_dim(const css_parser_token &dim_token) -> bool
+{
+	if (!std::holds_alternative<float>(value) ||
+		!std::holds_alternative<std::string_view>(dim_token.value)) {
+		/* Invalid tokens */
+		return false;
+	}
+
+	auto num = std::get<float>(value);
+	auto sv = std::get<std::string_view>(dim_token.value);
+
+	auto dim_found = find_map(dimensions_map, sv);
+
+	if (dim_found) {
+		auto dim_elt = dim_found.value().get();
+		dimension_type = dim_elt.dtype;
+		flags |= css_parser_token::number_dimension;
+		num *= dim_elt.mult;
+	}
+	else {
+		flags |= css_parser_token::flag_bad_dimension;
+
+		return false;
+	}
+
+	value = num;
+
+	return true;
+}
+
+
+/*
+ * Consume functions: return a token and advance lexer offset
+ */
+/*
+ * Consumes an ident-like token starting at offset. Depending on what follows,
+ * the result is an ident, a function or an url token; allow_number accepts
+ * digits and `-` right from the first character.
+ */
+auto css_tokeniser::consume_ident(bool allow_number) -> struct css_parser_token {
+	auto i = offset;
+	auto need_escape = false;
+	auto allow_middle_minus = false;
+
+	auto maybe_escape_sv = [&](auto cur_pos, auto tok_type) -> auto {
+		if (need_escape) {
+			auto escaped = rspamd::css::unescape_css(pool, {&input[offset],
+															cur_pos - offset});
+			offset = cur_pos;
+
+			return css_parser_token{tok_type, escaped};
+		}
+
+		auto result = std::string_view{&input[offset], cur_pos - offset};
+		offset = cur_pos;
+
+		return css_parser_token{tok_type, result};
+	};
+
+	/* Ident token can start from `-` or `--` */
+	if (input[i] == '-') {
+		i++;
+
+		if (i < input.size() && input[i] == '-') {
+			i++;
+			allow_middle_minus = true;
+		}
+	}
+
+	while (i < input.size()) {
+		auto c = input[i];
+
+		auto is_plain_c = (allow_number || allow_middle_minus) ? is_plain_ident(c) : is_plain_ident_start(c);
+		if (!is_plain_c) {
+			if (c == '\\' && i + 1 < input.size()) {
+				/* Escape token */
+				need_escape = true;
+				auto nhex = 0;
+
+				/* Need to find an escape end */
+				do {
+					c = input[++i];
+					if (g_ascii_isxdigit(c)) {
+						nhex++;
+
+						if (nhex > 6) {
+							/* End of the escape */
+							break;
+						}
+					}
+					else if (nhex > 0 && c == ' ') {
+						/* \[hex]{1,6} + one space, skipped by the common i++ below */
+						break;
+					}
+					else {
+						if (nhex > 0) {
+							/* Hex escape is over, do not swallow its terminator */
+							i--;
+						}
+						/* Otherwise: single \ + char */
+						break;
+					}
+				} while (i + 1 < input.size()); /* input[++i] must stay in bounds */
+			}
+			else if (c == '(') {
+				/* Function or url token */
+				auto j = i + 1;
+
+				while (j < input.size() && g_ascii_isspace(input[j])) {
+					j++;
+				}
+
+				/* Only an identifier that is exactly `url` introduces an url token */
+				if (i - offset == 3 && input.substr(offset, 3) == "url") {
+					if (j < input.size() && (input[j] == '"' || input[j] == '\'')) {
+						/* Function token */
+						auto ret = maybe_escape_sv(i,
+												   css_parser_token::token_type::function_token);
+						return ret;
+					}
+					else {
+						/* Consume URL token */
+						while (j < input.size() && input[j] != ')') {
+							j++;
+						}
+
+						if (j < input.size() && input[j] == ')') {
+							/* Valid url token */
+							auto ret = maybe_escape_sv(j + 1,
+													   css_parser_token::token_type::url_token);
+							return ret;
+						}
+						else {
+							/* Incomplete url token */
+							auto ret = maybe_escape_sv(j,
+													   css_parser_token::token_type::url_token);
+
+							ret.flags |= css_parser_token::flag_bad_string;
+							return ret;
+						}
+					}
+				}
+				else {
+					auto ret = maybe_escape_sv(i,
+											   css_parser_token::token_type::function_token);
+					return ret;
+				}
+			}
+			else if (c == '-' && allow_middle_minus) {
+				i++;
+				continue;
+			}
+			else {
+				break; /* Not an ident token */
+			}
+		} /* !plain ident */
+		else {
+			allow_middle_minus = true;
+		}
+
+		i++;
+	}
+
+	return maybe_escape_sv(i, css_parser_token::token_type::ident_token);
+}
+
+/*
+ * Consumes a numeric token starting at offset: optional sign, digits with at
+ * most one dot and an optional exponent. A trailing `%` or a known dimension
+ * ident is folded into the returned number token.
+ */
+auto
+css_tokeniser::consume_number() -> struct css_parser_token {
+	auto i = offset;
+	auto seen_dot = false, seen_exp = false;
+
+	if (input[i] == '-' || input[i] == '+') {
+		i++;
+	}
+	/* Check the bound first: the sign may have been the last character */
+	if (i < input.size() && input[i] == '.') {
+		seen_dot = true;
+		i++;
+	}
+
+	while (i < input.size()) {
+		auto c = input[i];
+
+		if (!g_ascii_isdigit(c)) {
+			if (c == '.') {
+				if (!seen_dot) {
+					seen_dot = true;
+				}
+				else {
+					break;
+				}
+			}
+			else if (c == 'e' || c == 'E') {
+				if (!seen_exp) {
+					seen_exp = true;
+					seen_dot = true; /* dots are not allowed after e */
+
+					if (i + 1 < input.size()) {
+						auto next_c = input[i + 1];
+						if (next_c == '+' || next_c == '-') {
+							i++;
+						}
+						else if (!g_ascii_isdigit(next_c)) {
+							/* Not an exponent */
+							break;
+						}
+					}
+					else {
+						/* Not an exponent */
+						break;
+					}
+				}
+				else {
+					break;
+				}
+			}
+			else {
+				break;
+			}
+		}
+
+		i++;
+	}
+
+	if (i > offset) {
+		/* I wish it was supported properly */
+		//auto conv_res = std::from_chars(&input[offset], &input[i], num);
+		char numbuf[128], *endptr = nullptr;
+		rspamd_strlcpy(numbuf, &input[offset], MIN(i - offset + 1, sizeof(numbuf)));
+		auto num = g_ascii_strtod(numbuf, &endptr);
+		offset = i;
+
+		if (fabs(num) >= G_MAXFLOAT || std::isnan(num)) {
+			msg_debug_css("invalid number: %s", numbuf);
+			return make_token<css_parser_token::token_type::delim_token>(input[i - 1]);
+		}
+		else {
+
+			auto ret = make_token<css_parser_token::token_type::number_token>(static_cast<float>(num));
+
+			if (i < input.size()) {
+				if (input[i] == '%') {
+					ret.flags |= css_parser_token::number_percent;
+					i++;
+
+					offset = i;
+				}
+				else if (is_plain_ident_start(input[i])) {
+					auto dim_token = consume_ident();
+
+					if (dim_token.type == css_parser_token::token_type::ident_token) {
+						if (!ret.adjust_dim(dim_token)) {
+							auto sv = std::get<std::string_view>(dim_token.value);
+							msg_debug_css("cannot apply dimension from the token %*s; number value = %.1f",
+										  (int) sv.size(), sv.begin(), num);
+							/* Unconsume ident */
+							offset = i;
+						}
+					}
+					else {
+						/* We have no option but to unconsume ident token in this case */
+						msg_debug_css("got invalid ident like token after number, unconsume it");
+						offset = i;
+					}
+				}
+				else {
+					/* Plain number, nothing to do */
+				}
+			}
+
+			return ret;
+		}
+	}
+	else {
+		msg_err_css("internal error: invalid number, empty token");
+		i++;
+	}
+
+	offset = i;
+	/* Should not happen */
+	return make_token<css_parser_token::token_type::delim_token>(input[i - 1]);
+}
+
+/*
+ * Main routine to produce lexer tokens
+ */
+auto
+css_tokeniser::next_token(void) -> struct css_parser_token {
+	/* Check pushback queue */
+	if (!backlog.empty()) {
+		auto tok = backlog.front();
+		backlog.pop_front();
+
+		return tok;
+	}
+	/* Helpers */
+
+	/*
+	 * This lambda eats comment handling nested comments;
+	 * offset is set to the next character after a comment (or eof)
+	 * Nothing is returned
+	 */
+	auto consume_comment = [this]() {
+		auto i = offset;
+		auto nested = 0;
+
+		if (input.empty()) {
+			/* Nothing to consume */
+			return;
+		}
+
+		/* We handle nested comments just because they can exist... */
+		while (i < input.size() - 1) {
+			auto c = input[i];
+			if (c == '*' && input[i + 1] == '/') {
+				if (nested == 0) {
+					offset = i + 2;
+					return;
+				}
+				else {
+					nested--;
+					i += 2;
+					continue;
+				}
+			}
+			else if (c == '/' && input[i + 1] == '*') {
+				nested++;
+				i += 2;
+				continue;
+			}
+
+			i++;
+		}
+
+		/* Unterminated comment: everything up to eof belongs to it */
+		offset = input.size();
+	};
+
+	/*
+	 * Consume quoted string, returns a string_view over a string, offset
+	 * is set one character after the string. Css unescaping is done automatically
+	 * Accepts a quote char to find end of string
+	 */
+	auto consume_string = [this](auto quote_char) -> auto {
+		auto i = offset;
+		bool need_unescape = false;
+
+		while (i < input.size()) {
+			auto c = input[i];
+
+			if (c == '\\') {
+				if (i + 1 < input.size()) {
+					need_unescape = true;
+				}
+				else {
+					/* \ at the end -> ignore */
+				}
+			}
+			else if (c == quote_char) {
+				/* End of string */
+				std::string_view res{&input[offset], i - offset};
+
+				if (need_unescape) {
+					res = rspamd::css::unescape_css(pool, res);
+				}
+
+				offset = i + 1;
+
+				return res;
+			}
+			else if (c == '\n') {
+				/* Should be a error, but we ignore it for now */
+			}
+
+			i++;
+		}
+
+		/* EOF with no quote character, consider it fine */
+		std::string_view res{&input[offset], i - offset};
+
+		if (need_unescape) {
+			res = rspamd::css::unescape_css(pool, res);
+		}
+
+		offset = i;
+
+		return res;
+	};
+
+	/* Main tokenisation loop */
+	for (auto i = offset; i < input.size(); ++i) {
+		auto c = input[i];
+
+		switch (c) {
+		case '/':
+			if (i + 1 < input.size() && input[i + 1] == '*') {
+				offset = i + 2;
+				consume_comment();   /* Consume comment and go forward */
+				return next_token(); /* Tail call */
+			}
+			else {
+				offset = i + 1;
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+			break;
+		case ' ':
+		case '\t':
+		case '\n':
+		case '\r':
+		case '\f': {
+			/* Consume as much space as we can */
+			while (i < input.size() && g_ascii_isspace(input[i])) {
+				i++;
+			}
+
+			auto ret = make_token<css_parser_token::token_type::whitespace_token>(
+				std::string_view(&input[offset], i - offset));
+			offset = i;
+			return ret;
+		}
+		case '"':
+		case '\'':
+			offset = i + 1;
+			if (offset < input.size()) {
+				return make_token<css_parser_token::token_type::string_token>(consume_string(c));
+			}
+			else {
+				/* Unpaired quote at the end of the rule */
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+		case '(':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::obrace_token>();
+		case ')':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::ebrace_token>();
+		case '[':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::osqbrace_token>();
+		case ']':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::esqbrace_token>();
+		case '{':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::ocurlbrace_token>();
+		case '}':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::ecurlbrace_token>();
+		case ',':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::comma_token>();
+		case ';':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::semicolon_token>();
+		case ':':
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::colon_token>();
+		case '<':
+			/* Maybe an xml like comment */
+			if (i + 3 < input.size() && input[i + 1] == '!' && input[i + 2] == '-' && input[i + 3] == '-') {
+				offset = i + 4; /* `<!--` is four characters long */
+
+				return make_token<css_parser_token::token_type::cdo_token>();
+			}
+			else {
+				offset = i + 1;
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+			break;
+		case '-':
+			if (i + 1 < input.size()) {
+				auto next_c = input[i + 1];
+
+				if (g_ascii_isdigit(next_c)) {
+					/* negative number */
+					return consume_number();
+				}
+				else if (next_c == '-') {
+					if (i + 2 < input.size() && input[i + 2] == '>') {
+						/* XML like comment */
+						offset = i + 3;
+
+						return make_token<css_parser_token::token_type::cdc_token>();
+					}
+				}
+			}
+			/* No other options, a delimiter - */
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::delim_token>(c);
+
+			break;
+		case '+':
+		case '.':
+			/* Maybe number */
+			if (i + 1 < input.size()) {
+				auto next_c = input[i + 1];
+
+				if (g_ascii_isdigit(next_c)) {
+					/* Numeric token */
+					return consume_number();
+				}
+				else {
+					offset = i + 1;
+					return make_token<css_parser_token::token_type::delim_token>(c);
+				}
+			}
+			/* No other options, a delimiter - */
+			offset = i + 1;
+			return make_token<css_parser_token::token_type::delim_token>(c);
+
+			break;
+		case '\\':
+			if (i + 1 < input.size()) {
+				if (input[i + 1] == '\n' || input[i + 1] == '\r') {
+					offset = i + 1;
+					return make_token<css_parser_token::token_type::delim_token>(c);
+				}
+				else {
+					/* Valid escape, assume ident */
+					return consume_ident();
+				}
+			}
+			else {
+				offset = i + 1;
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+			break;
+		case '@':
+			if (i + 3 < input.size()) {
+				if (is_plain_ident_start(input[i + 1]) &&
+					is_plain_ident(input[i + 2]) && is_plain_ident(input[i + 3])) {
+					offset = i + 1;
+					auto ident_token = consume_ident();
+
+					if (ident_token.type == css_parser_token::token_type::ident_token) {
+						/* Update type */
+						ident_token.type = css_parser_token::token_type::at_keyword_token;
+					}
+
+					return ident_token;
+				}
+				else {
+					offset = i + 1;
+					return make_token<css_parser_token::token_type::delim_token>(c);
+				}
+			}
+			else {
+				offset = i + 1;
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+			break;
+		case '#':
+			/* TODO: make it more conformant */
+			if (i + 2 < input.size()) {
+				auto next_c = input[i + 1], next_next_c = input[i + 2];
+				if ((is_plain_ident(next_c) || next_c == '-') &&
+					(is_plain_ident(next_next_c) || next_next_c == '-')) {
+					offset = i + 1;
+					/* We consume indent, but we allow numbers there */
+					auto ident_token = consume_ident(true);
+
+					if (ident_token.type == css_parser_token::token_type::ident_token) {
+						/* Update type */
+						ident_token.type = css_parser_token::token_type::hash_token;
+					}
+
+					return ident_token;
+				}
+				else {
+					offset = i + 1;
+					return make_token<css_parser_token::token_type::delim_token>(c);
+				}
+			}
+			else {
+				offset = i + 1;
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+			break;
+		default:
+			/* Generic parsing code */
+
+			if (g_ascii_isdigit(c)) {
+				return consume_number();
+			}
+			else if (is_plain_ident_start(c)) {
+				return consume_ident();
+			}
+			else {
+				offset = i + 1;
+				return make_token<css_parser_token::token_type::delim_token>(c);
+			}
+			break;
+		}
+	}
+
+	return make_token<css_parser_token::token_type::eof_token>();
+}
+
+/* Returns a static human readable name for the type of this token */
+constexpr auto
+css_parser_token::get_token_type() -> const char *
+{
+	const char *ret = "unknown";
+
+	switch (type) {
+	case token_type::whitespace_token:
+		ret = "whitespace";
+		break;
+	case token_type::ident_token:
+		ret = "ident";
+		break;
+	case token_type::function_token:
+		ret = "function";
+		break;
+	case token_type::at_keyword_token:
+		ret = "atkeyword";
+		break;
+	case token_type::hash_token:
+		ret = "hash";
+		break;
+	case token_type::string_token:
+		ret = "string";
+		break;
+	case token_type::number_token:
+		ret = "number";
+		break;
+	case token_type::url_token:
+		ret = "url";
+		break;
+	case token_type::cdo_token: /* xml open comment */
+		ret = "cdo";
+		break;
+	case token_type::cdc_token: /* xml close comment */
+		ret = "cdc";
+		break;
+	case token_type::delim_token:
+		ret = "delim";
+		break;
+	case token_type::obrace_token: /* ( */
+		ret = "obrace";
+		break;
+	case token_type::ebrace_token: /* ) */
+		ret = "ebrace";
+		break;
+	case token_type::osqbrace_token: /* [ */
+		ret = "osqbrace";
+		break;
+	case token_type::esqbrace_token: /* ] */
+		ret = "esqbrace";
+		break;
+	case token_type::ocurlbrace_token: /* { */
+		ret = "ocurlbrace";
+		break;
+	case token_type::ecurlbrace_token: /* } */
+		ret = "ecurlbrace";
+		break;
+	case token_type::comma_token:
+		ret = "comma";
+		break;
+	case token_type::colon_token:
+		ret = "colon";
+		break;
+	case token_type::semicolon_token:
+		ret = "semicolon";
+		break;
+	case token_type::eof_token:
+		ret = "eof";
+		break;
+	}
+
+	return ret;
+}
+
+
+/* Renders the token type, value, flags and dimension as a string for debug output */
+auto css_parser_token::debug_token_str() -> std::string
+{
+	const auto *token_type_str = get_token_type();
+	std::string ret = token_type_str;
+
+	std::visit([&](auto arg) -> auto {
+		using T = std::decay_t<decltype(arg)>;
+
+		if constexpr (std::is_same_v<T, std::string_view> || std::is_same_v<T, char>) {
+			ret += "; value=";
+			ret += arg;
+		}
+		else if constexpr (std::is_same_v<T, float>) { /* numbers are stored as float */
+			ret += "; value=";
+			ret += std::to_string(arg);
+		}
+	},
+			   value);
+
+	if ((flags & (~number_dimension)) != default_flags) {
+		ret += "; flags=" + std::to_string(flags);
+	}
+
+	if (flags & number_dimension) {
+		ret += "; dim=" + std::to_string(static_cast<int>(dimension_type));
+	}
+
+	return ret; /* Copy elision */
+}
+
+}// namespace rspamd::css
\ No newline at end of file diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx new file mode 100644 index 0000000..aa6a1a7 --- /dev/null +++ b/src/libserver/css/css_tokeniser.hxx @@ -0,0 +1,215 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_CSS_TOKENISER_HXX +#define RSPAMD_CSS_TOKENISER_HXX + +#include <string_view> +#include <utility> +#include <variant> +#include <list> +#include <functional> +#include <cstdint> +#include "mem_pool.h" + +namespace rspamd::css { + +struct css_parser_token_placeholder {}; /* For empty tokens */ + +struct css_parser_token { + + enum class token_type : std::uint8_t { + whitespace_token, + ident_token, + function_token, + at_keyword_token, + hash_token, + string_token, + number_token, + url_token, + cdo_token, /* xml open comment */ + cdc_token, /* xml close comment */ + delim_token, + obrace_token, /* ( */ + ebrace_token, /* ) */ + osqbrace_token, /* [ */ + esqbrace_token, /* ] */ + ocurlbrace_token, /* { */ + ecurlbrace_token, /* } */ + comma_token, + colon_token, + semicolon_token, + eof_token, + }; + + enum class dim_type : std::uint8_t { + dim_px = 0, + dim_em, + dim_rem, + dim_ex, + dim_wv, + dim_wh, + dim_vmax, + dim_vmin, + dim_pt, + dim_cm, + dim_mm, + dim_in, + dim_pc, + dim_max, + }; + + static const std::uint8_t default_flags = 0; + static const std::uint8_t flag_bad_string = (1u << 0u); + static 
const std::uint8_t number_dimension = (1u << 1u); + static const std::uint8_t number_percent = (1u << 2u); + static const std::uint8_t flag_bad_dimension = (1u << 3u); + + using value_type = std::variant<std::string_view, /* For strings and string like tokens */ + char, /* For delimiters (might need to move to unicode point) */ + float, /* For numeric stuff */ + css_parser_token_placeholder /* For general no token stuff */ + >; + + /* Typed storage */ + value_type value; + + int lineno; + + token_type type; + std::uint8_t flags = default_flags; + dim_type dimension_type; + + css_parser_token() = delete; + explicit css_parser_token(token_type type, const value_type &value) + : value(value), type(type) + { + } + css_parser_token(css_parser_token &&other) = default; + css_parser_token(const css_parser_token &token) = default; + auto operator=(css_parser_token &&other) -> css_parser_token & = default; + auto adjust_dim(const css_parser_token &dim_token) -> bool; + + auto get_string_or_default(const std::string_view &def) const -> std::string_view + { + if (std::holds_alternative<std::string_view>(value)) { + return std::get<std::string_view>(value); + } + else if (std::holds_alternative<char>(value)) { + return std::string_view(&std::get<char>(value), 1); + } + + return def; + } + + auto get_delim() const -> char + { + if (std::holds_alternative<char>(value)) { + return std::get<char>(value); + } + + return (char) -1; + } + + auto get_number_or_default(float def) const -> float + { + if (std::holds_alternative<float>(value)) { + auto dbl = std::get<float>(value); + + if (flags & css_parser_token::number_percent) { + dbl /= 100.0; + } + + return dbl; + } + + return def; + } + + auto get_normal_number_or_default(float def) const -> float + { + if (std::holds_alternative<float>(value)) { + auto dbl = std::get<float>(value); + + if (flags & css_parser_token::number_percent) { + dbl /= 100.0; + } + + if (dbl < 0) { + return 0.0; + } + else if (dbl > 1.0) { + return 1.0; + } 
+ + return dbl; + } + + return def; + } + + /* Debugging routines */ + constexpr auto get_token_type() -> const char *; + /* This function might be slow */ + auto debug_token_str() -> std::string; +}; + +static auto css_parser_eof_token(void) -> const css_parser_token & +{ + static css_parser_token eof_tok{ + css_parser_token::token_type::eof_token, + css_parser_token_placeholder()}; + + return eof_tok; +} + +/* Ensure that parser tokens are simple enough */ +/* + * compiler must implement P0602 "variant and optional should propagate copy/move triviality" + * This is broken on gcc < 8! + */ +static_assert(std::is_trivially_copyable_v<css_parser_token>); + +class css_tokeniser { +public: + css_tokeniser() = delete; + css_tokeniser(rspamd_mempool_t *pool, const std::string_view &sv) + : input(sv), offset(0), pool(pool) + { + } + + auto next_token(void) -> struct css_parser_token; + auto pushback_token(const struct css_parser_token &t) const -> void + { + backlog.push_back(t); + } + +private: + std::string_view input; + std::size_t offset; + rspamd_mempool_t *pool; + mutable std::list<css_parser_token> backlog; + + auto consume_number() -> struct css_parser_token; + auto consume_ident(bool allow_number = false) -> struct css_parser_token; +}; + +}// namespace rspamd::css + + +#endif//RSPAMD_CSS_TOKENISER_HXX diff --git a/src/libserver/css/css_util.cxx b/src/libserver/css/css_util.cxx new file mode 100644 index 0000000..07f8722 --- /dev/null +++ b/src/libserver/css/css_util.cxx @@ -0,0 +1,157 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_util.hxx" +#include "css.hxx" +#include <unicode/utf8.h> + +namespace rspamd::css { + +std::string_view unescape_css(rspamd_mempool_t *pool, + const std::string_view &sv) +{ + auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length())); + auto *d = nspace; + auto nleft = sv.length(); + + enum { + normal = 0, + quoted, + escape, + skip_spaces, + } state = normal; + + char quote_char, prev_c = 0; + auto escape_offset = 0, i = 0; + +#define MAYBE_CONSUME_CHAR(c) \ + do { \ + if ((c) == '"' || (c) == '\'') { \ + state = quoted; \ + quote_char = (c); \ + nleft--; \ + *d++ = (c); \ + } \ + else if ((c) == '\\') { \ + escape_offset = i; \ + state = escape; \ + } \ + else { \ + state = normal; \ + nleft--; \ + *d++ = g_ascii_tolower(c); \ + } \ + } while (0) + + for (const auto c: sv) { + if (nleft == 0) { + msg_err_css("cannot unescape css: truncated buffer of size %d", + (int) sv.length()); + break; + } + switch (state) { + case normal: + MAYBE_CONSUME_CHAR(c); + break; + case quoted: + if (c == quote_char) { + if (prev_c != '\\') { + state = normal; + } + } + prev_c = c; + nleft--; + *d++ = c; + break; + case escape: + if (!g_ascii_isxdigit(c)) { + if (i > escape_offset + 1) { + /* Try to decode an escape */ + const auto *escape_start = &sv[escape_offset + 1]; + unsigned long val; + + if (!rspamd_xstrtoul(escape_start, i - escape_offset - 1, &val)) { + msg_debug_css("invalid broken escape found at pos %d", + escape_offset); + } + else { + if (val < 0x80) { + /* Trivial case: ascii character */ + *d++ = 
(unsigned char) g_ascii_tolower(val); + nleft--; + } + else { + UChar32 uc = val; + auto off = 0; + UTF8_APPEND_CHAR_SAFE((uint8_t *) d, off, + sv.length(), u_tolower(uc)); + d += off; + nleft -= off; + } + } + } + else { + /* Empty escape, ignore it */ + msg_debug_css("invalid empty escape found at pos %d", + escape_offset); + } + + if (nleft <= 0) { + msg_err_css("cannot unescape css: truncated buffer of size %d", + (int) sv.length()); + } + else { + /* Escape is done, advance forward */ + if (g_ascii_isspace(c)) { + state = skip_spaces; + } + else { + MAYBE_CONSUME_CHAR(c); + } + } + } + break; + case skip_spaces: + if (!g_ascii_isspace(c)) { + MAYBE_CONSUME_CHAR(c); + } + /* Ignore spaces */ + break; + } + + i++; + } + + return std::string_view{nspace, sv.size() - nleft}; +} + +}// namespace rspamd::css + +/* C API */ +const gchar *rspamd_css_unescape(rspamd_mempool_t *pool, + const guchar *begin, + gsize len, + gsize *outlen) +{ + auto sv = rspamd::css::unescape_css(pool, {(const char *) begin, len}); + const auto *v = sv.begin(); + + if (outlen) { + *outlen = sv.size(); + } + + return v; +}
\ No newline at end of file diff --git a/src/libserver/css/css_util.hxx b/src/libserver/css/css_util.hxx new file mode 100644 index 0000000..4837a46 --- /dev/null +++ b/src/libserver/css/css_util.hxx @@ -0,0 +1,37 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_CSS_UTIL_HXX +#define RSPAMD_CSS_UTIL_HXX + +#include <string_view> +#include "mem_pool.h" + +namespace rspamd::css { + +/* + * Unescape css escapes + * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9 + * \0020AC : must be 6 digits long, no space needed (but can be included) + */ +std::string_view unescape_css(rspamd_mempool_t *pool, + const std::string_view &sv); + +}// namespace rspamd::css + +#endif//RSPAMD_CSS_UTIL_HXX diff --git a/src/libserver/css/css_value.cxx b/src/libserver/css/css_value.cxx new file mode 100644 index 0000000..2546e01 --- /dev/null +++ b/src/libserver/css/css_value.cxx @@ -0,0 +1,449 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_value.hxx" +#include "css_colors_list.hxx" +#include "frozen/unordered_map.h" +#include "frozen/string.h" +#include "libutil/util.h" +#include "contrib/ankerl/unordered_dense.h" +#include "fmt/core.h" + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +/* Helper for unit test stringification */ +namespace doctest { +template<> +struct StringMaker<rspamd::css::css_color> { + static String convert(const rspamd::css::css_color &value) + { + return fmt::format("r={};g={};b={};alpha={}", + value.r, value.g, value.b, value.alpha) + .c_str(); + } +}; + +}// namespace doctest + +namespace rspamd::css { + +auto css_value::maybe_color_from_string(const std::string_view &input) + -> std::optional<css_value> +{ + + if (input.size() > 1 && input.front() == '#') { + return css_value::maybe_color_from_hex(input.substr(1)); + } + else { + auto found_it = css_colors_map.find(input); + + if (found_it != css_colors_map.end()) { + return css_value{found_it->second}; + } + } + + return std::nullopt; +} + +constexpr static inline auto hexpair_decode(char c1, char c2) -> std::uint8_t +{ + std::uint8_t ret = 0; + + if (c1 >= '0' && c1 <= '9') ret = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') + ret = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') + ret = c1 - 'a' + 10; + + ret *= 16; + + if (c2 >= '0' && c2 <= '9') ret += c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') + ret += c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') + ret += c2 - 'a' + 10; + + return ret; +} + +auto css_value::maybe_color_from_hex(const 
std::string_view &input) + -> std::optional<css_value> +{ + if (input.length() == 6) { + /* Plain RGB */ + css_color col(hexpair_decode(input[0], input[1]), + hexpair_decode(input[2], input[3]), + hexpair_decode(input[4], input[5])); + return css_value(col); + } + else if (input.length() == 3) { + /* Rgb as 3 hex digests */ + css_color col(hexpair_decode(input[0], input[0]), + hexpair_decode(input[1], input[1]), + hexpair_decode(input[2], input[2])); + return css_value(col); + } + else if (input.length() == 8) { + /* RGBA */ + css_color col(hexpair_decode(input[0], input[1]), + hexpair_decode(input[2], input[3]), + hexpair_decode(input[4], input[5]), + hexpair_decode(input[6], input[7])); + return css_value(col); + } + + return std::nullopt; +} + +constexpr static inline auto rgb_color_component_convert(const css_parser_token &tok) + -> std::uint8_t +{ + std::uint8_t ret = 0; + + if (tok.type == css_parser_token::token_type::number_token) { + auto dbl = std::get<float>(tok.value); + + if (tok.flags & css_parser_token::number_percent) { + if (dbl > 100) { + dbl = 100; + } + else if (dbl < 0) { + dbl = 0; + } + ret = (std::uint8_t)(dbl / 100.0 * 255.0); + } + else { + if (dbl > 255) { + dbl = 255; + } + else if (dbl < 0) { + dbl = 0; + } + + ret = (std::uint8_t)(dbl); + } + } + + return ret; +} + +constexpr static inline auto alpha_component_convert(const css_parser_token &tok) + -> std::uint8_t +{ + double ret = 1.0; + + if (tok.type == css_parser_token::token_type::number_token) { + auto dbl = std::get<float>(tok.value); + + if (tok.flags & css_parser_token::number_percent) { + if (dbl > 100) { + dbl = 100; + } + else if (dbl < 0) { + dbl = 0; + } + ret = (dbl / 100.0); + } + else { + if (dbl > 1.0) { + dbl = 1.0; + } + else if (dbl < 0) { + dbl = 0; + } + + ret = dbl; + } + } + + return (std::uint8_t)(ret * 255.0); +} + +constexpr static inline auto h_component_convert(const css_parser_token &tok) + -> double +{ + double ret = 0.0; + + if (tok.type == 
css_parser_token::token_type::number_token) { + auto dbl = std::get<float>(tok.value); + + if (tok.flags & css_parser_token::number_percent) { + if (dbl > 100) { + dbl = 100; + } + else if (dbl < 0) { + dbl = 0; + } + ret = (dbl / 100.0); + } + else { + dbl = ((((int) dbl % 360) + 360) % 360); /* Deal with rotations */ + ret = dbl / 360.0; /* Normalize to 0..1 */ + } + } + + return ret; +} + +constexpr static inline auto sl_component_convert(const css_parser_token &tok) + -> double +{ + double ret = 0.0; + + if (tok.type == css_parser_token::token_type::number_token) { + ret = tok.get_normal_number_or_default(ret); + } + + return ret; +} + +static inline auto hsl_to_rgb(double h, double s, double l) + -> css_color +{ + css_color ret; + + constexpr auto hue2rgb = [](auto p, auto q, auto t) -> auto { + if (t < 0.0) { + t += 1.0; + } + if (t > 1.0) { + t -= 1.0; + } + if (t * 6. < 1.0) { + return p + (q - p) * 6.0 * t; + } + if (t * 2. < 1) { + return q; + } + if (t * 3. < 2.) { + return p + (q - p) * (2.0 / 3.0 - t) * 6.0; + } + return p; + }; + + if (s == 0) { + /* Achromatic */ + ret.r = l; + ret.g = l; + ret.b = l; + } + else { + auto q = l <= 0.5 ? 
l * (1.0 + s) : l + s - l * s; + auto p = 2.0 * l - q; + ret.r = (std::uint8_t)(hue2rgb(p, q, h + 1.0 / 3.0) * 255); + ret.g = (std::uint8_t)(hue2rgb(p, q, h) * 255); + ret.b = (std::uint8_t)(hue2rgb(p, q, h - 1.0 / 3.0) * 255); + } + + ret.alpha = 255; + + return ret; +} + +auto css_value::maybe_color_from_function(const css_consumed_block::css_function_block &func) + -> std::optional<css_value> +{ + + if (func.as_string() == "rgb" && func.args.size() == 3) { + css_color col{rgb_color_component_convert(func.args[0]->get_token_or_empty()), + rgb_color_component_convert(func.args[1]->get_token_or_empty()), + rgb_color_component_convert(func.args[2]->get_token_or_empty())}; + + return css_value(col); + } + else if (func.as_string() == "rgba" && func.args.size() == 4) { + css_color col{rgb_color_component_convert(func.args[0]->get_token_or_empty()), + rgb_color_component_convert(func.args[1]->get_token_or_empty()), + rgb_color_component_convert(func.args[2]->get_token_or_empty()), + alpha_component_convert(func.args[3]->get_token_or_empty())}; + + return css_value(col); + } + else if (func.as_string() == "hsl" && func.args.size() == 3) { + auto h = h_component_convert(func.args[0]->get_token_or_empty()); + auto s = sl_component_convert(func.args[1]->get_token_or_empty()); + auto l = sl_component_convert(func.args[2]->get_token_or_empty()); + + auto col = hsl_to_rgb(h, s, l); + + return css_value(col); + } + else if (func.as_string() == "hsla" && func.args.size() == 4) { + auto h = h_component_convert(func.args[0]->get_token_or_empty()); + auto s = sl_component_convert(func.args[1]->get_token_or_empty()); + auto l = sl_component_convert(func.args[2]->get_token_or_empty()); + + auto col = hsl_to_rgb(h, s, l); + col.alpha = alpha_component_convert(func.args[3]->get_token_or_empty()); + + return css_value(col); + } + + return std::nullopt; +} + +auto css_value::maybe_dimension_from_number(const css_parser_token &tok) + -> std::optional<css_value> +{ + if 
(std::holds_alternative<float>(tok.value)) { + auto dbl = std::get<float>(tok.value); + css_dimension dim; + + dim.dim = dbl; + + if (tok.flags & css_parser_token::number_percent) { + dim.is_percent = true; + } + else { + dim.is_percent = false; + } + + return css_value{dim}; + } + + return std::nullopt; +} + +constexpr const auto display_names_map = frozen::make_unordered_map<frozen::string, css_display_value>({ + {"hidden", css_display_value::DISPLAY_HIDDEN}, + {"none", css_display_value::DISPLAY_HIDDEN}, + {"inline", css_display_value::DISPLAY_INLINE}, + {"block", css_display_value::DISPLAY_BLOCK}, + {"content", css_display_value::DISPLAY_INLINE}, + {"flex", css_display_value::DISPLAY_BLOCK}, + {"grid", css_display_value::DISPLAY_BLOCK}, + {"inline-block", css_display_value::DISPLAY_INLINE}, + {"inline-flex", css_display_value::DISPLAY_INLINE}, + {"inline-grid", css_display_value::DISPLAY_INLINE}, + {"inline-table", css_display_value::DISPLAY_INLINE}, + {"list-item", css_display_value::DISPLAY_BLOCK}, + {"run-in", css_display_value::DISPLAY_INLINE}, + {"table", css_display_value::DISPLAY_BLOCK}, + {"table-caption", css_display_value::DISPLAY_TABLE_ROW}, + {"table-column-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-header-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-footer-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-row-group", css_display_value::DISPLAY_TABLE_ROW}, + {"table-cell", css_display_value::DISPLAY_TABLE_ROW}, + {"table-column", css_display_value::DISPLAY_TABLE_ROW}, + {"table-row", css_display_value::DISPLAY_TABLE_ROW}, + {"initial", css_display_value::DISPLAY_INLINE}, +}); + +auto css_value::maybe_display_from_string(const std::string_view &input) + -> std::optional<css_value> +{ + auto f = display_names_map.find(input); + + if (f != display_names_map.end()) { + return css_value{f->second}; + } + + return std::nullopt; +} + + +auto css_value::debug_str() const -> std::string +{ + std::string ret; + + 
std::visit([&](const auto &arg) { + using T = std::decay_t<decltype(arg)>; + + if constexpr (std::is_same_v<T, css_color>) { + ret += fmt::format("color: r={};g={};b={};alpha={}", + arg.r, arg.g, arg.b, arg.alpha); + } + else if constexpr (std::is_same_v<T, double>) { + ret += "size: " + std::to_string(arg); + } + else if constexpr (std::is_same_v<T, css_dimension>) { + ret += "dimension: " + std::to_string(arg.dim); + if (arg.is_percent) { + ret += "%"; + } + } + else if constexpr (std::is_same_v<T, css_display_value>) { + ret += "display: "; + switch (arg) { + case css_display_value::DISPLAY_HIDDEN: + ret += "hidden"; + break; + case css_display_value::DISPLAY_BLOCK: + ret += "block"; + break; + case css_display_value::DISPLAY_INLINE: + ret += "inline"; + break; + case css_display_value::DISPLAY_TABLE_ROW: + ret += "table_row"; + break; + } + } + else if constexpr (std::is_integral_v<T>) { + ret += "integral: " + std::to_string(static_cast<int>(arg)); + } + else { + ret += "nyi"; + } + }, + value); + + return ret; +} + +TEST_SUITE("css"){ + TEST_CASE("css hex colors"){ + const std::pair<const char *, css_color> hex_tests[] = { + {"000", css_color(0, 0, 0)}, + {"000000", css_color(0, 0, 0)}, + {"f00", css_color(255, 0, 0)}, + {"FEDCBA", css_color(254, 220, 186)}, + {"234", css_color(34, 51, 68)}, + }; + +for (const auto &p: hex_tests) { + SUBCASE((std::string("parse hex color: ") + p.first).c_str()) + { + auto col_parsed = css_value::maybe_color_from_hex(p.first); + //CHECK_UNARY(col_parsed); + //CHECK_UNARY(col_parsed.value().to_color()); + auto final_col = col_parsed.value().to_color().value(); + CHECK(final_col == p.second); + } +} +}// namespace rspamd::css +TEST_CASE("css colors strings") +{ + auto passed = 0; + for (const auto &p: css_colors_map) { + /* Match some of the colors selected randomly */ + if (rspamd_random_double_fast() > 0.9) { + auto col_parsed = css_value::maybe_color_from_string(p.first); + auto final_col = 
col_parsed.value().to_color().value(); + CHECK_MESSAGE(final_col == p.second, p.first.data()); + passed++; + + if (passed > 20) { + break; + } + } + } +} +} +; +} diff --git a/src/libserver/css/css_value.hxx b/src/libserver/css/css_value.hxx new file mode 100644 index 0000000..1d57421 --- /dev/null +++ b/src/libserver/css/css_value.hxx @@ -0,0 +1,174 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifndef RSPAMD_CSS_VALUE_HXX +#define RSPAMD_CSS_VALUE_HXX + +#include <string> +#include <variant> +#include <optional> +#include <vector> +#include <iosfwd> +#include "parse_error.hxx" +#include "css_parser.hxx" +#include "contrib/expected/expected.hpp" + +namespace rspamd::css { + +struct alignas(int) css_color { + std::uint8_t r; + std::uint8_t g; + std::uint8_t b; + + std::uint8_t alpha; + + css_color(std::uint8_t _r, std::uint8_t _g, std::uint8_t _b, std::uint8_t _alpha = 255) + : r(_r), g(_g), b(_b), alpha(_alpha) + { + } + css_color() = default; + constexpr auto to_number() const -> std::uint32_t + { + return (std::uint32_t) alpha << 24 | + (std::uint32_t) r << 16 | + (std::uint32_t) g << 8 | + (std::uint32_t) b << 0; + } + + constexpr auto to_rgb() const -> std::uint32_t + { + return (std::uint32_t) r << 16 | + (std::uint32_t) g << 8 | + (std::uint32_t) b << 0; + } + friend bool operator==(const css_color &l, const css_color &r) + { + return (memcmp(&l, &r, sizeof(css_color)) == 0); + } + 
+ static auto white() -> css_color + { + return css_color{255, 255, 255}; + } + static auto black() -> css_color + { + return css_color{0, 0, 0}; + } +}; + +struct css_dimension { + float dim; + bool is_percent; +}; + +/* + * Simple enum class for display stuff + */ +enum class css_display_value : std::uint8_t { + DISPLAY_INLINE, + DISPLAY_BLOCK, + DISPLAY_TABLE_ROW, + DISPLAY_HIDDEN +}; + +/* + * Value handler, uses std::variant instead of polymorphic classes for now + * for simplicity + */ +struct css_value { + std::variant<css_color, + float, + css_display_value, + css_dimension, + std::monostate> + value; + + css_value() + { + } + css_value(const css_color &color) + : value(color) + { + } + css_value(float num) + : value(num) + { + } + css_value(css_dimension dim) + : value(dim) + { + } + css_value(css_display_value d) + : value(d) + { + } + + auto to_color(void) const -> std::optional<css_color> + { + return extract_value_maybe<css_color>(); + } + + auto to_number(void) const -> std::optional<float> + { + return extract_value_maybe<float>(); + } + + auto to_dimension(void) const -> std::optional<css_dimension> + { + return extract_value_maybe<css_dimension>(); + } + + auto to_display(void) const -> std::optional<css_display_value> + { + return extract_value_maybe<css_display_value>(); + } + + auto is_valid(void) const -> bool + { + return !(std::holds_alternative<std::monostate>(value)); + } + + auto debug_str() const -> std::string; + + static auto maybe_color_from_string(const std::string_view &input) + -> std::optional<css_value>; + static auto maybe_color_from_hex(const std::string_view &input) + -> std::optional<css_value>; + static auto maybe_color_from_function(const css_consumed_block::css_function_block &func) + -> std::optional<css_value>; + static auto maybe_dimension_from_number(const css_parser_token &tok) + -> std::optional<css_value>; + static auto maybe_display_from_string(const std::string_view &input) + -> std::optional<css_value>; + 
+private: + template<typename T> + auto extract_value_maybe(void) const -> std::optional<T> + { + if (std::holds_alternative<T>(value)) { + return std::get<T>(value); + } + + return std::nullopt; + } +}; + +}// namespace rspamd::css + + +#endif//RSPAMD_CSS_VALUE_HXX diff --git a/src/libserver/css/parse_error.hxx b/src/libserver/css/parse_error.hxx new file mode 100644 index 0000000..22b76f0 --- /dev/null +++ b/src/libserver/css/parse_error.hxx @@ -0,0 +1,61 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#ifndef RSPAMD_PARSE_ERROR_HXX +#define RSPAMD_PARSE_ERROR_HXX + +#include <string> +#include <optional> + +namespace rspamd::css { + +/* + * Generic parser errors + */ +enum class css_parse_error_type { + PARSE_ERROR_UNKNOWN_OPTION, + PARSE_ERROR_INVALID_SYNTAX, + PARSE_ERROR_BAD_NESTING, + PARSE_ERROR_NYI, + PARSE_ERROR_UNKNOWN_ERROR, + /* All above is treated as fatal error in parsing */ + PARSE_ERROR_NO_ERROR, + PARSE_ERROR_EMPTY, +}; + +struct css_parse_error { + css_parse_error_type type = css_parse_error_type::PARSE_ERROR_UNKNOWN_ERROR; + std::optional<std::string> description; + + explicit css_parse_error(css_parse_error_type type, const std::string &description) + : type(type), description(description) + { + } + explicit css_parse_error(css_parse_error_type type = css_parse_error_type::PARSE_ERROR_NO_ERROR) + : type(type) + { + } + + constexpr auto is_fatal(void) const -> bool + { + return type < css_parse_error_type::PARSE_ERROR_NO_ERROR; + } +}; + +}// namespace rspamd::css +#endif//RSPAMD_PARSE_ERROR_HXX diff --git a/src/libserver/dkim.c b/src/libserver/dkim.c new file mode 100644 index 0000000..4318e87 --- /dev/null +++ b/src/libserver/dkim.c @@ -0,0 +1,3588 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "config.h" +#include "rspamd.h" +#include "message.h" +#include "dkim.h" +#include "dns.h" +#include "utlist.h" +#include "unix-std.h" +#include "mempool_vars_internal.h" + +#include <openssl/evp.h> +#include <openssl/rsa.h> +#include <openssl/engine.h> + +/* special DNS tokens */ +#define DKIM_DNSKEYNAME "_domainkey" + +/* ed25519 key lengths */ +#define ED25519_B64_BYTES 45 +#define ED25519_BYTES 32 + +/* Canonization methods */ +#define DKIM_CANON_UNKNOWN (-1) /* unknown method */ +#define DKIM_CANON_SIMPLE 0 /* as specified in DKIM spec */ +#define DKIM_CANON_RELAXED 1 /* as specified in DKIM spec */ + +#define DKIM_CANON_DEFAULT DKIM_CANON_SIMPLE + +#define RSPAMD_SHORT_BH_LEN 8 + +/* Params */ +enum rspamd_dkim_param_type { + DKIM_PARAM_UNKNOWN = -1, + DKIM_PARAM_SIGNATURE = 0, + DKIM_PARAM_SIGNALG, + DKIM_PARAM_DOMAIN, + DKIM_PARAM_CANONALG, + DKIM_PARAM_QUERYMETHOD, + DKIM_PARAM_SELECTOR, + DKIM_PARAM_HDRLIST, + DKIM_PARAM_VERSION, + DKIM_PARAM_IDENTITY, + DKIM_PARAM_TIMESTAMP, + DKIM_PARAM_EXPIRATION, + DKIM_PARAM_COPIEDHDRS, + DKIM_PARAM_BODYHASH, + DKIM_PARAM_BODYLENGTH, + DKIM_PARAM_IDX, + DKIM_PARAM_CV, + DKIM_PARAM_IGNORE +}; + +#define RSPAMD_DKIM_MAX_ARC_IDX 10 + +#define msg_err_dkim(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "dkim", ctx->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_dkim(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "dkim", ctx->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_dkim(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "dkim", ctx->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_dkim(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_dkim_log_id, "dkim", ctx->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_dkim_taskless(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_dkim_log_id, "dkim", "", \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(dkim) + +#define RSPAMD_DKIM_FLAG_OVERSIGN (1u << 0u) +#define RSPAMD_DKIM_FLAG_OVERSIGN_EXISTING (1u << 1u) + +union rspamd_dkim_header_stat { + struct _st { + guint16 count; + guint16 flags; + } s; + guint32 n; +}; + +struct rspamd_dkim_common_ctx { + rspamd_mempool_t *pool; + guint64 sig_hash; + gsize len; + GPtrArray *hlist; + GHashTable *htable; /* header -> count mapping */ + EVP_MD_CTX *headers_hash; + EVP_MD_CTX *body_hash; + enum rspamd_dkim_type type; + guint idx; + gint header_canon_type; + gint body_canon_type; + guint body_canonicalised; + guint headers_canonicalised; + gboolean is_sign; +}; + +enum rspamd_arc_seal_cv { + RSPAMD_ARC_UNKNOWN = 0, + RSPAMD_ARC_NONE, + RSPAMD_ARC_INVALID, + RSPAMD_ARC_FAIL, + RSPAMD_ARC_PASS +}; + + +struct rspamd_dkim_context_s { + struct rspamd_dkim_common_ctx common; + rspamd_mempool_t *pool; + struct rspamd_dns_resolver *resolver; + gsize blen; + gsize bhlen; + gint sig_alg; + guint ver; + time_t timestamp; + time_t expiration; + gchar *domain; + gchar *selector; + gint8 *b; + gchar *short_b; + gint8 *bh; + gchar *dns_key; + enum rspamd_arc_seal_cv cv; + const gchar *dkim_header; +}; + +#define RSPAMD_DKIM_KEY_ID_LEN 16 + +struct rspamd_dkim_key_s { + guint8 *keydata; + guint8 *raw_key; + gsize keylen; + gsize decoded_len; + gchar key_id[RSPAMD_DKIM_KEY_ID_LEN]; + union { + RSA *key_rsa; + EC_KEY *key_ecdsa; + guchar *key_eddsa; + } key; + BIO *key_bio; + EVP_PKEY *key_evp; + time_t mtime; + guint ttl; + enum rspamd_dkim_key_type type; + ref_entry_t ref; +}; + +struct rspamd_dkim_sign_context_s { + struct rspamd_dkim_common_ctx common; + rspamd_dkim_sign_key_t *key; +}; + +struct rspamd_dkim_header { + const gchar *name; + gint count; +}; + +/* Parser of dkim params */ +typedef gboolean (*dkim_parse_param_f)(rspamd_dkim_context_t *ctx, + const gchar *param, gsize len, GError **err); + 
+static gboolean rspamd_dkim_parse_signature(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_signalg(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_domain(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_canonalg(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_ignore(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_selector(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_hdrlist(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_version(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_timestamp(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_expiration(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_bodyhash(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_bodylength(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_idx(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); +static gboolean rspamd_dkim_parse_cv(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err); + + +static const dkim_parse_param_f parser_funcs[] = { + [DKIM_PARAM_SIGNATURE] = rspamd_dkim_parse_signature, + [DKIM_PARAM_SIGNALG] = rspamd_dkim_parse_signalg, + [DKIM_PARAM_DOMAIN] = rspamd_dkim_parse_domain, + [DKIM_PARAM_CANONALG] = rspamd_dkim_parse_canonalg, + 
[DKIM_PARAM_QUERYMETHOD] = rspamd_dkim_parse_ignore, + [DKIM_PARAM_SELECTOR] = rspamd_dkim_parse_selector, + [DKIM_PARAM_HDRLIST] = rspamd_dkim_parse_hdrlist, + [DKIM_PARAM_VERSION] = rspamd_dkim_parse_version, + [DKIM_PARAM_IDENTITY] = rspamd_dkim_parse_ignore, + [DKIM_PARAM_TIMESTAMP] = rspamd_dkim_parse_timestamp, + [DKIM_PARAM_EXPIRATION] = rspamd_dkim_parse_expiration, + [DKIM_PARAM_COPIEDHDRS] = rspamd_dkim_parse_ignore, + [DKIM_PARAM_BODYHASH] = rspamd_dkim_parse_bodyhash, + [DKIM_PARAM_BODYLENGTH] = rspamd_dkim_parse_bodylength, + [DKIM_PARAM_IDX] = rspamd_dkim_parse_idx, + [DKIM_PARAM_CV] = rspamd_dkim_parse_cv, + [DKIM_PARAM_IGNORE] = rspamd_dkim_parse_ignore, +}; + +#define DKIM_ERROR dkim_error_quark() +GQuark +dkim_error_quark(void) +{ + return g_quark_from_static_string("dkim-error-quark"); +} + +/* Parsers implementation */ +static gboolean +rspamd_dkim_parse_signature(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + ctx->b = rspamd_mempool_alloc0(ctx->pool, len); + ctx->short_b = rspamd_mempool_alloc0(ctx->pool, RSPAMD_SHORT_BH_LEN + 1); + rspamd_strlcpy(ctx->short_b, param, MIN(len, RSPAMD_SHORT_BH_LEN + 1)); + (void) rspamd_cryptobox_base64_decode(param, len, ctx->b, &ctx->blen); + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_signalg(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + /* XXX: ugly size comparison, improve this code style some day */ + if (len == 8) { + if (memcmp(param, "rsa-sha1", len) == 0) { + ctx->sig_alg = DKIM_SIGN_RSASHA1; + return TRUE; + } + } + else if (len == 10) { + if (memcmp(param, "rsa-sha256", len) == 0) { + ctx->sig_alg = DKIM_SIGN_RSASHA256; + return TRUE; + } + else if (memcmp(param, "rsa-sha512", len) == 0) { + ctx->sig_alg = DKIM_SIGN_RSASHA512; + return TRUE; + } + } + else if (len == 15) { + if (memcmp(param, "ecdsa256-sha256", len) == 0) { + ctx->sig_alg = DKIM_SIGN_ECDSASHA256; + return TRUE; + } + else if (memcmp(param, 
"ecdsa256-sha512", len) == 0) { + ctx->sig_alg = DKIM_SIGN_ECDSASHA512; + return TRUE; + } + } + else if (len == 14) { + if (memcmp(param, "ed25519-sha256", len) == 0) { + ctx->sig_alg = DKIM_SIGN_EDDSASHA256; + return TRUE; + } + } + + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_A, + "invalid dkim sign algorithm"); + return FALSE; +} + +static gboolean +rspamd_dkim_parse_domain(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + if (!rspamd_str_has_8bit(param, len)) { + ctx->domain = rspamd_mempool_alloc(ctx->pool, len + 1); + rspamd_strlcpy(ctx->domain, param, len + 1); + } + else { + ctx->domain = rspamd_dns_resolver_idna_convert_utf8(ctx->resolver, + ctx->pool, param, len, NULL); + + if (!ctx->domain) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_H, + "invalid dkim domain tag %.*s: idna failed", + (int) len, param); + + return FALSE; + } + } + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_canonalg(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + const gchar *p, *slash = NULL, *end = param + len; + gsize sl = 0; + + p = param; + while (p != end) { + if (*p == '/') { + slash = p; + break; + } + p++; + sl++; + } + + if (slash == NULL) { + /* Only check header */ + if (len == 6 && memcmp(param, "simple", len) == 0) { + ctx->common.header_canon_type = DKIM_CANON_SIMPLE; + return TRUE; + } + else if (len == 7 && memcmp(param, "relaxed", len) == 0) { + ctx->common.header_canon_type = DKIM_CANON_RELAXED; + return TRUE; + } + } + else { + /* First check header */ + if (sl == 6 && memcmp(param, "simple", sl) == 0) { + ctx->common.header_canon_type = DKIM_CANON_SIMPLE; + } + else if (sl == 7 && memcmp(param, "relaxed", sl) == 0) { + ctx->common.header_canon_type = DKIM_CANON_RELAXED; + } + else { + goto err; + } + /* Check body */ + len -= sl + 1; + slash++; + if (len == 6 && memcmp(slash, "simple", len) == 0) { + ctx->common.body_canon_type = DKIM_CANON_SIMPLE; + return 
TRUE; + } + else if (len == 7 && memcmp(slash, "relaxed", len) == 0) { + ctx->common.body_canon_type = DKIM_CANON_RELAXED; + return TRUE; + } + } + +err: + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_A, + "invalid dkim canonization algorithm"); + return FALSE; +} + +static gboolean +rspamd_dkim_parse_ignore(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + /* Just ignore unused params */ + return TRUE; +} + +static gboolean +rspamd_dkim_parse_selector(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + + if (!rspamd_str_has_8bit(param, len)) { + ctx->selector = rspamd_mempool_alloc(ctx->pool, len + 1); + rspamd_strlcpy(ctx->selector, param, len + 1); + } + else { + ctx->selector = rspamd_dns_resolver_idna_convert_utf8(ctx->resolver, + ctx->pool, param, len, NULL); + + if (!ctx->selector) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_H, + "invalid dkim selector tag %.*s: idna failed", + (int) len, param); + + return FALSE; + } + } + + return TRUE; +} + +static void +rspamd_dkim_hlist_free(void *ud) +{ + GPtrArray *a = ud; + + g_ptr_array_free(a, TRUE); +} + +static gboolean +rspamd_dkim_parse_hdrlist_common(struct rspamd_dkim_common_ctx *ctx, + const gchar *param, + gsize len, + gboolean sign, + GError **err) +{ + const gchar *c, *p, *end = param + len; + gchar *h; + gboolean from_found = FALSE, oversign, existing; + guint count = 0; + struct rspamd_dkim_header *new; + gpointer found; + union rspamd_dkim_header_stat u; + + p = param; + while (p <= end) { + if ((p == end || *p == ':')) { + count++; + } + p++; + } + + if (count > 0) { + ctx->hlist = g_ptr_array_sized_new(count); + } + else { + return FALSE; + } + + c = param; + p = param; + ctx->htable = g_hash_table_new(rspamd_strcase_hash, rspamd_strcase_equal); + + while (p <= end) { + if ((p == end || *p == ':') && p - c > 0) { + oversign = FALSE; + existing = FALSE; + h = rspamd_mempool_alloc(ctx->pool, p - c + 1); + 
rspamd_strlcpy(h, c, p - c + 1); + + g_strstrip(h); + + if (sign) { + if (rspamd_lc_cmp(h, "(o)", 3) == 0) { + oversign = TRUE; + h += 3; + msg_debug_dkim("oversign header: %s", h); + } + else if (rspamd_lc_cmp(h, "(x)", 3) == 0) { + oversign = TRUE; + existing = TRUE; + h += 3; + msg_debug_dkim("oversign existing header: %s", h); + } + } + + /* Check mandatory from */ + if (!from_found && g_ascii_strcasecmp(h, "from") == 0) { + from_found = TRUE; + } + + new = rspamd_mempool_alloc(ctx->pool, + sizeof(struct rspamd_dkim_header)); + new->name = h; + new->count = 0; + u.n = 0; + + g_ptr_array_add(ctx->hlist, new); + found = g_hash_table_lookup(ctx->htable, h); + + if (oversign) { + if (found) { + msg_err_dkim("specified oversigned header more than once: %s", + h); + } + + u.s.flags |= RSPAMD_DKIM_FLAG_OVERSIGN; + + if (existing) { + u.s.flags |= RSPAMD_DKIM_FLAG_OVERSIGN_EXISTING; + } + + u.s.count = 0; + } + else { + if (found != NULL) { + u.n = GPOINTER_TO_UINT(found); + new->count = u.s.count; + u.s.count++; + } + else { + /* Insert new header order to the list */ + u.s.count = new->count + 1; + } + } + + g_hash_table_insert(ctx->htable, h, GUINT_TO_POINTER(u.n)); + + c = p + 1; + p++; + } + else { + p++; + } + } + + if (!ctx->hlist) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_H, + "invalid dkim header list"); + return FALSE; + } + else { + if (!from_found) { + g_ptr_array_free(ctx->hlist, TRUE); + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_H, + "invalid dkim header list, from header is missing"); + return FALSE; + } + + rspamd_mempool_add_destructor(ctx->pool, + (rspamd_mempool_destruct_t) rspamd_dkim_hlist_free, + ctx->hlist); + rspamd_mempool_add_destructor(ctx->pool, + (rspamd_mempool_destruct_t) g_hash_table_unref, + ctx->htable); + } + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_hdrlist(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + return 
rspamd_dkim_parse_hdrlist_common(&ctx->common, param, len, FALSE, err); +} + +static gboolean +rspamd_dkim_parse_version(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + if (len != 1 || *param != '1') { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_VERSION, + "invalid dkim version"); + return FALSE; + } + + ctx->ver = 1; + return TRUE; +} + +static gboolean +rspamd_dkim_parse_timestamp(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + gulong val; + + if (!rspamd_strtoul(param, len, &val)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "invalid dkim timestamp"); + return FALSE; + } + ctx->timestamp = val; + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_expiration(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + gulong val; + + if (!rspamd_strtoul(param, len, &val)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "invalid dkim expiration"); + return FALSE; + } + ctx->expiration = val; + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_bodyhash(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + ctx->bh = rspamd_mempool_alloc0(ctx->pool, len); + (void) rspamd_cryptobox_base64_decode(param, len, ctx->bh, &ctx->bhlen); + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_bodylength(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + gulong val; + + if (!rspamd_strtoul(param, len, &val)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_L, + "invalid dkim body length"); + return FALSE; + } + ctx->common.len = val; + + return TRUE; +} + +static gboolean +rspamd_dkim_parse_idx(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + gulong val; + + if (!rspamd_strtoul(param, len, &val)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_L, + "invalid ARC idx"); + return FALSE; + } + ctx->common.idx = val; + + 
return TRUE; +} + +static gboolean +rspamd_dkim_parse_cv(rspamd_dkim_context_t *ctx, + const gchar *param, + gsize len, + GError **err) +{ + + /* Only check header */ + if (len == 4 && memcmp(param, "fail", len) == 0) { + ctx->cv = RSPAMD_ARC_FAIL; + return TRUE; + } + else if (len == 4 && memcmp(param, "pass", len) == 0) { + ctx->cv = RSPAMD_ARC_PASS; + return TRUE; + } + else if (len == 4 && memcmp(param, "none", len) == 0) { + ctx->cv = RSPAMD_ARC_NONE; + return TRUE; + } + else if (len == 7 && memcmp(param, "invalid", len) == 0) { + ctx->cv = RSPAMD_ARC_INVALID; + return TRUE; + } + + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "invalid arc seal verification result"); + + return FALSE; +} + + +static void +rspamd_dkim_add_arc_seal_headers(rspamd_mempool_t *pool, + struct rspamd_dkim_common_ctx *ctx) +{ + struct rspamd_dkim_header *hdr; + gint count = ctx->idx, i; + + ctx->hlist = g_ptr_array_sized_new(count * 3 - 1); + + for (i = 0; i < count; i++) { + /* Authentication results */ + hdr = rspamd_mempool_alloc(pool, sizeof(*hdr)); + hdr->name = RSPAMD_DKIM_ARC_AUTHHEADER; + hdr->count = -(i + 1); + g_ptr_array_add(ctx->hlist, hdr); + + /* Arc signature */ + hdr = rspamd_mempool_alloc(pool, sizeof(*hdr)); + hdr->name = RSPAMD_DKIM_ARC_SIGNHEADER; + hdr->count = -(i + 1); + g_ptr_array_add(ctx->hlist, hdr); + + /* Arc seal (except last one) */ + if (i != count - 1) { + hdr = rspamd_mempool_alloc(pool, sizeof(*hdr)); + hdr->name = RSPAMD_DKIM_ARC_SEALHEADER; + hdr->count = -(i + 1); + g_ptr_array_add(ctx->hlist, hdr); + } + } + + rspamd_mempool_add_destructor(ctx->pool, + (rspamd_mempool_destruct_t) rspamd_dkim_hlist_free, + ctx->hlist); +} + +/** + * Create new dkim context from signature + * @param sig message's signature + * @param pool pool to allocate memory from + * @param err pointer to error object + * @return new context or NULL + */ +rspamd_dkim_context_t * +rspamd_create_dkim_context(const gchar *sig, + rspamd_mempool_t *pool, + struct 
rspamd_dns_resolver *resolver, + guint time_jitter, + enum rspamd_dkim_type type, + GError **err) +{ + const gchar *p, *c, *tag = NULL, *end; + gint taglen; + gint param = DKIM_PARAM_UNKNOWN; + const EVP_MD *md_alg; + time_t now; + rspamd_dkim_context_t *ctx; + enum { + DKIM_STATE_TAG = 0, + DKIM_STATE_AFTER_TAG, + DKIM_STATE_VALUE, + DKIM_STATE_SKIP_SPACES = 99, + DKIM_STATE_ERROR = 100 + } state, + next_state; + + + if (sig == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_B, + "empty signature"); + return NULL; + } + + ctx = rspamd_mempool_alloc0(pool, sizeof(rspamd_dkim_context_t)); + ctx->pool = pool; + ctx->resolver = resolver; + + if (type == RSPAMD_DKIM_ARC_SEAL) { + ctx->common.header_canon_type = DKIM_CANON_RELAXED; + ctx->common.body_canon_type = DKIM_CANON_RELAXED; + } + else { + ctx->common.header_canon_type = DKIM_CANON_DEFAULT; + ctx->common.body_canon_type = DKIM_CANON_DEFAULT; + } + + ctx->sig_alg = DKIM_SIGN_UNKNOWN; + ctx->common.pool = pool; + ctx->common.type = type; + /* A simple state machine of parsing tags */ + state = DKIM_STATE_SKIP_SPACES; + next_state = DKIM_STATE_TAG; + taglen = 0; + p = sig; + c = sig; + end = p + strlen(p); + ctx->common.sig_hash = rspamd_cryptobox_fast_hash(sig, end - sig, + rspamd_hash_seed()); + + msg_debug_dkim("create dkim context sig = %L", ctx->common.sig_hash); + + while (p <= end) { + switch (state) { + case DKIM_STATE_TAG: + if (g_ascii_isspace(*p)) { + taglen = (int) (p - c); + while (*p && g_ascii_isspace(*p)) { + /* Skip spaces before '=' sign */ + p++; + } + if (*p != '=') { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "invalid dkim param"); + state = DKIM_STATE_ERROR; + } + else { + state = DKIM_STATE_SKIP_SPACES; + next_state = DKIM_STATE_AFTER_TAG; + param = DKIM_PARAM_UNKNOWN; + p++; + tag = c; + } + } + else if (*p == '=') { + state = DKIM_STATE_SKIP_SPACES; + next_state = DKIM_STATE_AFTER_TAG; + param = DKIM_PARAM_UNKNOWN; + p++; + tag = c; + } + else { + taglen++; + 
+ if (taglen > G_MAXINT8) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "too long dkim tag"); + state = DKIM_STATE_ERROR; + } + else { + p++; + } + } + break; + case DKIM_STATE_AFTER_TAG: + /* We got tag at tag and len at taglen */ + switch (taglen) { + case 0: + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "zero length dkim param"); + state = DKIM_STATE_ERROR; + break; + case 1: + /* 1 character tags */ + switch (*tag) { + case 'v': + if (type == RSPAMD_DKIM_NORMAL) { + param = DKIM_PARAM_VERSION; + } + else { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "invalid ARC v param"); + state = DKIM_STATE_ERROR; + break; + } + break; + case 'a': + param = DKIM_PARAM_SIGNALG; + break; + case 'b': + param = DKIM_PARAM_SIGNATURE; + break; + case 'c': + param = DKIM_PARAM_CANONALG; + break; + case 'd': + param = DKIM_PARAM_DOMAIN; + break; + case 'h': + if (type == RSPAMD_DKIM_ARC_SEAL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "ARC seal must NOT have h= tag"); + state = DKIM_STATE_ERROR; + break; + } + else { + param = DKIM_PARAM_HDRLIST; + } + break; + case 'i': + if (type == RSPAMD_DKIM_NORMAL) { + param = DKIM_PARAM_IDENTITY; + } + else { + param = DKIM_PARAM_IDX; + } + break; + case 'l': + param = DKIM_PARAM_BODYLENGTH; + break; + case 'q': + param = DKIM_PARAM_QUERYMETHOD; + break; + case 's': + param = DKIM_PARAM_SELECTOR; + break; + case 't': + param = DKIM_PARAM_TIMESTAMP; + break; + case 'x': + param = DKIM_PARAM_EXPIRATION; + break; + case 'z': + param = DKIM_PARAM_COPIEDHDRS; + break; + case 'r': + param = DKIM_PARAM_IGNORE; + break; + default: + param = DKIM_PARAM_UNKNOWN; + msg_debug_dkim("unknown DKIM param %c, ignoring it", *tag); + break; + } + break; + case 2: + /* Two characters tags, e.g. 
`bh` */ + if (tag[0] == 'b' && tag[1] == 'h') { + if (type == RSPAMD_DKIM_ARC_SEAL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "ARC seal must NOT have bh= tag"); + state = DKIM_STATE_ERROR; + } + else { + param = DKIM_PARAM_BODYHASH; + } + } + else if (tag[0] == 'c' && tag[1] == 'v') { + if (type != RSPAMD_DKIM_ARC_SEAL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "cv tag is valid for ARC-Seal only"); + state = DKIM_STATE_ERROR; + } + else { + param = DKIM_PARAM_CV; + } + } + else { + param = DKIM_PARAM_UNKNOWN; + msg_debug_dkim("unknown DKIM param %*s, ignoring it", taglen, tag); + } + break; + default: + /* Long and unknown (yet) DKIM tag */ + param = DKIM_PARAM_UNKNOWN; + msg_debug_dkim("unknown DKIM param %*s, ignoring it", taglen, tag); + break; + } + + if (state != DKIM_STATE_ERROR) { + /* Skip spaces */ + state = DKIM_STATE_SKIP_SPACES; + next_state = DKIM_STATE_VALUE; + } + break; + case DKIM_STATE_VALUE: + if (*p == ';') { + if (p - c == 0 || c > p) { + state = DKIM_STATE_ERROR; + } + else { + /* Cut trailing spaces for value */ + gint tlen = p - c; + const gchar *tmp = p - 1; + + while (tlen > 0) { + if (!g_ascii_isspace(*tmp)) { + break; + } + tlen--; + tmp--; + } + + if (param != DKIM_PARAM_UNKNOWN) { + if (!parser_funcs[param](ctx, c, tlen, err)) { + state = DKIM_STATE_ERROR; + } + else { + state = DKIM_STATE_SKIP_SPACES; + next_state = DKIM_STATE_TAG; + p++; + taglen = 0; + } + } + else { + /* Unknown param has been ignored */ + msg_debug_dkim("ignored unknown tag parameter value: %*s = %*s", + taglen, tag, tlen, c); + state = DKIM_STATE_SKIP_SPACES; + next_state = DKIM_STATE_TAG; + p++; + taglen = 0; + } + } + } + else if (p == end) { + /* Last parameter with no `;` character */ + gint tlen = p - c; + const gchar *tmp = p - 1; + + while (tlen > 0) { + if (!g_ascii_isspace(*tmp)) { + break; + } + tlen--; + tmp--; + } + + if (param != DKIM_PARAM_UNKNOWN) { + if (!parser_funcs[param](ctx, c, tlen, err)) { + state = 
DKIM_STATE_ERROR; + } + } + else { + msg_debug_dkim("ignored unknown tag parameter value: %*s: %*s", + taglen, tag, tlen, c); + } + + if (state == DKIM_STATE_ERROR) { + /* + * We need to return from here as state machine won't + * do any more steps after p == end + */ + if (err) { + msg_info_dkim("dkim parse failed: %e", *err); + } + + return NULL; + } + /* Finish processing */ + p++; + } + else { + p++; + } + break; + case DKIM_STATE_SKIP_SPACES: + if (g_ascii_isspace(*p)) { + p++; + } + else { + c = p; + state = next_state; + } + break; + case DKIM_STATE_ERROR: + if (err && *err) { + msg_info_dkim("dkim parse failed: %s", (*err)->message); + return NULL; + } + else { + msg_info_dkim("dkim parse failed: unknown error when parsing %c tag", + tag ? *tag : '?'); + return NULL; + } + break; + } + } + + if (type == RSPAMD_DKIM_ARC_SEAL) { + rspamd_dkim_add_arc_seal_headers(pool, &ctx->common); + } + + /* Now check validity of signature */ + if (ctx->b == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_B, + "b parameter missing"); + return NULL; + } + if (ctx->common.type != RSPAMD_DKIM_ARC_SEAL && ctx->bh == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_BH, + "bh parameter missing"); + return NULL; + } + if (ctx->domain == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_D, + "domain parameter missing"); + return NULL; + } + if (ctx->selector == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_S, + "selector parameter missing"); + return NULL; + } + if (ctx->common.type == RSPAMD_DKIM_NORMAL && ctx->ver == 0) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_V, + "v parameter missing"); + return NULL; + } + if (ctx->common.hlist == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_H, + "h parameter missing"); + return NULL; + } + if (ctx->sig_alg == DKIM_SIGN_UNKNOWN) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EMPTY_S, + "s parameter missing"); + return NULL; + } + + if (type != 
RSPAMD_DKIM_ARC_SEAL) { + if (ctx->sig_alg == DKIM_SIGN_RSASHA1) { + /* Check bh length */ + if (ctx->bhlen != (guint) EVP_MD_size(EVP_sha1())) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_BADSIG, + "signature has incorrect length: %zu", + ctx->bhlen); + return NULL; + } + } + else if (ctx->sig_alg == DKIM_SIGN_RSASHA256 || + ctx->sig_alg == DKIM_SIGN_ECDSASHA256) { + if (ctx->bhlen != + (guint) EVP_MD_size(EVP_sha256())) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_BADSIG, + "signature has incorrect length: %zu", + ctx->bhlen); + return NULL; + } + } + else if (ctx->sig_alg == DKIM_SIGN_RSASHA512 || + ctx->sig_alg == DKIM_SIGN_ECDSASHA512) { + if (ctx->bhlen != + (guint) EVP_MD_size(EVP_sha512())) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_BADSIG, + "signature has incorrect length: %zu", + ctx->bhlen); + return NULL; + } + } + } + + /* Check expiration */ + now = time(NULL); + if (ctx->timestamp && now < ctx->timestamp && ctx->timestamp - now > (gint) time_jitter) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_FUTURE, + "signature was made in future, ignoring"); + return NULL; + } + if (ctx->expiration && ctx->expiration < now) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_EXPIRED, + "signature has expired"); + return NULL; + } + + if (ctx->common.type != RSPAMD_DKIM_NORMAL && (ctx->common.idx == 0 || + ctx->common.idx > RSPAMD_DKIM_MAX_ARC_IDX)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "i parameter missing or invalid for ARC"); + return NULL; + } + + if (ctx->common.type == RSPAMD_DKIM_ARC_SEAL) { + if (ctx->cv == RSPAMD_ARC_UNKNOWN) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_UNKNOWN, + "cv parameter missing or invalid for ARC"); + return NULL; + } + } + + /* Now create dns key to request further */ + gsize dnslen = strlen(ctx->domain) + strlen(ctx->selector) + + sizeof(DKIM_DNSKEYNAME) + 2; + ctx->dns_key = rspamd_mempool_alloc(ctx->pool, dnslen); + rspamd_snprintf(ctx->dns_key, + dnslen, + "%s.%s.%s", + 
ctx->selector, + DKIM_DNSKEYNAME, + ctx->domain); + + /* Create checksums for further operations */ + if (ctx->sig_alg == DKIM_SIGN_RSASHA1) { + md_alg = EVP_sha1(); + } + else if (ctx->sig_alg == DKIM_SIGN_RSASHA256 || + ctx->sig_alg == DKIM_SIGN_ECDSASHA256 || + ctx->sig_alg == DKIM_SIGN_EDDSASHA256) { + md_alg = EVP_sha256(); + } + else if (ctx->sig_alg == DKIM_SIGN_RSASHA512 || + ctx->sig_alg == DKIM_SIGN_ECDSASHA512) { + md_alg = EVP_sha512(); + } + else { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_BADSIG, + "signature has unsupported signature algorithm"); + + return NULL; + } +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + ctx->common.body_hash = EVP_MD_CTX_create(); + EVP_DigestInit_ex(ctx->common.body_hash, md_alg, NULL); + ctx->common.headers_hash = EVP_MD_CTX_create(); + EVP_DigestInit_ex(ctx->common.headers_hash, md_alg, NULL); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_destroy, ctx->common.body_hash); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_destroy, ctx->common.headers_hash); +#else + ctx->common.body_hash = EVP_MD_CTX_new(); + EVP_DigestInit_ex(ctx->common.body_hash, md_alg, NULL); + ctx->common.headers_hash = EVP_MD_CTX_new(); + EVP_DigestInit_ex(ctx->common.headers_hash, md_alg, NULL); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_free, ctx->common.body_hash); + rspamd_mempool_add_destructor(pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_free, ctx->common.headers_hash); +#endif + ctx->dkim_header = sig; + + return ctx; +} + +struct rspamd_dkim_key_cbdata { + rspamd_dkim_context_t *ctx; + dkim_key_handler_f handler; + gpointer ud; +}; + +rspamd_dkim_key_t * +rspamd_dkim_make_key(const gchar *keydata, + guint keylen, enum rspamd_dkim_key_type type, GError **err) +{ + rspamd_dkim_key_t *key = NULL; + + if (keylen < 3) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "DKIM key is too short to be 
valid"); + return NULL; + } + + key = g_malloc0(sizeof(rspamd_dkim_key_t)); + REF_INIT_RETAIN(key, rspamd_dkim_key_free); + key->keydata = g_malloc0(keylen + 1); + key->raw_key = g_malloc(keylen); + key->decoded_len = keylen; + key->type = type; + + /* Copy key skipping all spaces and newlines */ + const char *h = keydata; + guint8 *t = key->raw_key; + + while (h - keydata < keylen) { + if (!g_ascii_isspace(*h)) { + *t++ = *h++; + } + else { + h++; + } + } + + key->keylen = t - key->raw_key; + + if (!rspamd_cryptobox_base64_decode(key->raw_key, key->keylen, key->keydata, + &key->decoded_len)) { + REF_RELEASE(key); + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "DKIM key is not a valid base64 string"); + + return NULL; + } + + /* Calculate ID -> md5 */ + EVP_MD_CTX *mdctx = EVP_MD_CTX_create(); + +#ifdef EVP_MD_CTX_FLAG_NON_FIPS_ALLOW + EVP_MD_CTX_set_flags(mdctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); +#endif + + if (EVP_DigestInit_ex(mdctx, EVP_md5(), NULL) == 1) { + guint dlen = sizeof(key->key_id); + + EVP_DigestUpdate(mdctx, key->keydata, key->decoded_len); + EVP_DigestFinal_ex(mdctx, key->key_id, &dlen); + } + + EVP_MD_CTX_destroy(mdctx); + + if (key->type == RSPAMD_DKIM_KEY_EDDSA) { + key->key.key_eddsa = key->keydata; + + if (key->decoded_len != rspamd_cryptobox_pk_sig_bytes( + RSPAMD_CRYPTOBOX_MODE_25519)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "DKIM key is has invalid length %d for eddsa; expected %d", + (gint) key->decoded_len, + rspamd_cryptobox_pk_sig_bytes(RSPAMD_CRYPTOBOX_MODE_25519)); + REF_RELEASE(key); + + return NULL; + } + } + else { + key->key_bio = BIO_new_mem_buf(key->keydata, key->decoded_len); + + if (key->key_bio == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "cannot make ssl bio from key"); + REF_RELEASE(key); + + return NULL; + } + + key->key_evp = d2i_PUBKEY_bio(key->key_bio, NULL); + + if (key->key_evp == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "cannot 
extract pubkey from bio"); + REF_RELEASE(key); + + return NULL; + } + + if (type == RSPAMD_DKIM_KEY_RSA) { + key->key.key_rsa = EVP_PKEY_get1_RSA(key->key_evp); + + if (key->key.key_rsa == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "cannot extract rsa key from evp key"); + REF_RELEASE(key); + + return NULL; + } + } + else { + key->key.key_ecdsa = EVP_PKEY_get1_EC_KEY(key->key_evp); + + if (key->key.key_ecdsa == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "cannot extract ecdsa key from evp key"); + REF_RELEASE(key); + + return NULL; + } + } + } + + return key; +} + +const guchar * +rspamd_dkim_key_id(rspamd_dkim_key_t *key) +{ + if (key) { + return key->key_id; + } + + return NULL; +} + +/** + * Free DKIM key + * @param key + */ +void rspamd_dkim_key_free(rspamd_dkim_key_t *key) +{ + if (key->key_evp) { + EVP_PKEY_free(key->key_evp); + } + + if (key->type == RSPAMD_DKIM_KEY_RSA) { + if (key->key.key_rsa) { + RSA_free(key->key.key_rsa); + } + } + else if (key->type == RSPAMD_DKIM_KEY_ECDSA) { + if (key->key.key_ecdsa) { + EC_KEY_free(key->key.key_ecdsa); + } + } + /* Nothing in case of eddsa key */ + if (key->key_bio) { + BIO_free(key->key_bio); + } + + g_free(key->raw_key); + g_free(key->keydata); + g_free(key); +} + +void rspamd_dkim_sign_key_free(rspamd_dkim_sign_key_t *key) +{ + if (key->key_evp) { + EVP_PKEY_free(key->key_evp); + } + if (key->type == RSPAMD_DKIM_KEY_RSA) { + if (key->key.key_rsa) { + RSA_free(key->key.key_rsa); + } + } + if (key->key_bio) { + BIO_free(key->key_bio); + } + + if (key->type == RSPAMD_DKIM_KEY_EDDSA) { + rspamd_explicit_memzero(key->key.key_eddsa, key->keylen); + g_free(key->keydata); + } + + g_free(key); +} + +rspamd_dkim_key_t * +rspamd_dkim_parse_key(const gchar *txt, gsize *keylen, GError **err) +{ + const gchar *c, *p, *end, *key = NULL, *alg = "rsa"; + enum { + read_tag = 0, + read_tag_before_eqsign, + read_eqsign, + read_p_tag, + read_k_tag, + ignore_value, + skip_spaces, + } 
state = read_tag, + next_state; + gchar tag = '\0'; + gsize klen = 0, alglen = 0; + + c = txt; + p = txt; + end = txt + strlen(txt); + + while (p < end) { + switch (state) { + case read_tag: + if (*p == '=') { + state = read_eqsign; + } + else if (g_ascii_isspace(*p)) { + state = skip_spaces; + + if (tag != '\0') { + /* We had tag letter */ + next_state = read_tag_before_eqsign; + } + else { + /* We had no tag letter, so we ignore empty tag */ + next_state = read_tag; + } + } + else { + tag = *p; + } + p++; + break; + case read_tag_before_eqsign: + /* Input: spaces before eqsign + * Output: either read a next tag (previous had no value), or read value + * p is moved forward + */ + if (*p == '=') { + state = read_eqsign; + } + else { + tag = *p; + state = read_tag; + } + p++; + break; + case read_eqsign: + /* Always switch to skip spaces state and do not advance p */ + state = skip_spaces; + + if (tag == 'p') { + next_state = read_p_tag; + } + else if (tag == 'k') { + next_state = read_k_tag; + } + else { + /* Unknown tag, ignore */ + next_state = ignore_value; + tag = '\0'; + } + break; + case read_p_tag: + if (*p == ';') { + klen = p - c; + key = c; + state = read_tag; + tag = '\0'; + p++; + } + else { + p++; + } + break; + case read_k_tag: + if (*p == ';') { + alglen = p - c; + alg = c; + state = read_tag; + tag = '\0'; + p++; + } + else if (g_ascii_isspace(*p)) { + alglen = p - c; + alg = c; + state = skip_spaces; + next_state = read_tag; + tag = '\0'; + } + else { + p++; + } + break; + case ignore_value: + if (*p == ';') { + state = read_tag; + tag = '\0'; + p++; + } + else if (g_ascii_isspace(*p)) { + state = skip_spaces; + next_state = read_tag; + tag = '\0'; + } + else { + p++; + } + break; + case skip_spaces: + /* Skip spaces and switch to the next state if needed */ + if (g_ascii_isspace(*p)) { + p++; + } + else { + c = p; + state = next_state; + } + break; + default: + break; + } + } + + /* Leftover */ + switch (state) { + case read_p_tag: + klen = p - c; 
+ key = c; + break; + case read_k_tag: + alglen = p - c; + alg = c; + break; + default: + break; + } + + if (klen == 0 || key == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "key is missing"); + + return NULL; + } + + if (alglen == 0 || alg == NULL) { + alg = "rsa"; /* Implicit */ + alglen = 3; + } + + if (keylen) { + *keylen = klen; + } + + if (alglen == 8 && rspamd_lc_cmp(alg, "ecdsa256", alglen) == 0) { + return rspamd_dkim_make_key(key, klen, + RSPAMD_DKIM_KEY_ECDSA, err); + } + else if (alglen == 7 && rspamd_lc_cmp(alg, "ed25519", alglen) == 0) { + return rspamd_dkim_make_key(key, klen, + RSPAMD_DKIM_KEY_EDDSA, err); + } + else { + /* We assume RSA default in all cases */ + return rspamd_dkim_make_key(key, klen, + RSPAMD_DKIM_KEY_RSA, err); + } + + g_assert_not_reached(); + + return NULL; +} + +/* Get TXT request data and parse it */ +static void +rspamd_dkim_dns_cb(struct rdns_reply *reply, gpointer arg) +{ + struct rspamd_dkim_key_cbdata *cbdata = arg; + rspamd_dkim_key_t *key = NULL; + GError *err = NULL; + struct rdns_reply_entry *elt; + gsize keylen = 0; + + if (reply->code != RDNS_RC_NOERROR) { + gint err_code = DKIM_SIGERROR_NOKEY; + if (reply->code == RDNS_RC_NOREC) { + err_code = DKIM_SIGERROR_NOREC; + } + else if (reply->code == RDNS_RC_NXDOMAIN) { + err_code = DKIM_SIGERROR_NOREC; + } + g_set_error(&err, + DKIM_ERROR, + err_code, + "dns request to %s failed: %s", + cbdata->ctx->dns_key, + rdns_strerror(reply->code)); + cbdata->handler(NULL, 0, cbdata->ctx, cbdata->ud, err); + } + else { + LL_FOREACH(reply->entries, elt) + { + if (elt->type == RDNS_REQUEST_TXT) { + if (err != NULL) { + /* Free error as it is insignificant */ + g_error_free(err); + err = NULL; + } + key = rspamd_dkim_parse_key(elt->content.txt.data, + &keylen, + &err); + if (key) { + key->ttl = elt->ttl; + break; + } + } + } + cbdata->handler(key, keylen, cbdata->ctx, cbdata->ud, err); + } +} + +/** + * Make DNS request for specified context and obtain and parse 
key + * @param ctx dkim context from signature + * @param resolver dns resolver object + * @param s async session to make request + * @return + */ +gboolean +rspamd_get_dkim_key(rspamd_dkim_context_t *ctx, + struct rspamd_task *task, + dkim_key_handler_f handler, + gpointer ud) +{ + struct rspamd_dkim_key_cbdata *cbdata; + + g_return_val_if_fail(ctx != NULL, FALSE); + g_return_val_if_fail(ctx->dns_key != NULL, FALSE); + + cbdata = + rspamd_mempool_alloc(ctx->pool, + sizeof(struct rspamd_dkim_key_cbdata)); + cbdata->ctx = ctx; + cbdata->handler = handler; + cbdata->ud = ud; + + return rspamd_dns_resolver_request_task_forced(task, + rspamd_dkim_dns_cb, + cbdata, + RDNS_REQUEST_TXT, + ctx->dns_key); +} + +static gboolean +rspamd_dkim_relaxed_body_step(struct rspamd_dkim_common_ctx *ctx, EVP_MD_CTX *ck, + const gchar **start, guint size, + gssize *remain) +{ + const gchar *h; + gchar *t; + guint len, inlen; + gssize octets_remain; + gboolean got_sp, ret = TRUE; + gchar buf[1024]; + + len = size; + inlen = sizeof(buf) - 1; + h = *start; + t = buf; + got_sp = FALSE; + octets_remain = *remain; + + while (len > 0 && inlen > 0 && (octets_remain > 0)) { + + if (*h == '\r' || *h == '\n') { + if (got_sp) { + /* Ignore spaces at the end of line */ + t--; + } + *t++ = '\r'; + *t++ = '\n'; + + if (len > 1 && (*h == '\r' && h[1] == '\n')) { + h += 2; + len -= 2; + octets_remain -= 2; + } + else { + h++; + len--; + if (octets_remain >= 2) { + octets_remain -= 2; /* Input has just \n or \r so we actually add more octets */ + } + else { + octets_remain--; + break; + } + } + break; + } + else if (g_ascii_isspace(*h)) { + if (got_sp) { + /* Ignore multiply spaces */ + h++; + len--; + continue; + } + else { + *t++ = ' '; + h++; + inlen--; + len--; + octets_remain--; + got_sp = TRUE; + continue; + } + } + else { + got_sp = FALSE; + } + + *t++ = *h++; + inlen--; + len--; + octets_remain--; + } + + if (octets_remain < 0) { + /* Absurdic l tag value, but we still need to rewind the t 
pointer back */ + while (t > buf && octets_remain < 0) { + t--; + octets_remain++; + } + + ret = FALSE; + } + + *start = h; + + if (t - buf > 0) { + gsize cklen = t - buf; + + EVP_DigestUpdate(ck, buf, cklen); + ctx->body_canonicalised += cklen; + msg_debug_dkim("relaxed update signature with body buffer " + "(%z size, %z -> %z remain)", + cklen, *remain, octets_remain); + *remain = octets_remain; + } + + return ret && ((len > 0) && (octets_remain > 0)); +} + +static gboolean +rspamd_dkim_simple_body_step(struct rspamd_dkim_common_ctx *ctx, + EVP_MD_CTX *ck, const gchar **start, guint size, + gssize *remain) +{ + const gchar *h; + gchar *t; + guint len, inlen; + gssize octets_remain; + gchar buf[1024]; + + len = size; + inlen = sizeof(buf) - 1; + h = *start; + t = &buf[0]; + octets_remain = *remain; + + while (len > 0 && inlen > 0 && (octets_remain != 0)) { + if (*h == '\r' || *h == '\n') { + *t++ = '\r'; + *t++ = '\n'; + + if (len > 1 && (*h == '\r' && h[1] == '\n')) { + h += 2; + len -= 2; + + if (octets_remain >= 2) { + octets_remain -= 2; /* Input has just \n or \r so we actually add more octets */ + } + else { + octets_remain--; + } + } + else { + h++; + len--; + + if (octets_remain >= 2) { + octets_remain -= 2; /* Input has just \n or \r so we actually add more octets */ + } + else { + octets_remain--; + } + } + break; + } + + *t++ = *h++; + octets_remain--; + inlen--; + len--; + } + + *start = h; + + if (t - buf > 0) { + gsize cklen = t - buf; + + EVP_DigestUpdate(ck, buf, cklen); + ctx->body_canonicalised += cklen; + msg_debug_dkim("simple update signature with body buffer " + "(%z size, %z -> %z remain)", + cklen, *remain, octets_remain); + *remain = octets_remain; + } + + return ((len != 0) && (octets_remain != 0)); +} + +static const gchar * +rspamd_dkim_skip_empty_lines(const gchar *start, const gchar *end, + guint type, gboolean sign, gboolean *need_crlf) +{ + const gchar *p = end - 1, *t; + enum { + init = 0, + init_2, + got_cr, + got_lf, + got_crlf, 
+ test_spaces, + } state = init; + guint skip = 0; + + while (p >= start) { + switch (state) { + case init: + if (*p == '\r') { + state = got_cr; + } + else if (*p == '\n') { + state = got_lf; + } + else if (type == DKIM_CANON_RELAXED && *p == ' ') { + skip = 0; + state = test_spaces; + } + else { + if (sign || type != DKIM_CANON_RELAXED) { + *need_crlf = TRUE; + } + + goto end; + } + break; + case init_2: + if (*p == '\r') { + state = got_cr; + } + else if (*p == '\n') { + state = got_lf; + } + else if (type == DKIM_CANON_RELAXED && (*p == ' ' || *p == '\t')) { + skip = 0; + state = test_spaces; + } + else { + goto end; + } + break; + case got_cr: + if (p >= start + 1) { + if (*(p - 1) == '\r') { + p--; + state = got_cr; + } + else if (*(p - 1) == '\n') { + if ((*p - 2) == '\r') { + /* \r\n\r -> we know about one line */ + p -= 1; + state = got_crlf; + } + else { + /* \n\r -> we know about one line */ + p -= 1; + state = got_lf; + } + } + else if (type == DKIM_CANON_RELAXED && (*(p - 1) == ' ' || + *(p - 1) == '\t')) { + skip = 1; + state = test_spaces; + } + else { + goto end; + } + } + else { + if (g_ascii_isspace(*(p - 1))) { + if (type == DKIM_CANON_RELAXED) { + p -= 1; + } + } + goto end; + } + break; + case got_lf: + if (p >= start + 1) { + if (*(p - 1) == '\r') { + state = got_crlf; + } + else if (*(p - 1) == '\n') { + /* We know about one line */ + p--; + state = got_lf; + } + else if (type == DKIM_CANON_RELAXED && (*(p - 1) == ' ' || + *(p - 1) == '\t')) { + skip = 1; + state = test_spaces; + } + else { + goto end; + } + } + else { + if (g_ascii_isspace(*(p - 1))) { + if (type == DKIM_CANON_RELAXED) { + p -= 1; + } + } + goto end; + } + break; + case got_crlf: + if (p >= start + 2) { + if (*(p - 2) == '\r') { + p -= 2; + state = got_cr; + } + else if (*(p - 2) == '\n') { + p -= 2; + state = got_lf; + } + else if (type == DKIM_CANON_RELAXED && (*(p - 2) == ' ' || + *(p - 2) == '\t')) { + skip = 2; + state = test_spaces; + } + else { + goto end; + } + } + 
else { + if (g_ascii_isspace(*(p - 2))) { + if (type == DKIM_CANON_RELAXED) { + p -= 2; + } + } + goto end; + } + break; + case test_spaces: + t = p - skip; + + while (t >= start + 2 && (*t == ' ' || *t == '\t')) { + t--; + } + + if (*t == '\r') { + p = t; + state = got_cr; + } + else if (*t == '\n') { + p = t; + state = got_lf; + } + else { + goto end; + } + break; + } + } + +end: + return p; +} + +static gboolean +rspamd_dkim_canonize_body(struct rspamd_dkim_common_ctx *ctx, + const gchar *start, + const gchar *end, + gboolean sign) +{ + const gchar *p; + gssize remain = ctx->len ? ctx->len : G_MAXSSIZE; + guint total_len = end - start; + gboolean need_crlf = FALSE; + + if (start == NULL) { + /* Empty body */ + if (ctx->body_canon_type == DKIM_CANON_SIMPLE) { + EVP_DigestUpdate(ctx->body_hash, CRLF, sizeof(CRLF) - 1); + ctx->body_canonicalised += sizeof(CRLF) - 1; + } + else { + EVP_DigestUpdate(ctx->body_hash, "", 0); + } + } + else { + /* Strip extra ending CRLF */ + p = rspamd_dkim_skip_empty_lines(start, end, ctx->body_canon_type, + sign, &need_crlf); + end = p + 1; + + if (end == start) { + /* Empty body */ + if (ctx->body_canon_type == DKIM_CANON_SIMPLE) { + EVP_DigestUpdate(ctx->body_hash, CRLF, sizeof(CRLF) - 1); + ctx->body_canonicalised += sizeof(CRLF) - 1; + } + else { + EVP_DigestUpdate(ctx->body_hash, "", 0); + } + } + else { + if (ctx->body_canon_type == DKIM_CANON_SIMPLE) { + /* Simple canonization */ + while (rspamd_dkim_simple_body_step(ctx, ctx->body_hash, + &start, end - start, &remain)) + ; + + /* + * If we have l= tag then we cannot add crlf... + */ + if (need_crlf) { + /* l is evil... 
*/ + if (ctx->len == 0) { + remain = 2; + } + else { + if (ctx->len <= total_len) { + /* We don't have enough l to add \r\n */ + remain = 0; + } + else { + if (ctx->len - total_len >= 2) { + remain = 2; + } + else { + remain = ctx->len - total_len; + } + } + } + + start = "\r\n"; + end = start + 2; + + rspamd_dkim_simple_body_step(ctx, ctx->body_hash, + &start, end - start, &remain); + } + } + else { + while (rspamd_dkim_relaxed_body_step(ctx, ctx->body_hash, + &start, end - start, &remain)) + ; + if (need_crlf) { + start = "\r\n"; + end = start + 2; + remain = 2; + rspamd_dkim_relaxed_body_step(ctx, ctx->body_hash, + &start, end - start, &remain); + } + } + } + return TRUE; + } + + /* TODO: Implement relaxed algorithm */ + return FALSE; +} + +/* Update hash converting all CR and LF to CRLF */ +static void +rspamd_dkim_hash_update(EVP_MD_CTX *ck, const gchar *begin, gsize len) +{ + const gchar *p, *c, *end; + + end = begin + len; + p = begin; + c = p; + + while (p < end) { + if (*p == '\r') { + EVP_DigestUpdate(ck, c, p - c); + EVP_DigestUpdate(ck, CRLF, sizeof(CRLF) - 1); + p++; + + if (p < end && *p == '\n') { + p++; + } + c = p; + } + else if (*p == '\n') { + EVP_DigestUpdate(ck, c, p - c); + EVP_DigestUpdate(ck, CRLF, sizeof(CRLF) - 1); + p++; + c = p; + } + else { + p++; + } + } + + if (p > c) { + EVP_DigestUpdate(ck, c, p - c); + } +} + +/* Update hash by signature value (ignoring b= tag) */ +static void +rspamd_dkim_signature_update(struct rspamd_dkim_common_ctx *ctx, + const gchar *begin, + guint len) +{ + const gchar *p, *c, *end; + gboolean tag, skip; + + end = begin + len; + p = begin; + c = begin; + tag = TRUE; + skip = FALSE; + + while (p < end) { + if (tag && p[0] == 'b' && p[1] == '=') { + /* Add to signature */ + msg_debug_dkim("initial update hash with signature part: %*s", + (gint) (p - c + 2), + c); + ctx->headers_canonicalised += p - c + 2; + rspamd_dkim_hash_update(ctx->headers_hash, c, p - c + 2); + skip = TRUE; + } + else if (skip && (*p == 
';' || p == end - 1)) { + skip = FALSE; + c = p; + } + else if (!tag && *p == ';') { + tag = TRUE; + } + else if (tag && *p == '=') { + tag = FALSE; + } + p++; + } + + p--; + /* Skip \r\n at the end */ + while ((*p == '\r' || *p == '\n') && p >= c) { + p--; + } + + if (p - c + 1 > 0) { + msg_debug_dkim("final update hash with signature part: %*s", + (gint) (p - c + 1), c); + ctx->headers_canonicalised += p - c + 1; + rspamd_dkim_hash_update(ctx->headers_hash, c, p - c + 1); + } +} + +goffset +rspamd_dkim_canonize_header_relaxed_str(const gchar *hname, + const gchar *hvalue, + gchar *out, + gsize outlen) +{ + gchar *t; + const guchar *h; + gboolean got_sp; + + /* Name part */ + t = out; + h = hname; + + while (*h && t - out < outlen) { + *t++ = lc_map[*h++]; + } + + if (t - out >= outlen) { + return -1; + } + + *t++ = ':'; + + /* Value part */ + h = hvalue; + /* Skip spaces at the beginning */ + while (g_ascii_isspace(*h)) { + h++; + } + + got_sp = FALSE; + + while (*h && (t - out < outlen)) { + if (g_ascii_isspace(*h)) { + if (got_sp) { + h++; + continue; + } + else { + got_sp = TRUE; + *t++ = ' '; + h++; + continue; + } + } + else { + got_sp = FALSE; + } + + *t++ = *h++; + } + + if (g_ascii_isspace(*(t - 1))) { + t--; + } + + if (t - out >= outlen - 2) { + return -1; + } + + *t++ = '\r'; + *t++ = '\n'; + *t = '\0'; + + return t - out; +} + +static gboolean +rspamd_dkim_canonize_header_relaxed(struct rspamd_dkim_common_ctx *ctx, + const gchar *header, + const gchar *header_name, + gboolean is_sign, + guint count, + bool is_seal) +{ + static gchar st_buf[8192]; + gchar *buf; + guint inlen; + goffset r; + gboolean allocated = FALSE; + + inlen = strlen(header) + strlen(header_name) + sizeof(":" CRLF); + + if (inlen > sizeof(st_buf)) { + buf = g_malloc(inlen); + allocated = TRUE; + } + else { + /* Faster */ + buf = st_buf; + } + + r = rspamd_dkim_canonize_header_relaxed_str(header_name, header, buf, inlen); + + g_assert(r != -1); + + if (!is_sign) { + 
msg_debug_dkim("update %s with header (idx=%d): %s", + is_seal ? "seal" : "signature", count, buf); + EVP_DigestUpdate(ctx->headers_hash, buf, r); + } + else { + rspamd_dkim_signature_update(ctx, buf, r); + } + + if (allocated) { + g_free(buf); + } + + return TRUE; +} + + +static gboolean +rspamd_dkim_canonize_header(struct rspamd_dkim_common_ctx *ctx, + struct rspamd_task *task, + const gchar *header_name, + gint count, + const gchar *dkim_header, + const gchar *dkim_domain) +{ + struct rspamd_mime_header *rh, *cur, *sel = NULL; + gint hdr_cnt = 0; + bool use_idx = false, is_sign = ctx->is_sign; + + /* + * TODO: + * Temporary hack to prevent linked list being misused until refactored + */ + const guint max_list_iters = 1000; + + if (count < 0) { + use_idx = true; + count = -(count); /* use i= in header content as it is arc stuff */ + } + + if (dkim_header == NULL) { + rh = rspamd_message_get_header_array(task, header_name, + is_sign); + + if (rh) { + /* Check uniqueness of the header but we count from the bottom to top */ + if (!use_idx) { + for (cur = rh->prev;; cur = cur->prev) { + if (hdr_cnt == count) { + sel = cur; + } + + hdr_cnt++; + + if (cur == rh || hdr_cnt >= max_list_iters) { + /* Cycle */ + break; + } + } + + if ((rh->flags & RSPAMD_HEADER_UNIQUE) && hdr_cnt > 1) { + guint64 random_cookie = ottery_rand_uint64(); + + msg_warn_dkim("header %s is intended to be unique by" + " email standards, but we have %d headers of this" + " type, artificially break DKIM check", + header_name, + hdr_cnt); + rspamd_dkim_hash_update(ctx->headers_hash, + (const gchar *) &random_cookie, + sizeof(random_cookie)); + ctx->headers_canonicalised += sizeof(random_cookie); + + return FALSE; + } + + if (hdr_cnt <= count) { + /* + * If DKIM has less headers requested than there are in a + * message, then it's fine, it allows adding extra headers + */ + return TRUE; + } + } + else { + /* + * This branch is used for ARC headers, and it orders them based on + * i=<number> string and 
not their real order in the list of headers + */ + gchar idx_buf[16]; + gint id_len, i; + + id_len = rspamd_snprintf(idx_buf, sizeof(idx_buf), "i=%d;", + count); + + for (cur = rh->prev, i = 0; i < max_list_iters; cur = cur->prev, i++) { + if (cur->decoded && + rspamd_substring_search(cur->decoded, strlen(cur->decoded), + idx_buf, id_len) != -1) { + sel = cur; + break; + } + + if (cur == rh) { + /* Cycle */ + break; + } + } + + if (sel == NULL) { + return FALSE; + } + } + + /* Selected header must be non-null if previous condition is false */ + g_assert(sel != NULL); + + if (ctx->header_canon_type == DKIM_CANON_SIMPLE) { + rspamd_dkim_hash_update(ctx->headers_hash, sel->raw_value, + sel->raw_len); + ctx->headers_canonicalised += sel->raw_len; + msg_debug_dkim("update %s with header (idx=%d): %*s", + (use_idx ? "seal" : "signature"), + count, (gint) sel->raw_len, sel->raw_value); + } + else { + if (is_sign && (sel->flags & RSPAMD_HEADER_FROM)) { + /* Special handling of the From handling when rewrite is done */ + gboolean has_rewrite = FALSE; + guint i; + struct rspamd_email_address *addr; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, from_mime), i, addr) + { + if ((addr->flags & RSPAMD_EMAIL_ADDR_ORIGINAL) && !(addr->flags & RSPAMD_EMAIL_ADDR_ALIASED)) { + has_rewrite = TRUE; + } + } + + if (has_rewrite) { + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, from_mime), i, addr) + { + if (!(addr->flags & RSPAMD_EMAIL_ADDR_ORIGINAL)) { + if (!rspamd_dkim_canonize_header_relaxed(ctx, addr->raw, + header_name, FALSE, i, use_idx)) { + return FALSE; + } + + return TRUE; + } + } + } + } + + if (!rspamd_dkim_canonize_header_relaxed(ctx, sel->value, + header_name, FALSE, count, use_idx)) { + return FALSE; + } + } + } + } + else { + /* For signature check just use the saved dkim header */ + if (ctx->header_canon_type == DKIM_CANON_SIMPLE) { + /* We need to find our own signature and use it */ + rh = rspamd_message_get_header_array(task, header_name, is_sign); + + if (rh) { + /* We need to 
find our own signature */ + if (!dkim_domain) { + msg_err_dkim("cannot verify dkim as we have no dkim domain!"); + return FALSE; + } + + gboolean found = FALSE; + + DL_FOREACH(rh, cur) + { + guint64 th = rspamd_cryptobox_fast_hash(cur->decoded, + strlen(cur->decoded), rspamd_hash_seed()); + + if (th == ctx->sig_hash) { + rspamd_dkim_signature_update(ctx, cur->raw_value, + cur->raw_len); + found = TRUE; + break; + } + } + if (!found) { + msg_err_dkim("BUGON: cannot verify dkim as we have lost our signature" + " during simple canonicalisation, expected hash=%L", + ctx->sig_hash); + return FALSE; + } + } + else { + return FALSE; + } + } + else { + if (!rspamd_dkim_canonize_header_relaxed(ctx, + dkim_header, + header_name, + TRUE, 0, use_idx)) { + return FALSE; + } + } + } + + return TRUE; +} + +struct rspamd_dkim_cached_hash { + guchar *digest_normal; + guchar *digest_cr; + guchar *digest_crlf; + gchar *type; +}; + +static struct rspamd_dkim_cached_hash * +rspamd_dkim_check_bh_cached(struct rspamd_dkim_common_ctx *ctx, + struct rspamd_task *task, gsize bhlen, gboolean is_sign) +{ + gchar typebuf[64]; + struct rspamd_dkim_cached_hash *res; + + rspamd_snprintf(typebuf, sizeof(typebuf), + RSPAMD_MEMPOOL_DKIM_BH_CACHE "%z_%s_%d_%z", + bhlen, + ctx->body_canon_type == DKIM_CANON_RELAXED ? 
"1" : "0", + !!is_sign, + ctx->len); + + res = rspamd_mempool_get_variable(task->task_pool, + typebuf); + + if (!res) { + res = rspamd_mempool_alloc0(task->task_pool, sizeof(*res)); + res->type = rspamd_mempool_strdup(task->task_pool, typebuf); + rspamd_mempool_set_variable(task->task_pool, + res->type, res, NULL); + } + + return res; +} + +static const char * +rspamd_dkim_type_to_string(enum rspamd_dkim_type t) +{ + switch (t) { + case RSPAMD_DKIM_NORMAL: + return "dkim"; + case RSPAMD_DKIM_ARC_SIG: + return "arc_sig"; + case RSPAMD_DKIM_ARC_SEAL: + default: + return "arc_seal"; + } +} + +/** + * Check task for dkim context using dkim key + * @param ctx dkim verify context + * @param key dkim key (from cache or from dns request) + * @param task task to check + * @return + */ +struct rspamd_dkim_check_result * +rspamd_dkim_check(rspamd_dkim_context_t *ctx, + rspamd_dkim_key_t *key, + struct rspamd_task *task) +{ + const gchar *body_end, *body_start; + guchar raw_digest[EVP_MAX_MD_SIZE]; + struct rspamd_dkim_cached_hash *cached_bh = NULL; + EVP_MD_CTX *cpy_ctx = NULL; + gsize dlen = 0; + struct rspamd_dkim_check_result *res; + guint i; + struct rspamd_dkim_header *dh; + gint nid; + + g_return_val_if_fail(ctx != NULL, NULL); + g_return_val_if_fail(key != NULL, NULL); + g_return_val_if_fail(task->msg.len > 0, NULL); + + /* First of all find place of body */ + body_end = task->msg.begin + task->msg.len; + + body_start = MESSAGE_FIELD(task, raw_headers_content).body_start; + + res = rspamd_mempool_alloc0(task->task_pool, sizeof(*res)); + res->ctx = ctx; + res->selector = ctx->selector; + res->domain = ctx->domain; + res->fail_reason = NULL; + res->short_b = ctx->short_b; + res->rcode = DKIM_CONTINUE; + + if (!body_start) { + res->rcode = DKIM_ERROR; + return res; + } + + if (ctx->common.type != RSPAMD_DKIM_ARC_SEAL) { + dlen = EVP_MD_CTX_size(ctx->common.body_hash); + cached_bh = rspamd_dkim_check_bh_cached(&ctx->common, task, + dlen, FALSE); + + if 
(!cached_bh->digest_normal) { + /* Start canonization of body part */ + if (!rspamd_dkim_canonize_body(&ctx->common, body_start, body_end, + FALSE)) { + res->rcode = DKIM_RECORD_ERROR; + return res; + } + } + } + + /* Now canonize headers */ + for (i = 0; i < ctx->common.hlist->len; i++) { + dh = g_ptr_array_index(ctx->common.hlist, i); + rspamd_dkim_canonize_header(&ctx->common, task, dh->name, dh->count, + NULL, NULL); + } + + /* Canonize dkim signature */ + switch (ctx->common.type) { + case RSPAMD_DKIM_NORMAL: + rspamd_dkim_canonize_header(&ctx->common, task, RSPAMD_DKIM_SIGNHEADER, 0, + ctx->dkim_header, ctx->domain); + break; + case RSPAMD_DKIM_ARC_SIG: + rspamd_dkim_canonize_header(&ctx->common, task, RSPAMD_DKIM_ARC_SIGNHEADER, 0, + ctx->dkim_header, ctx->domain); + break; + case RSPAMD_DKIM_ARC_SEAL: + rspamd_dkim_canonize_header(&ctx->common, task, RSPAMD_DKIM_ARC_SEALHEADER, 0, + ctx->dkim_header, ctx->domain); + break; + } + + + /* Use cached BH for all but arc seal, if it is not NULL we are not in arc seal mode */ + if (cached_bh != NULL) { + if (!cached_bh->digest_normal) { + /* Copy md_ctx to deal with broken CRLF at the end */ + cpy_ctx = EVP_MD_CTX_create(); + EVP_MD_CTX_copy(cpy_ctx, ctx->common.body_hash); + EVP_DigestFinal_ex(cpy_ctx, raw_digest, NULL); + + cached_bh->digest_normal = rspamd_mempool_alloc(task->task_pool, + sizeof(raw_digest)); + memcpy(cached_bh->digest_normal, raw_digest, sizeof(raw_digest)); + } + + /* Check bh field */ + if (memcmp(ctx->bh, cached_bh->digest_normal, ctx->bhlen) != 0) { + msg_debug_dkim( + "bh value mismatch: %*xs versus %*xs, try add LF; try adding CRLF", + (gint) dlen, ctx->bh, + (gint) dlen, raw_digest); + + if (cpy_ctx) { + /* Try add CRLF */ +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + EVP_MD_CTX_cleanup(cpy_ctx); +#else + EVP_MD_CTX_reset(cpy_ctx); +#endif + EVP_MD_CTX_copy(cpy_ctx, ctx->common.body_hash); + EVP_DigestUpdate(cpy_ctx, "\r\n", 2); + 
EVP_DigestFinal_ex(cpy_ctx, raw_digest, NULL); + cached_bh->digest_crlf = rspamd_mempool_alloc(task->task_pool, + sizeof(raw_digest)); + memcpy(cached_bh->digest_crlf, raw_digest, sizeof(raw_digest)); + + if (memcmp(ctx->bh, raw_digest, ctx->bhlen) != 0) { + msg_debug_dkim( + "bh value mismatch after added CRLF: %*xs versus %*xs, try add LF", + (gint) dlen, ctx->bh, + (gint) dlen, raw_digest); + + /* Try add LF */ +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + EVP_MD_CTX_cleanup(cpy_ctx); +#else + EVP_MD_CTX_reset(cpy_ctx); +#endif + EVP_MD_CTX_copy(cpy_ctx, ctx->common.body_hash); + EVP_DigestUpdate(cpy_ctx, "\n", 1); + EVP_DigestFinal_ex(cpy_ctx, raw_digest, NULL); + cached_bh->digest_cr = rspamd_mempool_alloc(task->task_pool, + sizeof(raw_digest)); + memcpy(cached_bh->digest_cr, raw_digest, sizeof(raw_digest)); + + if (memcmp(ctx->bh, raw_digest, ctx->bhlen) != 0) { + msg_debug_dkim("bh value mismatch after added LF: %*xs versus %*xs", + (gint) dlen, ctx->bh, + (gint) dlen, raw_digest); + res->fail_reason = "body hash did not verify"; + res->rcode = DKIM_REJECT; + } + } + } + else if (cached_bh->digest_crlf) { + if (memcmp(ctx->bh, cached_bh->digest_crlf, ctx->bhlen) != 0) { + msg_debug_dkim("bh value mismatch after added CRLF: %*xs versus %*xs", + (gint) dlen, ctx->bh, + (gint) dlen, cached_bh->digest_crlf); + + if (cached_bh->digest_cr) { + if (memcmp(ctx->bh, cached_bh->digest_cr, ctx->bhlen) != 0) { + msg_debug_dkim( + "bh value mismatch after added LF: %*xs versus %*xs", + (gint) dlen, ctx->bh, + (gint) dlen, cached_bh->digest_cr); + + res->fail_reason = "body hash did not verify"; + res->rcode = DKIM_REJECT; + } + } + else { + + res->fail_reason = "body hash did not verify"; + res->rcode = DKIM_REJECT; + } + } + } + else { + msg_debug_dkim( + "bh value mismatch: %*xs versus %*xs", + (gint) dlen, ctx->bh, + (gint) dlen, cached_bh->digest_normal); + res->fail_reason = "body hash did not verify"; + res->rcode = DKIM_REJECT; + 
} + } + + if (cpy_ctx) { +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + EVP_MD_CTX_cleanup(cpy_ctx); +#else + EVP_MD_CTX_reset(cpy_ctx); +#endif + EVP_MD_CTX_destroy(cpy_ctx); + } + + if (res->rcode == DKIM_REJECT) { + msg_info_dkim( + "%s: bh value mismatch: got %*Bs, expected %*Bs; " + "body length %d->%d; d=%s; s=%s", + rspamd_dkim_type_to_string(ctx->common.type), + (gint) dlen, cached_bh->digest_normal, + (gint) dlen, ctx->bh, + (gint) (body_end - body_start), ctx->common.body_canonicalised, + ctx->domain, ctx->selector); + + return res; + } + } + + dlen = EVP_MD_CTX_size(ctx->common.headers_hash); + EVP_DigestFinal_ex(ctx->common.headers_hash, raw_digest, NULL); + /* Check headers signature */ + + if (ctx->sig_alg == DKIM_SIGN_RSASHA1) { + nid = NID_sha1; + } + else if (ctx->sig_alg == DKIM_SIGN_RSASHA256 || + ctx->sig_alg == DKIM_SIGN_ECDSASHA256 || + ctx->sig_alg == DKIM_SIGN_EDDSASHA256) { + nid = NID_sha256; + } + else if (ctx->sig_alg == DKIM_SIGN_RSASHA512 || + ctx->sig_alg == DKIM_SIGN_ECDSASHA512) { + nid = NID_sha512; + } + else { + /* Not reached */ + nid = NID_sha1; + } + + switch (key->type) { + case RSPAMD_DKIM_KEY_RSA: + if (RSA_verify(nid, raw_digest, dlen, ctx->b, ctx->blen, + key->key.key_rsa) != 1) { + msg_debug_dkim("headers rsa verify failed"); + ERR_clear_error(); + res->rcode = DKIM_REJECT; + res->fail_reason = "headers rsa verify failed"; + + msg_info_dkim( + "%s: headers RSA verification failure; " + "body length %d->%d; headers length %d; d=%s; s=%s; key_md5=%*xs; orig header: %s", + rspamd_dkim_type_to_string(ctx->common.type), + (gint) (body_end - body_start), ctx->common.body_canonicalised, + ctx->common.headers_canonicalised, + ctx->domain, ctx->selector, + RSPAMD_DKIM_KEY_ID_LEN, rspamd_dkim_key_id(key), + ctx->dkim_header); + } + break; + case RSPAMD_DKIM_KEY_ECDSA: + if (ECDSA_verify(nid, raw_digest, dlen, ctx->b, ctx->blen, + key->key.key_ecdsa) != 1) { + msg_info_dkim( + "%s: headers ECDSA 
verification failure; " + "body length %d->%d; headers length %d; d=%s; s=%s; key_md5=%*xs; orig header: %s", + rspamd_dkim_type_to_string(ctx->common.type), + (gint) (body_end - body_start), ctx->common.body_canonicalised, + ctx->common.headers_canonicalised, + ctx->domain, ctx->selector, + RSPAMD_DKIM_KEY_ID_LEN, rspamd_dkim_key_id(key), + ctx->dkim_header); + msg_debug_dkim("headers ecdsa verify failed"); + ERR_clear_error(); + res->rcode = DKIM_REJECT; + res->fail_reason = "headers ecdsa verify failed"; + } + break; + case RSPAMD_DKIM_KEY_EDDSA: + if (!rspamd_cryptobox_verify(ctx->b, ctx->blen, raw_digest, dlen, + key->key.key_eddsa, RSPAMD_CRYPTOBOX_MODE_25519)) { + msg_info_dkim( + "%s: headers EDDSA verification failure; " + "body length %d->%d; headers length %d; d=%s; s=%s; key_md5=%*xs; orig header: %s", + rspamd_dkim_type_to_string(ctx->common.type), + (gint) (body_end - body_start), ctx->common.body_canonicalised, + ctx->common.headers_canonicalised, + ctx->domain, ctx->selector, + RSPAMD_DKIM_KEY_ID_LEN, rspamd_dkim_key_id(key), + ctx->dkim_header); + msg_debug_dkim("headers eddsa verify failed"); + res->rcode = DKIM_REJECT; + res->fail_reason = "headers eddsa verify failed"; + } + break; + } + + + if (ctx->common.type == RSPAMD_DKIM_ARC_SEAL && res->rcode == DKIM_CONTINUE) { + switch (ctx->cv) { + case RSPAMD_ARC_INVALID: + msg_info_dkim("arc seal is invalid i=%d", ctx->common.idx); + res->rcode = DKIM_PERM_ERROR; + res->fail_reason = "arc seal is invalid"; + break; + case RSPAMD_ARC_FAIL: + msg_info_dkim("arc seal failed i=%d", ctx->common.idx); + res->rcode = DKIM_REJECT; + res->fail_reason = "arc seal failed"; + break; + default: + break; + } + } + + return res; +} + +struct rspamd_dkim_check_result * +rspamd_dkim_create_result(rspamd_dkim_context_t *ctx, + enum rspamd_dkim_check_rcode rcode, + struct rspamd_task *task) +{ + struct rspamd_dkim_check_result *res; + + res = rspamd_mempool_alloc0(task->task_pool, sizeof(*res)); + res->ctx = ctx; + 
res->selector = ctx->selector; + res->domain = ctx->domain; + res->fail_reason = NULL; + res->short_b = ctx->short_b; + res->rcode = rcode; + + return res; +} + +rspamd_dkim_key_t * +rspamd_dkim_key_ref(rspamd_dkim_key_t *k) +{ + REF_RETAIN(k); + + return k; +} + +void rspamd_dkim_key_unref(rspamd_dkim_key_t *k) +{ + REF_RELEASE(k); +} + +rspamd_dkim_sign_key_t * +rspamd_dkim_sign_key_ref(rspamd_dkim_sign_key_t *k) +{ + REF_RETAIN(k); + + return k; +} + +void rspamd_dkim_sign_key_unref(rspamd_dkim_sign_key_t *k) +{ + REF_RELEASE(k); +} + +const gchar * +rspamd_dkim_get_domain(rspamd_dkim_context_t *ctx) +{ + if (ctx) { + return ctx->domain; + } + + return NULL; +} + +const gchar * +rspamd_dkim_get_selector(rspamd_dkim_context_t *ctx) +{ + if (ctx) { + return ctx->selector; + } + + return NULL; +} + +guint rspamd_dkim_key_get_ttl(rspamd_dkim_key_t *k) +{ + if (k) { + return k->ttl; + } + + return 0; +} + +const gchar * +rspamd_dkim_get_dns_key(rspamd_dkim_context_t *ctx) +{ + if (ctx) { + return ctx->dns_key; + } + + return NULL; +} + +#define PEM_SIG "-----BEGIN" + +rspamd_dkim_sign_key_t * +rspamd_dkim_sign_key_load(const gchar *key, gsize len, + enum rspamd_dkim_key_format type, + GError **err) +{ + guchar *map = NULL, *tmp = NULL; + gsize maplen; + rspamd_dkim_sign_key_t *nkey; + time_t mtime = time(NULL); + + if (type < 0 || type > RSPAMD_DKIM_KEY_UNKNOWN || len == 0 || key == NULL) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "invalid key type to load: %d", type); + return NULL; + } + + nkey = g_malloc0(sizeof(*nkey)); + nkey->mtime = mtime; + + msg_debug_dkim_taskless("got public key with length %z and type %d", + len, type); + + /* Load key file if needed */ + if (type == RSPAMD_DKIM_KEY_FILE) { + struct stat st; + + if (stat(key, &st) != 0) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "cannot stat key file: '%s' %s", key, strerror(errno)); + g_free(nkey); + + return NULL; + } + + nkey->mtime = st.st_mtime; + map = 
rspamd_file_xmap(key, PROT_READ, &maplen, TRUE); + + if (map == NULL) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "cannot map key file: '%s' %s", key, strerror(errno)); + g_free(nkey); + + return NULL; + } + + key = map; + len = maplen; + + if (maplen > sizeof(PEM_SIG) && + strncmp(map, PEM_SIG, sizeof(PEM_SIG) - 1) == 0) { + type = RSPAMD_DKIM_KEY_PEM; + } + else if (rspamd_cryptobox_base64_is_valid(map, maplen)) { + type = RSPAMD_DKIM_KEY_BASE64; + } + else { + type = RSPAMD_DKIM_KEY_RAW; + } + } + + if (type == RSPAMD_DKIM_KEY_UNKNOWN) { + if (len > sizeof(PEM_SIG) && + memcmp(key, PEM_SIG, sizeof(PEM_SIG) - 1) == 0) { + type = RSPAMD_DKIM_KEY_PEM; + } + else { + type = RSPAMD_DKIM_KEY_RAW; + } + } + + if (type == RSPAMD_DKIM_KEY_BASE64) { + type = RSPAMD_DKIM_KEY_RAW; + tmp = g_malloc(len); + rspamd_cryptobox_base64_decode(key, len, tmp, &len); + key = tmp; + } + + if (type == RSPAMD_DKIM_KEY_RAW && (len == 32 || + len == rspamd_cryptobox_sk_sig_bytes(RSPAMD_CRYPTOBOX_MODE_25519))) { + if (len == 32) { + /* Seeded key, need scalarmult */ + unsigned char pk[32]; + nkey->type = RSPAMD_DKIM_KEY_EDDSA; + nkey->key.key_eddsa = g_malloc( + rspamd_cryptobox_sk_sig_bytes(RSPAMD_CRYPTOBOX_MODE_25519)); + crypto_sign_ed25519_seed_keypair(pk, nkey->key.key_eddsa, key); + nkey->keylen = rspamd_cryptobox_sk_sig_bytes(RSPAMD_CRYPTOBOX_MODE_25519); + } + else { + /* Full ed25519 key */ + unsigned klen = rspamd_cryptobox_sk_sig_bytes(RSPAMD_CRYPTOBOX_MODE_25519); + nkey->type = RSPAMD_DKIM_KEY_EDDSA; + nkey->key.key_eddsa = g_malloc(klen); + memcpy(nkey->key.key_eddsa, key, klen); + nkey->keylen = klen; + } + } + else { + nkey->key_bio = BIO_new_mem_buf(key, len); + + if (type == RSPAMD_DKIM_KEY_RAW) { + if (d2i_PrivateKey_bio(nkey->key_bio, &nkey->key_evp) == NULL) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "cannot parse raw private key: %s", + ERR_error_string(ERR_get_error(), NULL)); + + rspamd_dkim_sign_key_free(nkey); + nkey = 
NULL; + + goto end; + } + } + else { + if (!PEM_read_bio_PrivateKey(nkey->key_bio, &nkey->key_evp, NULL, NULL)) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "cannot parse pem private key: %s", + ERR_error_string(ERR_get_error(), NULL)); + rspamd_dkim_sign_key_free(nkey); + nkey = NULL; + + goto end; + } + } + nkey->key.key_rsa = EVP_PKEY_get1_RSA(nkey->key_evp); + if (nkey->key.key_rsa == NULL) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "cannot extract rsa key from evp key"); + rspamd_dkim_sign_key_free(nkey); + nkey = NULL; + + goto end; + } + nkey->type = RSPAMD_DKIM_KEY_RSA; + } + + REF_INIT_RETAIN(nkey, rspamd_dkim_sign_key_free); + +end: + + if (map != NULL) { + munmap(map, maplen); + } + + if (tmp != NULL) { + rspamd_explicit_memzero(tmp, len); + g_free(tmp); + } + + return nkey; +} + +#undef PEM_SIG + +gboolean +rspamd_dkim_sign_key_maybe_invalidate(rspamd_dkim_sign_key_t *key, time_t mtime) +{ + if (mtime > key->mtime) { + return TRUE; + } + return FALSE; +} + +rspamd_dkim_sign_context_t * +rspamd_create_dkim_sign_context(struct rspamd_task *task, + rspamd_dkim_sign_key_t *priv_key, + gint headers_canon, + gint body_canon, + const gchar *headers, + enum rspamd_dkim_type type, + GError **err) +{ + rspamd_dkim_sign_context_t *nctx; + + if (headers_canon != DKIM_CANON_SIMPLE && headers_canon != DKIM_CANON_RELAXED) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_HC, + "bad headers canonicalisation"); + + return NULL; + } + if (body_canon != DKIM_CANON_SIMPLE && body_canon != DKIM_CANON_RELAXED) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_INVALID_BC, + "bad body canonicalisation"); + + return NULL; + } + + if (!priv_key || (!priv_key->key.key_rsa && !priv_key->key.key_eddsa)) { + g_set_error(err, + DKIM_ERROR, + DKIM_SIGERROR_KEYFAIL, + "bad key to sign"); + + return NULL; + } + + nctx = rspamd_mempool_alloc0(task->task_pool, sizeof(*nctx)); + nctx->common.pool = task->task_pool; + 
nctx->common.header_canon_type = headers_canon; + nctx->common.body_canon_type = body_canon; + nctx->common.type = type; + nctx->common.is_sign = TRUE; + + if (type != RSPAMD_DKIM_ARC_SEAL) { + if (!rspamd_dkim_parse_hdrlist_common(&nctx->common, headers, + strlen(headers), TRUE, + err)) { + return NULL; + } + } + else { + rspamd_dkim_add_arc_seal_headers(task->task_pool, &nctx->common); + } + + nctx->key = rspamd_dkim_sign_key_ref(priv_key); + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_dkim_sign_key_unref, priv_key); + +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + nctx->common.body_hash = EVP_MD_CTX_create(); + EVP_DigestInit_ex(nctx->common.body_hash, EVP_sha256(), NULL); + nctx->common.headers_hash = EVP_MD_CTX_create(); + EVP_DigestInit_ex(nctx->common.headers_hash, EVP_sha256(), NULL); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_destroy, nctx->common.body_hash); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_destroy, nctx->common.headers_hash); +#else + nctx->common.body_hash = EVP_MD_CTX_new(); + EVP_DigestInit_ex(nctx->common.body_hash, EVP_sha256(), NULL); + nctx->common.headers_hash = EVP_MD_CTX_new(); + EVP_DigestInit_ex(nctx->common.headers_hash, EVP_sha256(), NULL); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_free, nctx->common.body_hash); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) EVP_MD_CTX_free, nctx->common.headers_hash); +#endif + + return nctx; +} + + +GString * +rspamd_dkim_sign(struct rspamd_task *task, const gchar *selector, + const gchar *domain, time_t expire, gsize len, guint idx, + const gchar *arc_cv, rspamd_dkim_sign_context_t *ctx) +{ + GString *hdr; + struct rspamd_dkim_header *dh; + const gchar *body_end, *body_start, *hname; + guchar raw_digest[EVP_MAX_MD_SIZE]; + struct 
rspamd_dkim_cached_hash *cached_bh = NULL; + gsize dlen = 0; + guint i, j; + gchar *b64_data; + guchar *sig_buf; + guint sig_len; + guint headers_len = 0, cur_len = 0; + union rspamd_dkim_header_stat hstat; + + g_assert(ctx != NULL); + + /* First of all find place of body */ + body_end = task->msg.begin + task->msg.len; + body_start = MESSAGE_FIELD(task, raw_headers_content).body_start; + + if (len > 0) { + ctx->common.len = len; + } + + if (!body_start) { + return NULL; + } + + /* Start canonization of body part */ + if (ctx->common.type != RSPAMD_DKIM_ARC_SEAL) { + dlen = EVP_MD_CTX_size(ctx->common.body_hash); + cached_bh = rspamd_dkim_check_bh_cached(&ctx->common, task, + dlen, TRUE); + + if (!cached_bh->digest_normal) { + /* Start canonization of body part */ + if (!rspamd_dkim_canonize_body(&ctx->common, body_start, body_end, + TRUE)) { + return NULL; + } + } + } + + hdr = g_string_sized_new(255); + + if (ctx->common.type == RSPAMD_DKIM_NORMAL) { + rspamd_printf_gstring(hdr, "v=1; a=%s; c=%s/%s; d=%s; s=%s; ", + ctx->key->type == RSPAMD_DKIM_KEY_RSA ? "rsa-sha256" : "ed25519-sha256", + ctx->common.header_canon_type == DKIM_CANON_RELAXED ? "relaxed" : "simple", + ctx->common.body_canon_type == DKIM_CANON_RELAXED ? "relaxed" : "simple", + domain, selector); + } + else if (ctx->common.type == RSPAMD_DKIM_ARC_SIG) { + rspamd_printf_gstring(hdr, "i=%d; a=%s; c=%s/%s; d=%s; s=%s; ", + idx, + ctx->key->type == RSPAMD_DKIM_KEY_RSA ? "rsa-sha256" : "ed25519-sha256", + ctx->common.header_canon_type == DKIM_CANON_RELAXED ? "relaxed" : "simple", + ctx->common.body_canon_type == DKIM_CANON_RELAXED ? "relaxed" : "simple", + domain, selector); + } + else { + g_assert(arc_cv != NULL); + rspamd_printf_gstring(hdr, "i=%d; a=%s; d=%s; s=%s; cv=%s; ", + idx, + ctx->key->type == RSPAMD_DKIM_KEY_RSA ? 
"rsa-sha256" : "ed25519-sha256", + domain, + selector, + arc_cv); + } + + if (expire > 0) { + rspamd_printf_gstring(hdr, "x=%t; ", expire); + } + + if (ctx->common.type != RSPAMD_DKIM_ARC_SEAL) { + if (len > 0) { + rspamd_printf_gstring(hdr, "l=%z; ", len); + } + } + + rspamd_printf_gstring(hdr, "t=%t; h=", time(NULL)); + + /* Now canonize headers */ + for (i = 0; i < ctx->common.hlist->len; i++) { + struct rspamd_mime_header *rh, *cur; + + dh = g_ptr_array_index(ctx->common.hlist, i); + + /* We allow oversigning if dh->count > number of headers with this name */ + hstat.n = GPOINTER_TO_UINT(g_hash_table_lookup(ctx->common.htable, dh->name)); + + if (hstat.s.flags & RSPAMD_DKIM_FLAG_OVERSIGN) { + /* Do oversigning */ + guint count = 0; + + rh = rspamd_message_get_header_array(task, dh->name, FALSE); + + if (rh) { + DL_FOREACH(rh, cur) + { + /* Sign all existing headers */ + rspamd_dkim_canonize_header(&ctx->common, task, dh->name, + count, + NULL, NULL); + count++; + } + } + + /* Now add one more entry to oversign */ + if (count > 0 || !(hstat.s.flags & RSPAMD_DKIM_FLAG_OVERSIGN_EXISTING)) { + cur_len = (strlen(dh->name) + 1) * (count + 1); + headers_len += cur_len; + + if (headers_len > 70 && i > 0 && i < ctx->common.hlist->len - 1) { + rspamd_printf_gstring(hdr, " "); + headers_len = cur_len; + } + + for (j = 0; j < count + 1; j++) { + rspamd_printf_gstring(hdr, "%s:", dh->name); + } + } + } + else { + rh = rspamd_message_get_header_array(task, dh->name, FALSE); + + if (rh) { + if (hstat.s.count > 0) { + + cur_len = (strlen(dh->name) + 1) * (hstat.s.count); + headers_len += cur_len; + if (headers_len > 70 && i > 0 && i < ctx->common.hlist->len - 1) { + rspamd_printf_gstring(hdr, " "); + headers_len = cur_len; + } + + for (j = 0; j < hstat.s.count; j++) { + rspamd_printf_gstring(hdr, "%s:", dh->name); + } + } + + + rspamd_dkim_canonize_header(&ctx->common, task, + dh->name, dh->count, + NULL, NULL); + } + } + + g_hash_table_remove(ctx->common.htable, dh->name); + 
} + + /* Replace the last ':' with ';' */ + hdr->str[hdr->len - 1] = ';'; + + if (ctx->common.type != RSPAMD_DKIM_ARC_SEAL) { + if (!cached_bh->digest_normal) { + EVP_DigestFinal_ex(ctx->common.body_hash, raw_digest, NULL); + cached_bh->digest_normal = rspamd_mempool_alloc(task->task_pool, + sizeof(raw_digest)); + memcpy(cached_bh->digest_normal, raw_digest, sizeof(raw_digest)); + } + + + b64_data = rspamd_encode_base64(cached_bh->digest_normal, dlen, 0, NULL); + rspamd_printf_gstring(hdr, " bh=%s; b=", b64_data); + g_free(b64_data); + } + else { + rspamd_printf_gstring(hdr, " b="); + } + + switch (ctx->common.type) { + case RSPAMD_DKIM_NORMAL: + default: + hname = RSPAMD_DKIM_SIGNHEADER; + break; + case RSPAMD_DKIM_ARC_SIG: + hname = RSPAMD_DKIM_ARC_SIGNHEADER; + break; + case RSPAMD_DKIM_ARC_SEAL: + hname = RSPAMD_DKIM_ARC_SEALHEADER; + break; + } + + if (ctx->common.header_canon_type == DKIM_CANON_RELAXED) { + if (!rspamd_dkim_canonize_header_relaxed(&ctx->common, + hdr->str, + hname, + TRUE, + 0, + ctx->common.type == RSPAMD_DKIM_ARC_SEAL)) { + + g_string_free(hdr, TRUE); + return NULL; + } + } + else { + /* Will likely have issues with folding */ + rspamd_dkim_hash_update(ctx->common.headers_hash, hdr->str, + hdr->len); + ctx->common.headers_canonicalised += hdr->len; + msg_debug_task("update signature with header: %*s", + (gint) hdr->len, hdr->str); + } + + dlen = EVP_MD_CTX_size(ctx->common.headers_hash); + EVP_DigestFinal_ex(ctx->common.headers_hash, raw_digest, NULL); + if (ctx->key->type == RSPAMD_DKIM_KEY_RSA) { + sig_len = RSA_size(ctx->key->key.key_rsa); + sig_buf = g_alloca(sig_len); + + if (RSA_sign(NID_sha256, raw_digest, dlen, sig_buf, &sig_len, + ctx->key->key.key_rsa) != 1) { + g_string_free(hdr, TRUE); + msg_err_task("rsa sign error: %s", + ERR_error_string(ERR_get_error(), NULL)); + + return NULL; + } + } + else if (ctx->key->type == RSPAMD_DKIM_KEY_EDDSA) { + sig_len = rspamd_cryptobox_signature_bytes(RSPAMD_CRYPTOBOX_MODE_25519); + sig_buf = 
g_alloca(sig_len); + + rspamd_cryptobox_sign(sig_buf, NULL, raw_digest, dlen, + ctx->key->key.key_eddsa, RSPAMD_CRYPTOBOX_MODE_25519); + } + else { + g_string_free(hdr, TRUE); + msg_err_task("unsupported key type for signing"); + + return NULL; + } + + if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_MILTER) { + b64_data = rspamd_encode_base64_fold(sig_buf, sig_len, 70, NULL, + RSPAMD_TASK_NEWLINES_LF); + } + else { + b64_data = rspamd_encode_base64_fold(sig_buf, sig_len, 70, NULL, + MESSAGE_FIELD(task, nlines_type)); + } + + rspamd_printf_gstring(hdr, "%s", b64_data); + g_free(b64_data); + + return hdr; +} + +gboolean +rspamd_dkim_match_keys(rspamd_dkim_key_t *pk, + rspamd_dkim_sign_key_t *sk, + GError **err) +{ + if (pk == NULL || sk == NULL) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "missing public or private key"); + return FALSE; + } + if (pk->type != sk->type) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYFAIL, + "public and private key types do not match"); + return FALSE; + } + + if (pk->type == RSPAMD_DKIM_KEY_EDDSA) { + if (memcmp(sk->key.key_eddsa + 32, pk->key.key_eddsa, 32) != 0) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYHASHMISMATCH, + "pubkey does not match private key"); + return FALSE; + } + } + else if (EVP_PKEY_cmp(pk->key_evp, sk->key_evp) != 1) { + g_set_error(err, dkim_error_quark(), DKIM_SIGERROR_KEYHASHMISMATCH, + "pubkey does not match private key"); + return FALSE; + } + + return TRUE; +} diff --git a/src/libserver/dkim.h b/src/libserver/dkim.h new file mode 100644 index 0000000..50703da --- /dev/null +++ b/src/libserver/dkim.h @@ -0,0 +1,298 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DKIM_H_ +#define DKIM_H_ + +#include "config.h" +#include "contrib/libev/ev.h" +#include "dns.h" +#include "ref.h" + + +/* Main types and definitions */ + +#define RSPAMD_DKIM_SIGNHEADER "DKIM-Signature" +#define RSPAMD_DKIM_ARC_SIGNHEADER "ARC-Message-Signature" +#define RSPAMD_DKIM_ARC_AUTHHEADER "ARC-Authentication-Results" +#define RSPAMD_DKIM_ARC_SEALHEADER "ARC-Seal" +/* DKIM signature header */ + + +/* Errors (from OpenDKIM) */ + +#define DKIM_SIGERROR_UNKNOWN (-1) /* unknown error */ +#define DKIM_SIGERROR_VERSION 1 /* unsupported version */ +#define DKIM_SIGERROR_EXPIRED 3 /* signature expired */ +#define DKIM_SIGERROR_FUTURE 4 /* signature in the future */ +#define DKIM_SIGERROR_NOREC 6 /* No record */ +#define DKIM_SIGERROR_INVALID_HC 7 /* c= invalid (header) */ +#define DKIM_SIGERROR_INVALID_BC 8 /* c= invalid (body) */ +#define DKIM_SIGERROR_INVALID_A 10 /* a= invalid */ +#define DKIM_SIGERROR_INVALID_L 12 /* l= invalid */ +#define DKIM_SIGERROR_EMPTY_D 16 /* d= empty */ +#define DKIM_SIGERROR_EMPTY_S 18 /* s= empty */ +#define DKIM_SIGERROR_EMPTY_B 20 /* b= empty */ +#define DKIM_SIGERROR_NOKEY 22 /* no key found in DNS */ +#define DKIM_SIGERROR_KEYFAIL 24 /* DNS query failed */ +#define DKIM_SIGERROR_EMPTY_BH 26 /* bh= empty */ +#define DKIM_SIGERROR_BADSIG 28 /* signature mismatch */ +#define DKIM_SIGERROR_EMPTY_H 31 /* h= empty */ +#define DKIM_SIGERROR_INVALID_H 32 /* h= missing req'd entries */ +#define DKIM_SIGERROR_KEYHASHMISMATCH 37 /* sig-key hash mismatch */ +#define DKIM_SIGERROR_EMPTY_V 45 /* v= tag empty */ 
+ +#ifdef __cplusplus +extern "C" { +#endif + +/* Check results */ +enum rspamd_dkim_check_rcode { + DKIM_CONTINUE = 0, + DKIM_REJECT, + DKIM_TRYAGAIN, + DKIM_NOTFOUND, + DKIM_RECORD_ERROR, + DKIM_PERM_ERROR, +}; + +#define DKIM_CANON_SIMPLE 0 /* as specified in DKIM spec */ +#define DKIM_CANON_RELAXED 1 /* as specified in DKIM spec */ + +struct rspamd_dkim_context_s; +typedef struct rspamd_dkim_context_s rspamd_dkim_context_t; + +struct rspamd_dkim_sign_context_s; +typedef struct rspamd_dkim_sign_context_s rspamd_dkim_sign_context_t; + +struct rspamd_dkim_key_s; +typedef struct rspamd_dkim_key_s rspamd_dkim_key_t; +typedef struct rspamd_dkim_key_s rspamd_dkim_sign_key_t; + +struct rspamd_task; + +enum rspamd_dkim_key_format { + RSPAMD_DKIM_KEY_FILE = 0, + RSPAMD_DKIM_KEY_PEM, + RSPAMD_DKIM_KEY_BASE64, + RSPAMD_DKIM_KEY_RAW, + RSPAMD_DKIM_KEY_UNKNOWN +}; + +enum rspamd_dkim_type { + RSPAMD_DKIM_NORMAL, + RSPAMD_DKIM_ARC_SIG, + RSPAMD_DKIM_ARC_SEAL +}; + +/* Signature methods */ +enum rspamd_sign_type { + DKIM_SIGN_UNKNOWN = -2, + DKIM_SIGN_RSASHA1 = 0, + DKIM_SIGN_RSASHA256, + DKIM_SIGN_RSASHA512, + DKIM_SIGN_ECDSASHA256, + DKIM_SIGN_ECDSASHA512, + DKIM_SIGN_EDDSASHA256, +}; + +enum rspamd_dkim_key_type { + RSPAMD_DKIM_KEY_RSA = 0, + RSPAMD_DKIM_KEY_ECDSA, + RSPAMD_DKIM_KEY_EDDSA +}; + +struct rspamd_dkim_check_result { + enum rspamd_dkim_check_rcode rcode; + rspamd_dkim_context_t *ctx; + /* Processed parts */ + const gchar *selector; + const gchar *domain; + const gchar *short_b; + const gchar *fail_reason; +}; + + +/* Err MUST be freed if it is not NULL, key is allocated by slice allocator */ +typedef void (*dkim_key_handler_f)(rspamd_dkim_key_t *key, gsize keylen, + rspamd_dkim_context_t *ctx, gpointer ud, GError *err); + +/** + * Create new dkim context from signature + * @param sig message's signature + * @param pool pool to allocate memory from + * @param time_jitter jitter in seconds to allow time diff while checking + * @param err pointer to error object + 
* @return new context or NULL + */ +rspamd_dkim_context_t *rspamd_create_dkim_context(const gchar *sig, + rspamd_mempool_t *pool, + struct rspamd_dns_resolver *resolver, + guint time_jitter, + enum rspamd_dkim_type type, + GError **err); + +/** + * Create new dkim context for making a signature + * @param task + * @param priv_key + * @param err + * @return + */ +rspamd_dkim_sign_context_t *rspamd_create_dkim_sign_context(struct rspamd_task *task, + rspamd_dkim_sign_key_t *priv_key, + gint headers_canon, + gint body_canon, + const gchar *dkim_headers, + enum rspamd_dkim_type type, + GError **err); + +/** + * Load dkim key + * @param path + * @param err + * @return + */ +rspamd_dkim_sign_key_t *rspamd_dkim_sign_key_load(const gchar *what, gsize len, + enum rspamd_dkim_key_format type, + GError **err); + +/** + * Invalidate modified sign key + * @param key + * @return +*/ +gboolean rspamd_dkim_sign_key_maybe_invalidate(rspamd_dkim_sign_key_t *key, + time_t mtime); + +/** + * Make DNS request for specified context and obtain and parse key + * @param ctx dkim context from signature + * @param resolver dns resolver object + * @param s async session to make request + * @return + */ +gboolean rspamd_get_dkim_key(rspamd_dkim_context_t *ctx, + struct rspamd_task *task, + dkim_key_handler_f handler, + gpointer ud); + +/** + * Check task for dkim context using dkim key + * @param ctx dkim verify context + * @param key dkim key (from cache or from dns request) + * @param task task to check + * @return + */ +struct rspamd_dkim_check_result *rspamd_dkim_check(rspamd_dkim_context_t *ctx, + rspamd_dkim_key_t *key, + struct rspamd_task *task); + +struct rspamd_dkim_check_result * +rspamd_dkim_create_result(rspamd_dkim_context_t *ctx, + enum rspamd_dkim_check_rcode rcode, + struct rspamd_task *task); + +GString *rspamd_dkim_sign(struct rspamd_task *task, + const gchar *selector, + const gchar *domain, + time_t expire, + gsize len, + guint idx, + const gchar *arc_cv, + 
rspamd_dkim_sign_context_t *ctx); + +rspamd_dkim_key_t *rspamd_dkim_key_ref(rspamd_dkim_key_t *k); + +void rspamd_dkim_key_unref(rspamd_dkim_key_t *k); + +rspamd_dkim_sign_key_t *rspamd_dkim_sign_key_ref(rspamd_dkim_sign_key_t *k); + +void rspamd_dkim_sign_key_unref(rspamd_dkim_sign_key_t *k); + +const gchar *rspamd_dkim_get_domain(rspamd_dkim_context_t *ctx); + +const gchar *rspamd_dkim_get_selector(rspamd_dkim_context_t *ctx); + +const gchar *rspamd_dkim_get_dns_key(rspamd_dkim_context_t *ctx); + +guint rspamd_dkim_key_get_ttl(rspamd_dkim_key_t *k); + +/** + * Create DKIM public key from a raw data + * @param keydata + * @param keylen + * @param type + * @param err + * @return + */ +rspamd_dkim_key_t *rspamd_dkim_make_key(const gchar *keydata, guint keylen, + enum rspamd_dkim_key_type type, + GError **err); + +#define RSPAMD_DKIM_KEY_ID_LEN 16 +/** + * Returns key id for dkim key (raw md5 of RSPAMD_DKIM_KEY_ID_LEN) + * NOT ZERO TERMINATED, use RSPAMD_DKIM_KEY_ID_LEN for length + * @param key + * @return + */ +const guchar *rspamd_dkim_key_id(rspamd_dkim_key_t *key); + +/** + * Parse DKIM public key from a TXT record + * @param txt + * @param keylen + * @param err + * @return + */ +rspamd_dkim_key_t *rspamd_dkim_parse_key(const gchar *txt, gsize *keylen, + GError **err); + +/** + * Canonicalise header using relaxed algorithm + * @param hname + * @param hvalue + * @param out + * @param outlen + * @return + */ +goffset rspamd_dkim_canonize_header_relaxed_str(const gchar *hname, + const gchar *hvalue, + gchar *out, + gsize outlen); + +/** + * Checks public and private keys for match + * @param pk + * @param sk + * @param err + * @return + */ +gboolean rspamd_dkim_match_keys(rspamd_dkim_key_t *pk, + rspamd_dkim_sign_key_t *sk, + GError **err); + +/** + * Free DKIM key + * @param key + */ +void rspamd_dkim_key_free(rspamd_dkim_key_t *key); + +#ifdef __cplusplus +} +#endif + +#endif /* DKIM_H_ */ diff --git a/src/libserver/dns.c b/src/libserver/dns.c new file mode 
100644 index 0000000..be2d5a3 --- /dev/null +++ b/src/libserver/dns.c @@ -0,0 +1,1124 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "contrib/librdns/rdns.h" +#include "config.h" +#include "dns.h" +#include "rspamd.h" +#include "utlist.h" +#include "contrib/libev/ev.h" +#include "contrib/librdns/rdns.h" +#include "contrib/librdns/dns_private.h" +#include "contrib/librdns/rdns_ev.h" +#include "unix-std.h" + +#include <unicode/uidna.h> + +static const gchar *M = "rspamd dns"; + +static struct rdns_upstream_elt *rspamd_dns_select_upstream(const char *name, + size_t len, void *ups_data); +static struct rdns_upstream_elt *rspamd_dns_select_upstream_retransmit( + const char *name, + size_t len, + struct rdns_upstream_elt *prev_elt, + void *ups_data); +static void rspamd_dns_upstream_ok(struct rdns_upstream_elt *elt, + void *ups_data); +static void rspamd_dns_upstream_fail(struct rdns_upstream_elt *elt, + void *ups_data, const gchar *reason); +static unsigned int rspamd_dns_upstream_count(void *ups_data); + +static struct rdns_upstream_context rspamd_ups_ctx = { + .select = rspamd_dns_select_upstream, + .select_retransmit = rspamd_dns_select_upstream_retransmit, + .ok = rspamd_dns_upstream_ok, + .fail = rspamd_dns_upstream_fail, + .count = rspamd_dns_upstream_count, + .data = NULL}; + +struct rspamd_dns_request_ud { + struct rspamd_async_session *session; + dns_callback_type cb; + gpointer ud; + 
rspamd_mempool_t *pool; + struct rspamd_task *task; + struct rspamd_symcache_dynamic_item *item; + struct rdns_request *req; + struct rdns_reply *reply; +}; + +struct rspamd_dns_fail_cache_entry { + const char *name; + gint32 namelen; + enum rdns_request_type type; +}; + +static const gint8 ascii_dns_table[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + /* HYPHEN-MINUS..FULL STOP */ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, + /* 0..9 digits */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, + /* LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z */ + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* _ */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 1, + /* LATIN SMALL LETTER A..LATIN SMALL LETTER Z */ + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1}; + +static guint +rspamd_dns_fail_hash(gconstpointer ptr) +{ + struct rspamd_dns_fail_cache_entry *elt = + (struct rspamd_dns_fail_cache_entry *) ptr; + + /* We don't care about type when doing hashing */ + return rspamd_cryptobox_fast_hash(elt->name, elt->namelen, + rspamd_hash_seed()); +} + +static gboolean +rspamd_dns_fail_equal(gconstpointer p1, gconstpointer p2) +{ + struct rspamd_dns_fail_cache_entry *e1 = (struct rspamd_dns_fail_cache_entry *) p1, + *e2 = (struct rspamd_dns_fail_cache_entry *) p2; + + if (e1->type == e2->type && e1->namelen == e2->namelen) { + return memcmp(e1->name, e2->name, e1->namelen) == 0; + } + + return FALSE; +} + +static void +rspamd_dns_fin_cb(gpointer arg) +{ + struct rspamd_dns_request_ud *reqdata = (struct rspamd_dns_request_ud *) arg; + + if (reqdata->item) { + rspamd_symcache_set_cur_item(reqdata->task, reqdata->item); + } + + if (reqdata->reply) { + reqdata->cb(reqdata->reply, reqdata->ud); + } + else { + struct rdns_reply fake_reply; + + memset(&fake_reply, 0, sizeof(fake_reply)); + 
fake_reply.code = RDNS_RC_TIMEOUT; + fake_reply.request = reqdata->req; + fake_reply.resolver = reqdata->req->resolver; + fake_reply.requested_name = reqdata->req->requested_names[0].name; + + reqdata->cb(&fake_reply, reqdata->ud); + } + + rdns_request_release(reqdata->req); + + if (reqdata->item) { + rspamd_symcache_item_async_dec_check(reqdata->task, + reqdata->item, M); + } + + if (reqdata->pool == NULL) { + g_free(reqdata); + } +} + +static void +rspamd_dns_callback(struct rdns_reply *reply, gpointer ud) +{ + struct rspamd_dns_request_ud *reqdata = ud; + + reqdata->reply = reply; + + + if (reqdata->session) { + if (reply->code == RDNS_RC_SERVFAIL && + reqdata->task && + reqdata->task->resolver->fails_cache) { + + /* Add to cache... */ + const gchar *name = reqdata->req->requested_names[0].name; + gchar *target; + gsize namelen; + struct rspamd_dns_fail_cache_entry *nentry; + + /* Allocate in a single entry to allow further free in a single call */ + namelen = strlen(name); + nentry = g_malloc(sizeof(nentry) + namelen + 1); + target = ((gchar *) nentry) + sizeof(nentry); + rspamd_strlcpy(target, name, namelen + 1); + nentry->type = reqdata->req->requested_names[0].type; + nentry->name = target; + nentry->namelen = namelen; + + /* Rdns request is retained there */ + rspamd_lru_hash_insert(reqdata->task->resolver->fails_cache, + nentry, rdns_request_retain(reply->request), + reqdata->task->task_timestamp, + reqdata->task->resolver->fails_cache_time); + } + + /* + * Ref event to avoid double unref by + * event removing + */ + rdns_request_retain(reply->request); + rspamd_session_remove_event(reqdata->session, + rspamd_dns_fin_cb, reqdata); + } + else { + reqdata->cb(reply, reqdata->ud); + + if (reqdata->pool == NULL) { + g_free(reqdata); + } + } +} + +struct rspamd_dns_request_ud * +rspamd_dns_resolver_request(struct rspamd_dns_resolver *resolver, + struct rspamd_async_session *session, + rspamd_mempool_t *pool, + dns_callback_type cb, + gpointer ud, + enum 
rdns_request_type type, + const char *name) +{ + struct rdns_request *req; + struct rspamd_dns_request_ud *reqdata = NULL; + guint nlen = strlen(name); + gchar *real_name = NULL; + + g_assert(resolver != NULL); + + if (resolver->r == NULL) { + return NULL; + } + + if (nlen == 0 || nlen > DNS_D_MAXNAME) { + return NULL; + } + + if (session && rspamd_session_blocked(session)) { + return NULL; + } + + if (rspamd_str_has_8bit(name, nlen)) { + /* Convert to idna using libicu as it follows all the standards */ + real_name = rspamd_dns_resolver_idna_convert_utf8(resolver, pool, + name, nlen, &nlen); + + if (real_name == NULL) { + return NULL; + } + + name = real_name; + } + + /* Name is now in ASCII only */ + for (gsize i = 0; i < nlen; i++) { + if (ascii_dns_table[((unsigned int) name[i]) & 0x7F] == -1) { + /* Invalid DNS name requested */ + + if (!pool) { + g_free(real_name); + } + + return NULL; + } + } + + if (pool != NULL) { + reqdata = + rspamd_mempool_alloc0(pool, sizeof(struct rspamd_dns_request_ud)); + } + else { + reqdata = g_malloc0(sizeof(struct rspamd_dns_request_ud)); + } + + reqdata->pool = pool; + reqdata->session = session; + reqdata->cb = cb; + reqdata->ud = ud; + + req = rdns_make_request_full(resolver->r, rspamd_dns_callback, reqdata, + resolver->request_timeout, resolver->max_retransmits, 1, name, + type); + reqdata->req = req; + + if (session) { + if (req != NULL) { + rspamd_session_add_event(session, + (event_finalizer_t) rspamd_dns_fin_cb, + reqdata, + M); + } + } + + if (req == NULL) { + if (pool == NULL) { + g_free(reqdata); + g_free(real_name); + } + + return NULL; + } + + if (real_name && pool == NULL) { + g_free(real_name); + } + + return reqdata; +} + +struct rspamd_dns_cached_delayed_cbdata { + struct rspamd_task *task; + dns_callback_type cb; + gpointer ud; + ev_timer tm; + struct rdns_request *req; +}; + +static void +rspamd_fail_cache_cb(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_dns_cached_delayed_cbdata *cbd = + (struct 
rspamd_dns_cached_delayed_cbdata *) w->data; + struct rdns_reply fake_reply; + + ev_timer_stop(EV_A_ w); + memset(&fake_reply, 0, sizeof(fake_reply)); + fake_reply.code = RDNS_RC_SERVFAIL; + fake_reply.request = cbd->req; + fake_reply.resolver = cbd->req->resolver; + fake_reply.requested_name = cbd->req->requested_names[0].name; + cbd->cb(&fake_reply, cbd->ud); + rdns_request_release(cbd->req); +} + +static gboolean +make_dns_request_task_common(struct rspamd_task *task, + dns_callback_type cb, + gpointer ud, + enum rdns_request_type type, + const char *name, + gboolean forced) +{ + struct rspamd_dns_request_ud *reqdata; + + if (!forced && task->dns_requests >= task->cfg->dns_max_requests) { + return FALSE; + } + + if (task->resolver->fails_cache) { + /* Search in failures cache */ + struct rspamd_dns_fail_cache_entry search; + struct rdns_request *req; + + search.name = name; + search.namelen = strlen(name); + search.type = type; + + if ((req = rspamd_lru_hash_lookup(task->resolver->fails_cache, + &search, task->task_timestamp)) != NULL) { + /* + * We need to reply with SERVFAIL again to the API, so add a special + * timer, uh-oh, and fire it + */ + struct rspamd_dns_cached_delayed_cbdata *cbd = + rspamd_mempool_alloc0(task->task_pool, sizeof(*cbd)); + + ev_timer_init(&cbd->tm, rspamd_fail_cache_cb, 0.0, 0.0); + cbd->task = task; + cbd->cb = cb; + cbd->ud = ud; + cbd->req = rdns_request_retain(req); + cbd->tm.data = cbd; + + return TRUE; + } + } + + reqdata = rspamd_dns_resolver_request( + task->resolver, task->s, task->task_pool, cb, ud, + type, name); + + if (reqdata) { + task->dns_requests++; + + reqdata->task = task; + reqdata->item = rspamd_symcache_get_cur_item(task); + + if (reqdata->item) { + /* We are inside some session */ + rspamd_symcache_item_async_inc(task, reqdata->item, M); + } + + if (!forced && task->dns_requests >= task->cfg->dns_max_requests) { + msg_info_task("stop resolving on reaching %ud requests", + task->dns_requests); + } + + return 
TRUE; + } + + return FALSE; +} + +gboolean +rspamd_dns_resolver_request_task(struct rspamd_task *task, + dns_callback_type cb, + gpointer ud, + enum rdns_request_type type, + const char *name) +{ + return make_dns_request_task_common(task, cb, ud, type, name, FALSE); +} + +gboolean +rspamd_dns_resolver_request_task_forced(struct rspamd_task *task, + dns_callback_type cb, + gpointer ud, + enum rdns_request_type type, + const char *name) +{ + return make_dns_request_task_common(task, cb, ud, type, name, TRUE); +} + +static void rspamd_rnds_log_bridge( + void *log_data, + enum rdns_log_level level, + const char *function, + const char *format, + va_list args) +{ + rspamd_logger_t *logger = log_data; + + rspamd_common_logv(logger, (GLogLevelFlags) level, "rdns", NULL, + function, format, args); +} + +static void +rspamd_dns_server_init(struct upstream *up, guint idx, gpointer ud) +{ + struct rspamd_dns_resolver *r = ud; + rspamd_inet_addr_t *addr; + void *serv; + struct rdns_upstream_elt *elt; + + addr = rspamd_upstream_addr_next(up); + + if (r->cfg) { + serv = rdns_resolver_add_server(r->r, rspamd_inet_address_to_string(addr), + rspamd_inet_address_get_port(addr), 0, r->cfg->dns_io_per_server); + + elt = rspamd_mempool_alloc0(r->cfg->cfg_pool, sizeof(*elt)); + elt->server = serv; + elt->lib_data = up; + + rspamd_upstream_set_data(up, elt); + } + else { + serv = rdns_resolver_add_server(r->r, rspamd_inet_address_to_string(addr), + rspamd_inet_address_get_port(addr), 0, 8); + } + + g_assert(serv != NULL); +} + +static void +rspamd_dns_server_reorder(struct upstream *up, guint idx, gpointer ud) +{ + struct rspamd_dns_resolver *r = ud; + + rspamd_upstream_set_weight(up, rspamd_upstreams_count(r->ups) - idx + 1); +} + +static bool +rspamd_dns_resolv_conf_on_server(struct rdns_resolver *resolver, + const char *name, unsigned int port, + int priority, unsigned int io_cnt, void *ud) +{ + struct rspamd_dns_resolver *dns_resolver = ud; + struct rspamd_config *cfg; + 
rspamd_inet_addr_t *addr; + gint test_fd; + + cfg = dns_resolver->cfg; + + msg_info_config("parsed nameserver %s from resolv.conf", name); + + /* Try to open a connection */ + if (!rspamd_parse_inet_address(&addr, name, strlen(name), + RSPAMD_INET_ADDRESS_PARSE_DEFAULT)) { + msg_warn_config("cannot parse nameserver address %s", name); + + return FALSE; + } + + rspamd_inet_address_set_port(addr, port); + test_fd = rspamd_inet_address_connect(addr, SOCK_DGRAM, TRUE); + + if (test_fd == -1 && (errno != EINTR || errno != ECONNREFUSED || errno != ECONNRESET)) { + msg_info_config("cannot open connection to nameserver at address %s: %s", + name, strerror(errno)); + rspamd_inet_address_free(addr); + + return FALSE; + } + + rspamd_inet_address_free(addr); + close(test_fd); + + return rspamd_upstreams_add_upstream(dns_resolver->ups, name, port, + RSPAMD_UPSTREAM_PARSE_NAMESERVER, + NULL); +} + +static void +rspamd_process_fake_reply(struct rspamd_config *cfg, + struct rspamd_dns_resolver *dns_resolver, + const ucl_object_t *cur_arr) +{ + const ucl_object_t *cur; + ucl_object_iter_t it; + + it = ucl_object_iterate_new(cur_arr); + + while ((cur = ucl_object_iterate_safe(it, true))) { + const ucl_object_t *type_obj, *name_obj, *code_obj, *replies_obj; + enum rdns_request_type rtype = RDNS_REQUEST_A; + enum dns_rcode rcode = RDNS_RC_NOERROR; + struct rdns_reply_entry *replies = NULL; + const gchar *name = NULL; + + if (ucl_object_type(cur) != UCL_OBJECT) { + continue; + } + + name_obj = ucl_object_lookup(cur, "name"); + if (name_obj == NULL || + (name = ucl_object_tostring(name_obj)) == NULL) { + msg_err_config("no name for fake dns reply"); + continue; + } + + type_obj = ucl_object_lookup(cur, "type"); + if (type_obj) { + rtype = rdns_type_fromstr(ucl_object_tostring(type_obj)); + + if (rtype == RDNS_REQUEST_INVALID) { + msg_err_config("invalid type for %s: %s", name, + ucl_object_tostring(type_obj)); + continue; + } + } + + code_obj = ucl_object_lookup_any(cur, "code", 
"rcode", NULL); + if (code_obj) { + rcode = rdns_rcode_fromstr(ucl_object_tostring(code_obj)); + + if (rcode == RDNS_RC_INVALID) { + msg_err_config("invalid rcode for %s: %s", name, + ucl_object_tostring(code_obj)); + continue; + } + } + + if (rcode == RDNS_RC_NOERROR) { + /* We want replies to be set for this rcode */ + replies_obj = ucl_object_lookup(cur, "replies"); + + if (replies_obj == NULL || ucl_object_type(replies_obj) != UCL_ARRAY) { + msg_err_config("invalid replies for fake DNS record %s", name); + continue; + } + + ucl_object_iter_t rep_it; + const ucl_object_t *rep_obj; + + rep_it = ucl_object_iterate_new(replies_obj); + + while ((rep_obj = ucl_object_iterate_safe(rep_it, true))) { + const gchar *str_rep = ucl_object_tostring(rep_obj); + struct rdns_reply_entry *rep; + gchar **svec; + + if (str_rep == NULL) { + msg_err_config("invalid reply element for fake DNS record %s", + name); + continue; + } + + rep = calloc(1, sizeof(*rep)); + g_assert(rep != NULL); + + rep->type = rtype; + rep->ttl = 0; + + switch (rtype) { + case RDNS_REQUEST_A: + if (inet_pton(AF_INET, str_rep, &rep->content.a.addr) != 1) { + msg_err_config("invalid A reply element for fake " + "DNS record %s: %s", + name, str_rep); + free(rep); + } + else { + DL_APPEND(replies, rep); + } + break; + case RDNS_REQUEST_NS: + rep->content.ns.name = strdup(str_rep); + DL_APPEND(replies, rep); + break; + case RDNS_REQUEST_PTR: + rep->content.ptr.name = strdup(str_rep); + DL_APPEND(replies, rep); + break; + case RDNS_REQUEST_MX: + svec = g_strsplit_set(str_rep, " :", -1); + + if (svec && svec[0] && svec[1]) { + rep->content.mx.priority = strtoul(svec[0], NULL, 10); + rep->content.mx.name = strdup(svec[1]); + DL_APPEND(replies, rep); + } + else { + msg_err_config("invalid MX reply element for fake " + "DNS record %s: %s", + name, str_rep); + free(rep); + } + + g_strfreev(svec); + break; + case RDNS_REQUEST_TXT: + rep->content.txt.data = strdup(str_rep); + DL_APPEND(replies, rep); + break; + case 
RDNS_REQUEST_SOA: + svec = g_strsplit_set(str_rep, " :", -1); + + /* 7 elements */ + if (svec && svec[0] && svec[1] && svec[2] && + svec[3] && svec[4] && svec[5] && svec[6]) { + rep->content.soa.mname = strdup(svec[0]); + rep->content.soa.admin = strdup(svec[1]); + rep->content.soa.serial = strtoul(svec[2], NULL, 10); + rep->content.soa.refresh = strtol(svec[3], NULL, 10); + rep->content.soa.retry = strtol(svec[4], NULL, 10); + rep->content.soa.expire = strtol(svec[5], NULL, 10); + rep->content.soa.minimum = strtoul(svec[6], NULL, 10); + DL_APPEND(replies, rep); + } + else { + msg_err_config("invalid MX reply element for fake " + "DNS record %s: %s", + name, str_rep); + free(rep); + } + + g_strfreev(svec); + break; + case RDNS_REQUEST_AAAA: + if (inet_pton(AF_INET6, str_rep, &rep->content.aaa.addr) != 1) { + msg_err_config("invalid AAAA reply element for fake " + "DNS record %s: %s", + name, str_rep); + free(rep); + } + else { + DL_APPEND(replies, rep); + } + break; + case RDNS_REQUEST_SRV: + default: + msg_err_config("invalid or unsupported reply element " + "for fake DNS record %s(%s): %s", + name, rdns_str_from_type(rtype), str_rep); + free(rep); + break; + } + } + + ucl_object_iterate_free(rep_it); + + if (replies) { + struct rdns_reply_entry *tmp_entry; + guint i = 0; + DL_COUNT(replies, tmp_entry, i); + + msg_info_config("added fake record: %s(%s); %d replies", name, + rdns_str_from_type(rtype), i); + rdns_resolver_set_fake_reply(dns_resolver->r, + name, rtype, rcode, replies); + } + else { + msg_warn_config("record %s has no replies, not adding", + name); + } + } + else { + /* This entry returns some non valid code, no replies are possible */ + replies_obj = ucl_object_lookup(cur, "replies"); + + if (replies_obj) { + msg_warn_config("replies are set for non-successful return " + "code for %s(%s), they will be ignored", + name, rdns_str_from_type(rtype)); + } + + rdns_resolver_set_fake_reply(dns_resolver->r, + name, rtype, rcode, NULL); + } + } + + 
ucl_object_iterate_free(it); +} + +static bool +rspamd_dns_read_hosts_file(struct rspamd_config *cfg, + struct rspamd_dns_resolver *dns_resolver, + const gchar *fname) +{ + gchar *linebuf = NULL; + gsize buflen = 0; + gssize r; + FILE *fp; + guint nadded = 0; + + fp = fopen(fname, "r"); + + if (fp == NULL) { + /* Hack to reduce noise */ + if (strcmp(fname, "/etc/hosts") == 0) { + msg_info_config("cannot open hosts file %s: %s", fname, + strerror(errno)); + } + else { + msg_err_config("cannot open hosts file %s: %s", fname, + strerror(errno)); + } + + return false; + } + + while ((r = getline(&linebuf, &buflen, fp)) > 0) { + if (linebuf[0] == '#' || g_ascii_isspace(linebuf[0])) { + /* Skip comment or empty line */ + continue; + } + + g_strchomp(linebuf); + + gchar **elts = g_strsplit_set(linebuf, " \t\v", -1); + rspamd_inet_addr_t *addr; + + if (!rspamd_parse_inet_address(&addr, elts[0], strlen(elts[0]), + RSPAMD_INET_ADDRESS_PARSE_REMOTE | RSPAMD_INET_ADDRESS_PARSE_NO_UNIX)) { + msg_warn_config("bad hosts file line: %s; cannot parse address", linebuf); + } + else { + /* Add all FQDN + aliases if any */ + gchar **cur_name = &elts[1]; + + while (*cur_name) { + if (strlen(*cur_name) == 0) { + cur_name++; + continue; + } + + if (*cur_name[0] == '#') { + /* Start of the comment */ + break; + } + + struct rdns_reply_entry *rep; + rep = calloc(1, sizeof(*rep)); + g_assert(rep != NULL); + + rep->ttl = 0; + + if (rspamd_inet_address_get_af(addr) == AF_INET) { + socklen_t unused; + const struct sockaddr_in *sin = (const struct sockaddr_in *) + rspamd_inet_address_get_sa(addr, &unused); + rep->type = RDNS_REQUEST_A; + memcpy(&rep->content.a.addr, &sin->sin_addr, + sizeof(rep->content.a.addr)); + } + else { + socklen_t unused; + const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *) + rspamd_inet_address_get_sa(addr, &unused); + rep->type = RDNS_REQUEST_AAAA; + memcpy(&rep->content.aaa.addr, &sin6->sin6_addr, + sizeof(rep->content.aaa.addr)); + } + + rep->next = NULL; 
+ rep->prev = rep; + rdns_resolver_set_fake_reply(dns_resolver->r, + *cur_name, rep->type, RDNS_RC_NOERROR, rep); + msg_debug_config("added fake record %s -> %s from hosts file %s", + *cur_name, rspamd_inet_address_to_string(addr), fname); + cur_name++; + nadded++; + } + + rspamd_inet_address_free(addr); + } + + g_strfreev(elts); + } + + if (linebuf) { + free(linebuf); + } + + msg_info_config("processed host file %s; %d records added", fname, nadded); + fclose(fp); + + return true; +} + +static void +rspamd_dns_resolver_config_ucl(struct rspamd_config *cfg, + struct rspamd_dns_resolver *dns_resolver, + const ucl_object_t *dns_section) +{ + const ucl_object_t *fake_replies, *fails_cache_size, *fails_cache_time, + *hosts; + static const ev_tstamp default_fails_cache_time = 10.0; + + /* Process fake replies */ + fake_replies = ucl_object_lookup_any(dns_section, "fake_records", + "fake_replies", NULL); + + if (fake_replies && ucl_object_type(fake_replies) == UCL_ARRAY) { + const ucl_object_t *cur_arr; + + DL_FOREACH(fake_replies, cur_arr) + { + rspamd_process_fake_reply(cfg, dns_resolver, cur_arr); + } + } + + hosts = ucl_object_lookup(dns_section, "hosts"); + + if (hosts == NULL) { + /* Read normal `/etc/hosts` file */ + rspamd_dns_read_hosts_file(cfg, dns_resolver, "/etc/hosts"); + } + else if (ucl_object_type(hosts) == UCL_NULL) { + /* Do nothing, hosts are explicitly disabled */ + } + else if (ucl_object_type(hosts) == UCL_STRING) { + if (!rspamd_dns_read_hosts_file(cfg, dns_resolver, ucl_object_tostring(hosts))) { + msg_err_config("cannot read hosts file %s", ucl_object_tostring(hosts)); + } + } + else if (ucl_object_type(hosts) == UCL_ARRAY) { + const ucl_object_t *cur; + ucl_object_iter_t it = NULL; + + while ((cur = ucl_object_iterate(hosts, &it, true)) != NULL) { + if (!rspamd_dns_read_hosts_file(cfg, dns_resolver, ucl_object_tostring(cur))) { + msg_err_config("cannot read hosts file %s", ucl_object_tostring(cur)); + } + } + } + else { + 
msg_err_config("invalid type for hosts parameter: %s", + ucl_object_type_to_string(ucl_object_type(hosts))); + } + + fails_cache_size = ucl_object_lookup(dns_section, "fails_cache_size"); + if (fails_cache_size && ucl_object_type(fails_cache_size) == UCL_INT) { + + dns_resolver->fails_cache_time = default_fails_cache_time; + fails_cache_time = ucl_object_lookup(dns_section, "fails_cache_time"); + + if (fails_cache_time) { + dns_resolver->fails_cache_time = ucl_object_todouble(fails_cache_time); + } + + dns_resolver->fails_cache = rspamd_lru_hash_new_full( + ucl_object_toint(fails_cache_size), + g_free, (GDestroyNotify) rdns_request_release, + rspamd_dns_fail_hash, rspamd_dns_fail_equal); + } +} + +struct rspamd_dns_resolver * +rspamd_dns_resolver_init(rspamd_logger_t *logger, + struct ev_loop *ev_base, + struct rspamd_config *cfg) +{ + struct rspamd_dns_resolver *dns_resolver; + + dns_resolver = g_malloc0(sizeof(struct rspamd_dns_resolver)); + dns_resolver->event_loop = ev_base; + + if (cfg != NULL) { + dns_resolver->request_timeout = cfg->dns_timeout; + dns_resolver->max_retransmits = cfg->dns_retransmits; + } + else { + dns_resolver->request_timeout = 1; + dns_resolver->max_retransmits = 2; + } + + /* IDN translation is performed in Rspamd now */ + dns_resolver->r = rdns_resolver_new(RDNS_RESOLVER_NOIDN); + + UErrorCode uc_err = U_ZERO_ERROR; + + dns_resolver->uidna = uidna_openUTS46(UIDNA_DEFAULT, &uc_err); + g_assert(!U_FAILURE(uc_err)); + rdns_bind_libev(dns_resolver->r, dns_resolver->event_loop); + + if (cfg != NULL) { + rdns_resolver_set_log_level(dns_resolver->r, cfg->log_level); + dns_resolver->cfg = cfg; + rdns_resolver_set_dnssec(dns_resolver->r, cfg->enable_dnssec); + + if (cfg->nameservers == NULL) { + /* Parse resolv.conf */ + dns_resolver->ups = rspamd_upstreams_create(cfg->ups_ctx); + rspamd_upstreams_set_flags(dns_resolver->ups, + RSPAMD_UPSTREAM_FLAG_NORESOLVE); + rspamd_upstreams_set_rotation(dns_resolver->ups, + RSPAMD_UPSTREAM_MASTER_SLAVE); + 
+ if (!rdns_resolver_parse_resolv_conf_cb(dns_resolver->r, + "/etc/resolv.conf", + rspamd_dns_resolv_conf_on_server, + dns_resolver)) { + msg_err("cannot parse resolv.conf and no nameservers defined, " + "so no ways to resolve addresses"); + rdns_resolver_release(dns_resolver->r); + dns_resolver->r = NULL; + + return dns_resolver; + } + + /* Use normal resolv.conf rules */ + rspamd_upstreams_foreach(dns_resolver->ups, rspamd_dns_server_reorder, + dns_resolver); + } + else { + dns_resolver->ups = rspamd_upstreams_create(cfg->ups_ctx); + rspamd_upstreams_set_flags(dns_resolver->ups, + RSPAMD_UPSTREAM_FLAG_NORESOLVE); + + if (!rspamd_upstreams_from_ucl(dns_resolver->ups, cfg->nameservers, + 53, dns_resolver)) { + msg_err_config("cannot parse DNS nameservers definitions"); + rdns_resolver_release(dns_resolver->r); + dns_resolver->r = NULL; + + return dns_resolver; + } + } + + rspamd_upstreams_foreach(dns_resolver->ups, rspamd_dns_server_init, + dns_resolver); + rdns_resolver_set_upstream_lib(dns_resolver->r, &rspamd_ups_ctx, + dns_resolver->ups); + cfg->dns_resolver = dns_resolver; + + if (cfg->cfg_ucl_obj) { + /* Configure additional options */ + const ucl_object_t *opts_section, *dns_section, *tmp; + + opts_section = ucl_object_lookup(cfg->cfg_ucl_obj, "options"); + + if (opts_section) { + /* TODO: implement a more simple merge logic */ + DL_FOREACH(opts_section, tmp) + { + dns_section = ucl_object_lookup(opts_section, "dns"); + + if (dns_section) { + rspamd_dns_resolver_config_ucl(cfg, dns_resolver, + dns_section); + } + } + } + } + } + + rdns_resolver_set_logger(dns_resolver->r, rspamd_rnds_log_bridge, logger); + rdns_resolver_init(dns_resolver->r); + + return dns_resolver; +} + +void rspamd_dns_resolver_deinit(struct rspamd_dns_resolver *resolver) +{ + if (resolver) { + if (resolver->r) { + rdns_resolver_release(resolver->r); + } + + if (resolver->ups) { + rspamd_upstreams_destroy(resolver->ups); + } + + if (resolver->fails_cache) { + 
rspamd_lru_hash_destroy(resolver->fails_cache); + } + + uidna_close(resolver->uidna); + + g_free(resolver); + } +} + + +static struct rdns_upstream_elt * +rspamd_dns_select_upstream(const char *name, + size_t len, void *ups_data) +{ + struct upstream_list *ups = ups_data; + struct upstream *up; + + up = rspamd_upstream_get(ups, RSPAMD_UPSTREAM_ROUND_ROBIN, name, len); + + if (up) { + msg_debug("select %s", rspamd_upstream_name(up)); + + return rspamd_upstream_get_data(up); + } + + return NULL; +} + +static struct rdns_upstream_elt * +rspamd_dns_select_upstream_retransmit( + const char *name, + size_t len, + struct rdns_upstream_elt *prev_elt, + void *ups_data) +{ + struct upstream_list *ups = ups_data; + struct upstream *up; + + if (prev_elt) { + up = rspamd_upstream_get_except(ups, (struct upstream *) prev_elt->lib_data, + RSPAMD_UPSTREAM_MASTER_SLAVE, name, len); + } + else { + up = rspamd_upstream_get_forced(ups, RSPAMD_UPSTREAM_RANDOM, name, len); + } + + if (up) { + msg_debug("select forced %s", rspamd_upstream_name(up)); + + return rspamd_upstream_get_data(up); + } + + return NULL; +} + +static void +rspamd_dns_upstream_ok(struct rdns_upstream_elt *elt, + void *ups_data) +{ + struct upstream *up = elt->lib_data; + + rspamd_upstream_ok(up); +} + +static void +rspamd_dns_upstream_fail(struct rdns_upstream_elt *elt, + void *ups_data, const gchar *reason) +{ + struct upstream *up = elt->lib_data; + + rspamd_upstream_fail(up, FALSE, reason); +} + +static unsigned int +rspamd_dns_upstream_count(void *ups_data) +{ + struct upstream_list *ups = ups_data; + + return rspamd_upstreams_alive(ups); +} + +gchar * +rspamd_dns_resolver_idna_convert_utf8(struct rspamd_dns_resolver *resolver, + rspamd_mempool_t *pool, + const char *name, + gint namelen, + guint *outlen) +{ + if (resolver == NULL || resolver->uidna == NULL || name == NULL || namelen > DNS_D_MAXNAME) { + return NULL; + } + + guint dest_len; + UErrorCode uc_err = U_ZERO_ERROR; + UIDNAInfo info = 
UIDNA_INFO_INITIALIZER; + /* Calculate length required */ + dest_len = uidna_nameToASCII_UTF8(resolver->uidna, name, namelen, + NULL, 0, &info, &uc_err); + + if (uc_err == U_BUFFER_OVERFLOW_ERROR) { + gchar *dest; + + if (pool) { + dest = rspamd_mempool_alloc(pool, dest_len + 1); + } + else { + dest = g_malloc(dest_len + 1); + } + + uc_err = U_ZERO_ERROR; + + dest_len = uidna_nameToASCII_UTF8(resolver->uidna, name, namelen, + dest, dest_len + 1, &info, &uc_err); + + if (U_FAILURE(uc_err)) { + + if (!pool) { + g_free(dest); + } + + return NULL; + } + + dest[dest_len] = '\0'; + + if (outlen) { + *outlen = dest_len; + } + + return dest; + } + + return NULL; +}
\ No newline at end of file diff --git a/src/libserver/dns.h b/src/libserver/dns.h new file mode 100644 index 0000000..acf8d09 --- /dev/null +++ b/src/libserver/dns.h @@ -0,0 +1,110 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_DNS_H +#define RSPAMD_DNS_H + +#include "config.h" +#include "mem_pool.h" +#include "async_session.h" +#include "logger.h" +#include "rdns.h" +#include "upstream.h" +#include "libutil/hash.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_config; +struct rspamd_task; +struct event_loop; + +struct rspamd_dns_resolver { + struct rdns_resolver *r; + struct ev_loop *event_loop; + rspamd_lru_hash_t *fails_cache; + void *uidna; + double fails_cache_time; + struct upstream_list *ups; + struct rspamd_config *cfg; + gdouble request_timeout; + guint max_retransmits; +}; + +/* Rspamd DNS API */ + +/** + * Init DNS resolver, params are obtained from a config file or system file /etc/resolv.conf + */ +struct rspamd_dns_resolver *rspamd_dns_resolver_init(rspamd_logger_t *logger, + struct ev_loop *ev_base, + struct rspamd_config *cfg); + +void rspamd_dns_resolver_deinit(struct rspamd_dns_resolver *resolver); + +struct rspamd_dns_request_ud; + +/** + * Make a DNS request + * @param resolver resolver object + * @param session async session to register event + * @param pool memory pool for storage + * @param cb callback to call on resolve completing + * @param ud user 
data for callback + * @param type request type + * @param ... string or ip address based on a request type + * @return TRUE if request was sent. + */ +struct rspamd_dns_request_ud *rspamd_dns_resolver_request(struct rspamd_dns_resolver *resolver, + struct rspamd_async_session *session, + rspamd_mempool_t *pool, + dns_callback_type cb, + gpointer ud, + enum rdns_request_type type, + const char *name); + +gboolean rspamd_dns_resolver_request_task(struct rspamd_task *task, + dns_callback_type cb, + gpointer ud, + enum rdns_request_type type, + const char *name); + +gboolean rspamd_dns_resolver_request_task_forced(struct rspamd_task *task, + dns_callback_type cb, + gpointer ud, + enum rdns_request_type type, + const char *name); + +/** + * Converts a name into idna from UTF8 + * @param resolver resolver (must be initialised) + * @param pool optional memory pool (can be NULL, then you need to g_free) the result + * @param name input name + * @param namelen length of input (-1 for zero terminated) + * @return encoded string + */ +gchar *rspamd_dns_resolver_idna_convert_utf8(struct rspamd_dns_resolver *resolver, + rspamd_mempool_t *pool, + const char *name, + gint namelen, + guint *outlen); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/dynamic_cfg.c b/src/libserver/dynamic_cfg.c new file mode 100644 index 0000000..cd5cc4e --- /dev/null +++ b/src/libserver/dynamic_cfg.c @@ -0,0 +1,743 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "libserver/maps/map.h" +#include "scan_result.h" +#include "dynamic_cfg.h" +#include "unix-std.h" +#include "lua/lua_common.h" + +#include <math.h> + +struct config_json_buf { + GString *buf; + struct rspamd_config *cfg; +}; + +/** + * Apply configuration to the specified configuration + * @param conf_metrics + * @param cfg + */ +static void +apply_dynamic_conf(const ucl_object_t *top, struct rspamd_config *cfg) +{ + enum rspamd_action_type test_act; + const ucl_object_t *cur_elt, *cur_nm, *it_val; + ucl_object_iter_t it = NULL; + const gchar *name; + gdouble nscore; + static const guint priority = 3; + + while ((cur_elt = ucl_object_iterate(top, &it, true))) { + if (ucl_object_type(cur_elt) != UCL_OBJECT) { + msg_err("loaded json array element is not an object"); + continue; + } + + cur_nm = ucl_object_lookup(cur_elt, "metric"); + if (!cur_nm || ucl_object_type(cur_nm) != UCL_STRING) { + msg_err( + "loaded json metric object element has no 'metric' attribute"); + continue; + } + + cur_nm = ucl_object_lookup(cur_elt, "symbols"); + /* Parse symbols */ + if (cur_nm && ucl_object_type(cur_nm) == UCL_ARRAY) { + ucl_object_iter_t nit = NULL; + + while ((it_val = ucl_object_iterate(cur_nm, &nit, true))) { + if (ucl_object_lookup(it_val, "name") && + ucl_object_lookup(it_val, "value")) { + const ucl_object_t *n = + ucl_object_lookup(it_val, "name"); + const ucl_object_t *v = + ucl_object_lookup(it_val, "value"); + + nscore = ucl_object_todouble(v); + + /* + * We use priority = 3 here + */ + rspamd_config_add_symbol(cfg, + ucl_object_tostring(n), nscore, NULL, NULL, + 0, priority, cfg->default_max_shots); + } + else { + msg_info( + "json symbol object has no mandatory 'name' and 'value' attributes"); + } + } + } + else { + ucl_object_t *arr; + + arr = ucl_object_typed_new(UCL_ARRAY); + 
ucl_object_insert_key((ucl_object_t *) cur_elt, arr, "symbols", + sizeof("symbols") - 1, false); + } + cur_nm = ucl_object_lookup(cur_elt, "actions"); + /* Parse actions */ + if (cur_nm && ucl_object_type(cur_nm) == UCL_ARRAY) { + ucl_object_iter_t nit = NULL; + + while ((it_val = ucl_object_iterate(cur_nm, &nit, true))) { + const ucl_object_t *n = ucl_object_lookup(it_val, "name"); + const ucl_object_t *v = ucl_object_lookup(it_val, "value"); + + if (n != NULL && v != NULL) { + name = ucl_object_tostring(n); + + if (!name || !rspamd_action_from_str(name, &test_act)) { + msg_err("unknown action: %s", + ucl_object_tostring(ucl_object_lookup(it_val, + "name"))); + continue; + } + + + if (ucl_object_type(v) == UCL_NULL) { + nscore = NAN; + } + else { + nscore = ucl_object_todouble(v); + } + + ucl_object_t *obj_tbl = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(obj_tbl, ucl_object_fromdouble(nscore), + "score", 0, false); + ucl_object_insert_key(obj_tbl, ucl_object_fromdouble(priority), + "priority", 0, false); + rspamd_config_set_action_score(cfg, name, obj_tbl); + ucl_object_unref(obj_tbl); + } + else { + msg_info( + "json action object has no mandatory 'name' and 'value' attributes"); + } + } + } + else { + ucl_object_t *arr; + + arr = ucl_object_typed_new(UCL_ARRAY); + ucl_object_insert_key((ucl_object_t *) cur_elt, arr, "actions", + sizeof("actions") - 1, false); + } + } +} + +/* Callbacks for reading json dynamic rules */ +static gchar * +json_config_read_cb(gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct config_json_buf *jb, *pd; + + pd = data->prev_data; + + g_assert(pd != NULL); + + if (data->cur_data == NULL) { + jb = g_malloc0(sizeof(*jb)); + jb->cfg = pd->cfg; + data->cur_data = jb; + } + else { + jb = data->cur_data; + } + + if (jb->buf == NULL) { + /* Allocate memory for buffer */ + jb->buf = g_string_sized_new(MAX(len, BUFSIZ)); + } + + g_string_append_len(jb->buf, chunk, len); + + return NULL; +} + 
+static void +json_config_fin_cb(struct map_cb_data *data, void **target) +{ + struct config_json_buf *jb; + ucl_object_t *top; + struct ucl_parser *parser; + + /* Now parse json */ + if (data->cur_data) { + jb = data->cur_data; + } + else { + return; + } + + if (jb->buf == NULL) { + msg_err("no data read"); + + return; + } + + parser = ucl_parser_new(0); + + if (!ucl_parser_add_chunk(parser, jb->buf->str, jb->buf->len)) { + msg_err("cannot load json data: parse error %s", + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + return; + } + + top = ucl_parser_get_object(parser); + ucl_parser_free(parser); + + if (ucl_object_type(top) != UCL_ARRAY) { + ucl_object_unref(top); + msg_err("loaded json is not an array"); + return; + } + + ucl_object_unref(jb->cfg->current_dynamic_conf); + apply_dynamic_conf(top, jb->cfg); + jb->cfg->current_dynamic_conf = top; + + if (target) { + *target = data->cur_data; + } + + if (data->prev_data) { + jb = data->prev_data; + /* Clean prev data */ + if (jb->buf) { + g_string_free(jb->buf, TRUE); + } + + g_free(jb); + } +} + +static void +json_config_dtor_cb(struct map_cb_data *data) +{ + struct config_json_buf *jb; + + if (data->cur_data) { + jb = data->cur_data; + /* Clean prev data */ + if (jb->buf) { + g_string_free(jb->buf, TRUE); + } + + if (jb->cfg && jb->cfg->current_dynamic_conf) { + ucl_object_unref(jb->cfg->current_dynamic_conf); + } + + g_free(jb); + } +} + +/** + * Init dynamic configuration using map logic and specific configuration + * @param cfg config file + */ +void init_dynamic_config(struct rspamd_config *cfg) +{ + struct config_json_buf *jb, **pjb; + + if (cfg->dynamic_conf == NULL) { + /* No dynamic conf has been specified, so do not try to load it */ + return; + } + + /* Now try to add map with json data */ + jb = g_malloc(sizeof(struct config_json_buf)); + pjb = g_malloc(sizeof(struct config_json_buf *)); + jb->buf = NULL; + jb->cfg = cfg; + *pjb = jb; + cfg->current_dynamic_conf = 
ucl_object_typed_new(UCL_ARRAY); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) g_free, + pjb); + + if (!rspamd_map_add(cfg, + cfg->dynamic_conf, + "Dynamic configuration map", + json_config_read_cb, + json_config_fin_cb, + json_config_dtor_cb, + (void **) pjb, NULL, RSPAMD_MAP_DEFAULT)) { + msg_err("cannot add map for configuration %s", cfg->dynamic_conf); + } +} + +/** + * Dump dynamic configuration to the disk + * @param cfg + * @return + */ +gboolean +dump_dynamic_config(struct rspamd_config *cfg) +{ + struct stat st; + gchar *dir, pathbuf[PATH_MAX]; + gint fd; + + if (cfg->dynamic_conf == NULL || cfg->current_dynamic_conf == NULL) { + /* No dynamic conf has been specified, so do not try to dump it */ + msg_err("cannot save dynamic conf as it is not specified"); + return FALSE; + } + + dir = g_path_get_dirname(cfg->dynamic_conf); + if (dir == NULL) { + msg_err("invalid path: %s", cfg->dynamic_conf); + return FALSE; + } + + if (stat(cfg->dynamic_conf, &st) == -1) { + msg_debug("%s is unavailable: %s", cfg->dynamic_conf, + strerror(errno)); + st.st_mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + } + if (access(dir, W_OK | R_OK) == -1) { + msg_warn("%s is inaccessible: %s", dir, strerror(errno)); + g_free(dir); + return FALSE; + } + rspamd_snprintf(pathbuf, + sizeof(pathbuf), + "%s%crconf-XXXXXX", + dir, + G_DIR_SEPARATOR); + g_free(dir); +#ifdef HAVE_MKSTEMP + /* Umask is set before */ + fd = mkstemp(pathbuf); +#else + fd = g_mkstemp_full(pathbuf, O_RDWR, S_IWUSR | S_IRUSR); +#endif + if (fd == -1) { + msg_err("mkstemp error: %s", strerror(errno)); + + return FALSE; + } + + struct ucl_emitter_functions *emitter_functions; + FILE *fp; + + fp = fdopen(fd, "w"); + emitter_functions = ucl_object_emit_file_funcs(fp); + + if (!ucl_object_emit_full(cfg->current_dynamic_conf, UCL_EMIT_JSON, + emitter_functions, NULL)) { + msg_err("cannot emit ucl object: %s", strerror(errno)); + ucl_object_emit_funcs_free(emitter_functions); + fclose(fp); + 
return FALSE; + } + + (void) unlink(cfg->dynamic_conf); + + /* Rename old config */ + if (rename(pathbuf, cfg->dynamic_conf) == -1) { + msg_err("rename error: %s", strerror(errno)); + fclose(fp); + ucl_object_emit_funcs_free(emitter_functions); + unlink(pathbuf); + + return FALSE; + } + /* Set permissions */ + + if (chmod(cfg->dynamic_conf, st.st_mode) == -1) { + msg_warn("chmod failed: %s", strerror(errno)); + } + + fclose(fp); + ucl_object_emit_funcs_free(emitter_functions); + + return TRUE; +} + +static ucl_object_t * +new_dynamic_metric(const gchar *metric_name, ucl_object_t *top) +{ + ucl_object_t *metric; + + metric = ucl_object_typed_new(UCL_OBJECT); + + ucl_object_insert_key(metric, ucl_object_fromstring(metric_name), + "metric", sizeof("metric") - 1, true); + ucl_object_insert_key(metric, ucl_object_typed_new(UCL_ARRAY), + "actions", sizeof("actions") - 1, false); + ucl_object_insert_key(metric, ucl_object_typed_new(UCL_ARRAY), + "symbols", sizeof("symbols") - 1, false); + + ucl_array_append(top, metric); + + return metric; +} + +static ucl_object_t * +dynamic_metric_find_elt(const ucl_object_t *arr, const gchar *name) +{ + ucl_object_iter_t it = NULL; + const ucl_object_t *cur, *n; + + it = ucl_object_iterate_new(arr); + + while ((cur = ucl_object_iterate_safe(it, true)) != NULL) { + if (cur->type == UCL_OBJECT) { + n = ucl_object_lookup(cur, "name"); + if (n && n->type == UCL_STRING && + strcmp(name, ucl_object_tostring(n)) == 0) { + ucl_object_iterate_free(it); + + return (ucl_object_t *) ucl_object_lookup(cur, "value"); + } + } + } + + ucl_object_iterate_free(it); + + return NULL; +} + +static ucl_object_t * +dynamic_metric_find_metric(const ucl_object_t *arr, const gchar *metric) +{ + ucl_object_iter_t it = NULL; + const ucl_object_t *cur, *n; + + it = ucl_object_iterate_new(arr); + + while ((cur = ucl_object_iterate_safe(it, true)) != NULL) { + if (cur->type == UCL_OBJECT) { + n = ucl_object_lookup(cur, "metric"); + if (n && n->type == UCL_STRING && 
+ strcmp(metric, ucl_object_tostring(n)) == 0) { + ucl_object_iterate_free(it); + + return (ucl_object_t *) cur; + } + } + } + + ucl_object_iterate_free(it); + + return NULL; +} + +static ucl_object_t * +new_dynamic_elt(ucl_object_t *arr, const gchar *name, gdouble value) +{ + ucl_object_t *n; + + n = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(n, ucl_object_fromstring(name), "name", + sizeof("name") - 1, false); + ucl_object_insert_key(n, ucl_object_fromdouble(value), "value", + sizeof("value") - 1, false); + + ucl_array_append(arr, n); + + return n; +} + +static gint +rspamd_maybe_add_lua_dynsym(struct rspamd_config *cfg, + const gchar *sym, + gdouble score) +{ + lua_State *L = cfg->lua_state; + gint ret = -1; + struct rspamd_config **pcfg; + + lua_getglobal(L, "rspamd_plugins"); + if (lua_type(L, -1) == LUA_TTABLE) { + lua_pushstring(L, "dynamic_conf"); + lua_gettable(L, -2); + + if (lua_type(L, -1) == LUA_TTABLE) { + lua_pushstring(L, "add_symbol"); + lua_gettable(L, -2); + + if (lua_type(L, -1) == LUA_TFUNCTION) { + pcfg = lua_newuserdata(L, sizeof(*pcfg)); + *pcfg = cfg; + rspamd_lua_setclass(L, "rspamd{config}", -1); + lua_pushstring(L, sym); + lua_pushnumber(L, score); + + if (lua_pcall(L, 3, 1, 0) != 0) { + msg_err_config("cannot execute add_symbol script: %s", + lua_tostring(L, -1)); + } + else { + ret = lua_toboolean(L, -1); + } + + lua_pop(L, 1); + } + else { + lua_pop(L, 1); + } + } + + lua_pop(L, 1); + } + + lua_pop(L, 1); + + return ret; +} + +static gint +rspamd_maybe_add_lua_dynact(struct rspamd_config *cfg, + const gchar *action, + gdouble score) +{ + lua_State *L = cfg->lua_state; + gint ret = -1; + struct rspamd_config **pcfg; + + lua_getglobal(L, "rspamd_plugins"); + if (lua_type(L, -1) == LUA_TTABLE) { + lua_pushstring(L, "dynamic_conf"); + lua_gettable(L, -2); + + if (lua_type(L, -1) == LUA_TTABLE) { + lua_pushstring(L, "add_action"); + lua_gettable(L, -2); + + if (lua_type(L, -1) == LUA_TFUNCTION) { + pcfg = lua_newuserdata(L, 
sizeof(*pcfg)); + *pcfg = cfg; + rspamd_lua_setclass(L, "rspamd{config}", -1); + lua_pushstring(L, action); + lua_pushnumber(L, score); + + if (lua_pcall(L, 3, 1, 0) != 0) { + msg_err_config("cannot execute add_action script: %s", + lua_tostring(L, -1)); + } + else { + ret = lua_toboolean(L, -1); + } + + lua_pop(L, 1); + } + else { + lua_pop(L, 1); + } + } + + lua_pop(L, 1); + } + + lua_pop(L, 1); + + return ret; +} + +/** + * Add symbol for specified metric + * @param cfg config file object + * @param metric metric's name + * @param symbol symbol's name + * @param value value of symbol + * @return + */ +gboolean +add_dynamic_symbol(struct rspamd_config *cfg, + const gchar *metric_name, + const gchar *symbol, + gdouble value) +{ + ucl_object_t *metric, *syms; + gint ret; + + if ((ret = rspamd_maybe_add_lua_dynsym(cfg, symbol, value)) != -1) { + return ret == 0 ? FALSE : TRUE; + } + + if (cfg->dynamic_conf == NULL) { + msg_info("dynamic conf is disabled"); + return FALSE; + } + + metric = dynamic_metric_find_metric(cfg->current_dynamic_conf, + metric_name); + if (metric == NULL) { + metric = new_dynamic_metric(metric_name, cfg->current_dynamic_conf); + } + + syms = (ucl_object_t *) ucl_object_lookup(metric, "symbols"); + if (syms != NULL) { + ucl_object_t *sym; + + sym = dynamic_metric_find_elt(syms, symbol); + if (sym) { + sym->value.dv = value; + } + else { + new_dynamic_elt(syms, symbol, value); + } + } + + apply_dynamic_conf(cfg->current_dynamic_conf, cfg); + + return TRUE; +} + +gboolean +remove_dynamic_symbol(struct rspamd_config *cfg, + const gchar *metric_name, + const gchar *symbol) +{ + ucl_object_t *metric, *syms; + gboolean ret = FALSE; + + if (cfg->dynamic_conf == NULL) { + msg_info("dynamic conf is disabled"); + return FALSE; + } + + metric = dynamic_metric_find_metric(cfg->current_dynamic_conf, + metric_name); + if (metric == NULL) { + return FALSE; + } + + syms = (ucl_object_t *) ucl_object_lookup(metric, "symbols"); + if (syms != NULL) { + 
ucl_object_t *sym; + + sym = dynamic_metric_find_elt(syms, symbol); + + if (sym) { + ret = ucl_array_delete((ucl_object_t *) syms, sym) != NULL; + + if (ret) { + ucl_object_unref(sym); + } + } + } + + if (ret) { + apply_dynamic_conf(cfg->current_dynamic_conf, cfg); + } + + return ret; +} + + +/** + * Add action for specified metric + * @param cfg config file object + * @param metric metric's name + * @param action action's name + * @param value value of symbol + * @return + */ +gboolean +add_dynamic_action(struct rspamd_config *cfg, + const gchar *metric_name, + guint action, + gdouble value) +{ + ucl_object_t *metric, *acts; + const gchar *action_name = rspamd_action_to_str(action); + gint ret; + + if ((ret = rspamd_maybe_add_lua_dynact(cfg, action_name, value)) != -1) { + return ret == 0 ? FALSE : TRUE; + } + + if (cfg->dynamic_conf == NULL) { + msg_info("dynamic conf is disabled"); + return FALSE; + } + + metric = dynamic_metric_find_metric(cfg->current_dynamic_conf, + metric_name); + if (metric == NULL) { + metric = new_dynamic_metric(metric_name, cfg->current_dynamic_conf); + } + + acts = (ucl_object_t *) ucl_object_lookup(metric, "actions"); + if (acts != NULL) { + ucl_object_t *act; + + act = dynamic_metric_find_elt(acts, action_name); + if (act) { + act->value.dv = value; + } + else { + new_dynamic_elt(acts, action_name, value); + } + } + + apply_dynamic_conf(cfg->current_dynamic_conf, cfg); + + return TRUE; +} + +gboolean +remove_dynamic_action(struct rspamd_config *cfg, + const gchar *metric_name, + guint action) +{ + ucl_object_t *metric, *acts; + const gchar *action_name = rspamd_action_to_str(action); + gboolean ret = FALSE; + + if (cfg->dynamic_conf == NULL) { + msg_info("dynamic conf is disabled"); + return FALSE; + } + + metric = dynamic_metric_find_metric(cfg->current_dynamic_conf, + metric_name); + if (metric == NULL) { + return FALSE; + } + + acts = (ucl_object_t *) ucl_object_lookup(metric, "actions"); + + if (acts != NULL) { + ucl_object_t 
*act; + + act = dynamic_metric_find_elt(acts, action_name); + + if (act) { + ret = ucl_array_delete(acts, act) != NULL; + } + if (ret) { + ucl_object_unref(act); + } + } + + if (ret) { + apply_dynamic_conf(cfg->current_dynamic_conf, cfg); + } + + return ret; +} diff --git a/src/libserver/dynamic_cfg.h b/src/libserver/dynamic_cfg.h new file mode 100644 index 0000000..bb386ca --- /dev/null +++ b/src/libserver/dynamic_cfg.h @@ -0,0 +1,81 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DYNAMIC_CFG_H_ +#define DYNAMIC_CFG_H_ + +#include "config.h" +#include "cfg_file.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Init dynamic configuration using map logic and specific configuration + * @param cfg config file + */ +void init_dynamic_config(struct rspamd_config *cfg); + +/** + * Dump dynamic configuration to the disk + * @param cfg + * @return + */ +gboolean dump_dynamic_config(struct rspamd_config *cfg); + +/** + * Add symbol for specified metric + * @param cfg config file object + * @param metric metric's name + * @param symbol symbol's name + * @param value value of symbol + * @return + */ +gboolean add_dynamic_symbol(struct rspamd_config *cfg, + const gchar *metric, + const gchar *symbol, + gdouble value); + +gboolean remove_dynamic_symbol(struct rspamd_config *cfg, + const gchar *metric, + const gchar *symbol); + +/** + * Add action for specified metric + * @param cfg config file object + * @param metric metric's name + * @param action action's name + * @param value value of symbol + * @return + */ +gboolean add_dynamic_action(struct rspamd_config *cfg, + const gchar *metric, + guint action, + gdouble value); + +/** + * Removes dynamic action + */ +gboolean remove_dynamic_action(struct rspamd_config *cfg, + const gchar *metric, + guint action); + +#ifdef __cplusplus +} +#endif + +#endif /* DYNAMIC_CFG_H_ */ diff --git a/src/libserver/fuzzy_backend/fuzzy_backend.c b/src/libserver/fuzzy_backend/fuzzy_backend.c new file mode 100644 index 0000000..9099f38 --- /dev/null +++ b/src/libserver/fuzzy_backend/fuzzy_backend.c @@ -0,0 +1,560 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
#include "config.h"
#include "fuzzy_backend.h"
#include "fuzzy_backend_sqlite.h"
#include "fuzzy_backend_redis.h"
#include "cfg_file.h"
#include "fuzzy_wire.h"

/*
 * Expire value used when the configuration has no "expire" key.
 * NOTE(review): presumably seconds (172800 = 2 * 86400, i.e. two days) -
 * the unit is defined by the backends, confirm there.
 */
#define DEFAULT_EXPIRE 172800L

/* Values double as indexes into fuzzy_subrs[] below */
enum rspamd_fuzzy_backend_type {
	RSPAMD_FUZZY_BACKEND_SQLITE = 0,
	RSPAMD_FUZZY_BACKEND_REDIS = 1,
};

/*
 * Sqlite adapters: they are defined in this file, while the redis
 * counterparts come from fuzzy_backend_redis.h
 */
static void *rspamd_fuzzy_backend_init_sqlite(struct rspamd_fuzzy_backend *bk,
											  const ucl_object_t *obj, struct rspamd_config *cfg, GError **err);
static void rspamd_fuzzy_backend_check_sqlite(struct rspamd_fuzzy_backend *bk,
											  const struct rspamd_fuzzy_cmd *cmd,
											  rspamd_fuzzy_check_cb cb, void *ud,
											  void *subr_ud);
static void rspamd_fuzzy_backend_update_sqlite(struct rspamd_fuzzy_backend *bk,
											   GArray *updates, const gchar *src,
											   rspamd_fuzzy_update_cb cb, void *ud,
											   void *subr_ud);
static void rspamd_fuzzy_backend_count_sqlite(struct rspamd_fuzzy_backend *bk,
											  rspamd_fuzzy_count_cb cb, void *ud,
											  void *subr_ud);
static void rspamd_fuzzy_backend_version_sqlite(struct rspamd_fuzzy_backend *bk,
												const gchar *src,
												rspamd_fuzzy_version_cb cb, void *ud,
												void *subr_ud);
static const gchar *rspamd_fuzzy_backend_id_sqlite(struct rspamd_fuzzy_backend *bk,
												   void *subr_ud);
static void rspamd_fuzzy_backend_expire_sqlite(struct rspamd_fuzzy_backend *bk,
											   void *subr_ud);
static void rspamd_fuzzy_backend_close_sqlite(struct rspamd_fuzzy_backend *bk,
											  void *subr_ud);

/*
 * Per-backend operations table. Every operation receives the generic
 * backend plus `subr_ud`, the opaque pointer returned by `init`.
 * The generic layer NULL-checks only `id` and `periodic` before calling;
 * all other members are called unconditionally.
 */
struct rspamd_fuzzy_backend_subr {
	/* Returns backend private data or NULL on failure */
	void *(*init)(struct rspamd_fuzzy_backend *bk, const ucl_object_t *obj,
				  struct rspamd_config *cfg,
				  GError **err);
	void (*check)(struct rspamd_fuzzy_backend *bk,
				  const struct rspamd_fuzzy_cmd *cmd,
				  rspamd_fuzzy_check_cb cb, void *ud,
				  void *subr_ud);
	void (*update)(struct rspamd_fuzzy_backend *bk,
				   GArray *updates, const gchar *src,
				   rspamd_fuzzy_update_cb cb, void *ud,
				   void *subr_ud);
	void (*count)(struct rspamd_fuzzy_backend *bk,
				  rspamd_fuzzy_count_cb cb, void *ud,
				  void *subr_ud);
	void (*version)(struct rspamd_fuzzy_backend *bk,
					const gchar *src,
					rspamd_fuzzy_version_cb cb, void *ud,
					void *subr_ud);
	const gchar *(*id)(struct rspamd_fuzzy_backend *bk, void *subr_ud);
	void (*periodic)(struct rspamd_fuzzy_backend *bk, void *subr_ud);
	void (*close)(struct rspamd_fuzzy_backend *bk, void *subr_ud);
};

/* Indexed by enum rspamd_fuzzy_backend_type */
static const struct rspamd_fuzzy_backend_subr fuzzy_subrs[] = {
	[RSPAMD_FUZZY_BACKEND_SQLITE] = {
		.init = rspamd_fuzzy_backend_init_sqlite,
		.check = rspamd_fuzzy_backend_check_sqlite,
		.update = rspamd_fuzzy_backend_update_sqlite,
		.count = rspamd_fuzzy_backend_count_sqlite,
		.version = rspamd_fuzzy_backend_version_sqlite,
		.id = rspamd_fuzzy_backend_id_sqlite,
		.periodic = rspamd_fuzzy_backend_expire_sqlite,
		.close = rspamd_fuzzy_backend_close_sqlite,
	},
	[RSPAMD_FUZZY_BACKEND_REDIS] = {
		.init = rspamd_fuzzy_backend_init_redis,
		.check = rspamd_fuzzy_backend_check_redis,
		.update = rspamd_fuzzy_backend_update_redis,
		.count = rspamd_fuzzy_backend_count_redis,
		.version = rspamd_fuzzy_backend_version_redis,
		.id = rspamd_fuzzy_backend_id_redis,
		.periodic = rspamd_fuzzy_backend_expire_redis,
		.close = rspamd_fuzzy_backend_close_redis,
	}};

/* Generic backend handle shared by all storage implementations */
struct rspamd_fuzzy_backend {
	enum rspamd_fuzzy_backend_type type;
	gdouble expire;                      /* "expire" config value or DEFAULT_EXPIRE */
	gdouble sync;                        /* periodic timeout; > 0 is also used as "timer started" */
	struct ev_loop *event_loop;
	rspamd_fuzzy_periodic_cb periodic_cb; /* optional gate for the periodic job */
	void *periodic_ud;
	const struct rspamd_fuzzy_backend_subr *subr; /* points into fuzzy_subrs[] */
	void *subr_ud;                       /* result of subr->init */
	ev_timer periodic_event;
};

/* Error domain used for GError values raised by this file */
static GQuark
rspamd_fuzzy_backend_quark(void)
{
	return g_quark_from_static_string("fuzzy-backend");
}
g_quark_from_static_string("fuzzy-backend"); +} + +static void * +rspamd_fuzzy_backend_init_sqlite(struct rspamd_fuzzy_backend *bk, + const ucl_object_t *obj, struct rspamd_config *cfg, GError **err) +{ + const ucl_object_t *elt; + + elt = ucl_object_lookup_any(obj, "hashfile", "hash_file", "file", + "database", NULL); + + if (elt == NULL || ucl_object_type(elt) != UCL_STRING) { + g_set_error(err, rspamd_fuzzy_backend_quark(), + EINVAL, "missing sqlite3 path"); + return NULL; + } + + return rspamd_fuzzy_backend_sqlite_open(ucl_object_tostring(elt), + FALSE, err); +} + +static void +rspamd_fuzzy_backend_check_sqlite(struct rspamd_fuzzy_backend *bk, + const struct rspamd_fuzzy_cmd *cmd, + rspamd_fuzzy_check_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + struct rspamd_fuzzy_reply rep; + + rep = rspamd_fuzzy_backend_sqlite_check(sq, cmd, bk->expire); + + if (cb) { + cb(&rep, ud); + } +} + +static void +rspamd_fuzzy_backend_update_sqlite(struct rspamd_fuzzy_backend *bk, + GArray *updates, const gchar *src, + rspamd_fuzzy_update_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + gboolean success = FALSE; + guint i; + struct fuzzy_peer_cmd *io_cmd; + struct rspamd_fuzzy_cmd *cmd; + gpointer ptr; + guint nupdates = 0, nadded = 0, ndeleted = 0, nextended = 0, nignored = 0; + + if (rspamd_fuzzy_backend_sqlite_prepare_update(sq, src)) { + for (i = 0; i < updates->len; i++) { + io_cmd = &g_array_index(updates, struct fuzzy_peer_cmd, i); + + if (io_cmd->is_shingle) { + cmd = &io_cmd->cmd.shingle.basic; + ptr = &io_cmd->cmd.shingle; + } + else { + cmd = &io_cmd->cmd.normal; + ptr = &io_cmd->cmd.normal; + } + + if (cmd->cmd == FUZZY_WRITE) { + rspamd_fuzzy_backend_sqlite_add(sq, ptr); + nadded++; + nupdates++; + } + else if (cmd->cmd == FUZZY_DEL) { + rspamd_fuzzy_backend_sqlite_del(sq, ptr); + ndeleted++; + nupdates++; + } + else { + if (cmd->cmd == FUZZY_REFRESH) { + nextended++; + } + else { + 
nignored++; + } + } + } + + if (rspamd_fuzzy_backend_sqlite_finish_update(sq, src, + nupdates > 0)) { + success = TRUE; + } + } + + if (cb) { + cb(success, nadded, ndeleted, nextended, nignored, ud); + } +} + +static void +rspamd_fuzzy_backend_count_sqlite(struct rspamd_fuzzy_backend *bk, + rspamd_fuzzy_count_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + guint64 nhashes; + + nhashes = rspamd_fuzzy_backend_sqlite_count(sq); + + if (cb) { + cb(nhashes, ud); + } +} + +static void +rspamd_fuzzy_backend_version_sqlite(struct rspamd_fuzzy_backend *bk, + const gchar *src, + rspamd_fuzzy_version_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + guint64 rev; + + rev = rspamd_fuzzy_backend_sqlite_version(sq, src); + + if (cb) { + cb(rev, ud); + } +} + +static const gchar * +rspamd_fuzzy_backend_id_sqlite(struct rspamd_fuzzy_backend *bk, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + + return rspamd_fuzzy_sqlite_backend_id(sq); +} +static void +rspamd_fuzzy_backend_expire_sqlite(struct rspamd_fuzzy_backend *bk, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + + rspamd_fuzzy_backend_sqlite_sync(sq, bk->expire, TRUE); +} + +static void +rspamd_fuzzy_backend_close_sqlite(struct rspamd_fuzzy_backend *bk, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_sqlite *sq = subr_ud; + + rspamd_fuzzy_backend_sqlite_close(sq); +} + + +struct rspamd_fuzzy_backend * +rspamd_fuzzy_backend_create(struct ev_loop *ev_base, + const ucl_object_t *config, + struct rspamd_config *cfg, + GError **err) +{ + struct rspamd_fuzzy_backend *bk; + enum rspamd_fuzzy_backend_type type = RSPAMD_FUZZY_BACKEND_SQLITE; + const ucl_object_t *elt; + gdouble expire = DEFAULT_EXPIRE; + + if (config != NULL) { + elt = ucl_object_lookup(config, "backend"); + + if (elt != NULL && ucl_object_type(elt) == UCL_STRING) { + if (strcmp(ucl_object_tostring(elt), "sqlite") == 0) { + type = 
RSPAMD_FUZZY_BACKEND_SQLITE; + } + else if (strcmp(ucl_object_tostring(elt), "redis") == 0) { + type = RSPAMD_FUZZY_BACKEND_REDIS; + } + else { + g_set_error(err, rspamd_fuzzy_backend_quark(), + EINVAL, "invalid backend type: %s", + ucl_object_tostring(elt)); + return NULL; + } + } + + elt = ucl_object_lookup(config, "expire"); + + if (elt != NULL) { + expire = ucl_object_todouble(elt); + } + } + + bk = g_malloc0(sizeof(*bk)); + bk->event_loop = ev_base; + bk->expire = expire; + bk->type = type; + bk->subr = &fuzzy_subrs[type]; + + if ((bk->subr_ud = bk->subr->init(bk, config, cfg, err)) == NULL) { + g_free(bk); + + return NULL; + } + + return bk; +} + + +void rspamd_fuzzy_backend_check(struct rspamd_fuzzy_backend *bk, + const struct rspamd_fuzzy_cmd *cmd, + rspamd_fuzzy_check_cb cb, void *ud) +{ + g_assert(bk != NULL); + + bk->subr->check(bk, cmd, cb, ud, bk->subr_ud); +} + +static guint +rspamd_fuzzy_digest_hash(gconstpointer key) +{ + guint ret; + + /* Distributed uniformly already */ + memcpy(&ret, key, sizeof(ret)); + + return ret; +} + +static gboolean +rspamd_fuzzy_digest_equal(gconstpointer v, gconstpointer v2) +{ + return memcmp(v, v2, rspamd_cryptobox_HASHBYTES) == 0; +} + +static void +rspamd_fuzzy_backend_deduplicate_queue(GArray *updates) +{ + GHashTable *seen = g_hash_table_new(rspamd_fuzzy_digest_hash, + rspamd_fuzzy_digest_equal); + struct fuzzy_peer_cmd *io_cmd, *found; + struct rspamd_fuzzy_cmd *cmd; + guchar *digest; + guint i; + + for (i = 0; i < updates->len; i++) { + io_cmd = &g_array_index(updates, struct fuzzy_peer_cmd, i); + + if (io_cmd->is_shingle) { + cmd = &io_cmd->cmd.shingle.basic; + } + else { + cmd = &io_cmd->cmd.normal; + } + + digest = cmd->digest; + + found = g_hash_table_lookup(seen, digest); + + if (found == NULL) { + /* Add to the seen list, if not a duplicate (huh?) 
*/ + if (cmd->cmd != FUZZY_DUP) { + g_hash_table_insert(seen, digest, io_cmd); + } + } + else { + if (found->cmd.normal.flag != cmd->flag) { + /* TODO: deal with flags better at some point */ + continue; + } + + /* Apply heuristic */ + switch (cmd->cmd) { + case FUZZY_WRITE: + if (found->cmd.normal.cmd == FUZZY_WRITE) { + /* Already seen */ + found->cmd.normal.value += cmd->value; + cmd->cmd = FUZZY_DUP; /* Ignore this one */ + } + else if (found->cmd.normal.cmd == FUZZY_REFRESH) { + /* Seen refresh command, remove it as write has higher priority */ + g_hash_table_replace(seen, digest, io_cmd); + found->cmd.normal.cmd = FUZZY_DUP; + } + else if (found->cmd.normal.cmd == FUZZY_DEL) { + /* Request delete + add, weird, but ignore add */ + cmd->cmd = FUZZY_DUP; /* Ignore this one */ + } + break; + case FUZZY_REFRESH: + if (found->cmd.normal.cmd == FUZZY_WRITE) { + /* No need to expire, handled by addition */ + cmd->cmd = FUZZY_DUP; /* Ignore this one */ + } + else if (found->cmd.normal.cmd == FUZZY_DEL) { + /* Request delete + expire, ignore expire */ + cmd->cmd = FUZZY_DUP; /* Ignore this one */ + } + else if (found->cmd.normal.cmd == FUZZY_REFRESH) { + /* Already handled */ + cmd->cmd = FUZZY_DUP; /* Ignore this one */ + } + break; + case FUZZY_DEL: + /* Delete has priority over all other commands */ + g_hash_table_replace(seen, digest, io_cmd); + found->cmd.normal.cmd = FUZZY_DUP; + break; + default: + break; + } + } + } + + g_hash_table_unref(seen); +} + +void rspamd_fuzzy_backend_process_updates(struct rspamd_fuzzy_backend *bk, + GArray *updates, const gchar *src, rspamd_fuzzy_update_cb cb, + void *ud) +{ + g_assert(bk != NULL); + g_assert(updates != NULL); + + if (updates) { + rspamd_fuzzy_backend_deduplicate_queue(updates); + bk->subr->update(bk, updates, src, cb, ud, bk->subr_ud); + } + else if (cb) { + cb(TRUE, 0, 0, 0, 0, ud); + } +} + + +void rspamd_fuzzy_backend_count(struct rspamd_fuzzy_backend *bk, + rspamd_fuzzy_count_cb cb, void *ud) +{ + g_assert(bk != 
NULL); + + bk->subr->count(bk, cb, ud, bk->subr_ud); +} + + +void rspamd_fuzzy_backend_version(struct rspamd_fuzzy_backend *bk, + const gchar *src, + rspamd_fuzzy_version_cb cb, void *ud) +{ + g_assert(bk != NULL); + + bk->subr->version(bk, src, cb, ud, bk->subr_ud); +} + +const gchar * +rspamd_fuzzy_backend_id(struct rspamd_fuzzy_backend *bk) +{ + g_assert(bk != NULL); + + if (bk->subr->id) { + return bk->subr->id(bk, bk->subr_ud); + } + + return NULL; +} + +static inline void +rspamd_fuzzy_backend_periodic_sync(struct rspamd_fuzzy_backend *bk) +{ + if (bk->periodic_cb) { + if (bk->periodic_cb(bk->periodic_ud)) { + if (bk->subr->periodic) { + bk->subr->periodic(bk, bk->subr_ud); + } + } + } + else { + if (bk->subr->periodic) { + bk->subr->periodic(bk, bk->subr_ud); + } + } +} + +static void +rspamd_fuzzy_backend_periodic_cb(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_fuzzy_backend *bk = (struct rspamd_fuzzy_backend *) w->data; + gdouble jittered; + + jittered = rspamd_time_jitter(bk->sync, bk->sync / 2.0); + w->repeat = jittered; + rspamd_fuzzy_backend_periodic_sync(bk); + ev_timer_again(EV_A_ w); +} + +void rspamd_fuzzy_backend_start_update(struct rspamd_fuzzy_backend *bk, + gdouble timeout, + rspamd_fuzzy_periodic_cb cb, + void *ud) +{ + gdouble jittered; + + g_assert(bk != NULL); + + if (bk->subr->periodic) { + if (bk->sync > 0.0) { + ev_timer_stop(bk->event_loop, &bk->periodic_event); + } + + if (cb) { + bk->periodic_cb = cb; + bk->periodic_ud = ud; + } + + rspamd_fuzzy_backend_periodic_sync(bk); + bk->sync = timeout; + jittered = rspamd_time_jitter(timeout, timeout / 2.0); + + bk->periodic_event.data = bk; + ev_timer_init(&bk->periodic_event, rspamd_fuzzy_backend_periodic_cb, + jittered, 0.0); + ev_timer_start(bk->event_loop, &bk->periodic_event); + } +} + +void rspamd_fuzzy_backend_close(struct rspamd_fuzzy_backend *bk) +{ + g_assert(bk != NULL); + + if (bk->sync > 0.0) { + rspamd_fuzzy_backend_periodic_sync(bk); + ev_timer_stop(bk->event_loop, 
&bk->periodic_event); + } + + bk->subr->close(bk, bk->subr_ud); + + g_free(bk); +} + +struct ev_loop * +rspamd_fuzzy_backend_event_base(struct rspamd_fuzzy_backend *backend) +{ + return backend->event_loop; +} + +gdouble +rspamd_fuzzy_backend_get_expire(struct rspamd_fuzzy_backend *backend) +{ + return backend->expire; +} diff --git a/src/libserver/fuzzy_backend/fuzzy_backend.h b/src/libserver/fuzzy_backend/fuzzy_backend.h new file mode 100644 index 0000000..a1b74bc --- /dev/null +++ b/src/libserver/fuzzy_backend/fuzzy_backend.h @@ -0,0 +1,131 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
#ifndef SRC_LIBSERVER_FUZZY_BACKEND_H_
#define SRC_LIBSERVER_FUZZY_BACKEND_H_

#include "config.h"
#include "contrib/libev/ev.h"
#include "fuzzy_wire.h"

#ifdef __cplusplus
extern "C" {
#endif

struct rspamd_fuzzy_backend;
struct rspamd_config;

/*
 * Callbacks for fuzzy methods
 */

/* Receives the reply for a check request */
typedef void (*rspamd_fuzzy_check_cb)(struct rspamd_fuzzy_reply *rep, void *ud);

/* Receives the outcome of an update batch and per-kind command counters */
typedef void (*rspamd_fuzzy_update_cb)(gboolean success,
									   guint nadded,
									   guint ndeleted,
									   guint nextended,
									   guint nignored,
									   void *ud);

/* Receives the revision of a source */
typedef void (*rspamd_fuzzy_version_cb)(guint64 rev, void *ud);

/* Receives the number of stored hashes */
typedef void (*rspamd_fuzzy_count_cb)(guint64 count, void *ud);

/* Gate for the periodic job: the backend job runs only when it returns TRUE */
typedef gboolean (*rspamd_fuzzy_periodic_cb)(void *ud);

/**
 * Open fuzzy backend
 * @param ev_base event loop used for the backend timers
 * @param config backend configuration; the "backend" key selects "sqlite"
 *        (default) or "redis", the "expire" key overrides the default expire
 * @param cfg global configuration passed to the backend init routine
 * @param err error placeholder
 * @return new backend or NULL on failure
 */
struct rspamd_fuzzy_backend *rspamd_fuzzy_backend_create(struct ev_loop *ev_base,
														 const ucl_object_t *config,
														 struct rspamd_config *cfg,
														 GError **err);


/**
 * Check a specific hash in storage
 * @param bk backend (must not be NULL)
 * @param cmd command holding the digest to look up
 * @param cb callback receiving the reply
 * @param ud opaque data for the callback
 */
void rspamd_fuzzy_backend_check(struct rspamd_fuzzy_backend *bk,
								const struct rspamd_fuzzy_cmd *cmd,
								rspamd_fuzzy_check_cb cb, void *ud);

/**
 * Process updates for a specific queue
 * @param bk backend (must not be NULL)
 * @param updates queue of struct fuzzy_peer_cmd; duplicates are marked
 *        in place before the queue is passed to the backend
 * @param src name of the update source
 * @param cb completion callback
 * @param ud opaque data for the callback
 */
void rspamd_fuzzy_backend_process_updates(struct rspamd_fuzzy_backend *bk,
										  GArray *updates, const gchar *src, rspamd_fuzzy_update_cb cb,
										  void *ud);

/**
 * Gets number of hashes from the backend
 * @param bk backend (must not be NULL)
 * @param cb callback receiving the count
 * @param ud opaque data for the callback
 */
void rspamd_fuzzy_backend_count(struct rspamd_fuzzy_backend *bk,
								rspamd_fuzzy_count_cb cb, void *ud);

/**
 * Returns number of revision for a specific source
 * @param bk backend (must not be NULL)
 * @param src name of the source
 * @param cb callback receiving the revision
 * @param ud opaque data for the callback
 */
void rspamd_fuzzy_backend_version(struct rspamd_fuzzy_backend *bk,
								  const gchar *src,
								  rspamd_fuzzy_version_cb cb, void *ud);

/**
 * Returns unique id for backend
 * @param backend backend (must not be NULL)
 * @return identifier string, or NULL if the backend type provides none
 */
const gchar *rspamd_fuzzy_backend_id(struct rspamd_fuzzy_backend *backend);

/**
 * Starts expire process for the backend: runs one periodic iteration
 * immediately and arms a jittered timer for the following ones
 * @param backend backend (must not be NULL)
 * @param timeout base period of the timer
 * @param cb optional gate callback, see rspamd_fuzzy_periodic_cb
 * @param ud opaque data for the callback
 */
void rspamd_fuzzy_backend_start_update(struct rspamd_fuzzy_backend *backend,
									   gdouble timeout,
									   rspamd_fuzzy_periodic_cb cb,
									   void *ud);

/**
 * Returns the event loop the backend was created with
 * @param backend backend
 */
struct ev_loop *rspamd_fuzzy_backend_event_base(struct rspamd_fuzzy_backend *backend);

/**
 * Returns the expire value the backend was created with
 * @param backend backend
 */
gdouble rspamd_fuzzy_backend_get_expire(struct rspamd_fuzzy_backend *backend);

/**
 * Closes backend and releases the handle; it must not be used afterwards
 * @param backend backend (must not be NULL)
 */
void rspamd_fuzzy_backend_close(struct rspamd_fuzzy_backend *backend);

#ifdef __cplusplus
}
#endif

#endif /* SRC_LIBSERVER_FUZZY_BACKEND_H_ */
+ */ + +#include "config.h" +#include "ref.h" +#include "fuzzy_backend.h" +#include "fuzzy_backend_redis.h" +#include "redis_pool.h" +#include "cryptobox.h" +#include "str_util.h" +#include "upstream.h" +#include "contrib/hiredis/hiredis.h" +#include "contrib/hiredis/async.h" +#include "lua/lua_common.h" + +#define REDIS_DEFAULT_PORT 6379 +#define REDIS_DEFAULT_OBJECT "fuzzy" +#define REDIS_DEFAULT_TIMEOUT 2.0 + +#define msg_err_redis_session(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "fuzzy_redis", session->backend->id, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_warn_redis_session(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "fuzzy_redis", session->backend->id, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_info_redis_session(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "fuzzy_redis", session->backend->id, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_redis_session(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_fuzzy_redis_log_id, "fuzzy_redis", session->backend->id, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(fuzzy_redis) + +struct rspamd_fuzzy_backend_redis { + lua_State *L; + const gchar *redis_object; + const gchar *username; + const gchar *password; + const gchar *dbname; + gchar *id; + struct rspamd_redis_pool *pool; + gdouble timeout; + gint conf_ref; + bool terminated; + ref_entry_t ref; +}; + +enum rspamd_fuzzy_redis_command { + RSPAMD_FUZZY_REDIS_COMMAND_COUNT, + RSPAMD_FUZZY_REDIS_COMMAND_VERSION, + RSPAMD_FUZZY_REDIS_COMMAND_UPDATES, + RSPAMD_FUZZY_REDIS_COMMAND_CHECK +}; + +struct rspamd_fuzzy_redis_session { + struct rspamd_fuzzy_backend_redis *backend; + redisAsyncContext *ctx; + ev_timer timeout; + const struct rspamd_fuzzy_cmd *cmd; + struct ev_loop *event_loop; + float prob; + gboolean shingles_checked; + + enum rspamd_fuzzy_redis_command command; + guint nargs; + + guint nadded; + guint ndeleted; + guint nextended; + guint nignored; + + union { + rspamd_fuzzy_check_cb cb_check; + 
rspamd_fuzzy_update_cb cb_update; + rspamd_fuzzy_version_cb cb_version; + rspamd_fuzzy_count_cb cb_count; + } callback; + void *cbdata; + + gchar **argv; + gsize *argv_lens; + struct upstream *up; + guchar found_digest[rspamd_cryptobox_HASHBYTES]; +}; + +static inline struct upstream_list * +rspamd_redis_get_servers(struct rspamd_fuzzy_backend_redis *ctx, + const gchar *what) +{ + lua_State *L = ctx->L; + struct upstream_list *res = NULL; + + lua_rawgeti(L, LUA_REGISTRYINDEX, ctx->conf_ref); + lua_pushstring(L, what); + lua_gettable(L, -2); + + if (lua_type(L, -1) == LUA_TUSERDATA) { + res = *((struct upstream_list **) lua_touserdata(L, -1)); + } + else { + struct lua_logger_trace tr; + gchar outbuf[8192]; + + memset(&tr, 0, sizeof(tr)); + lua_logger_out_type(L, -2, outbuf, sizeof(outbuf) - 1, &tr, + LUA_ESCAPE_UNPRINTABLE); + + msg_err("cannot get %s upstreams for Redis fuzzy storage %s; table content: %s", + what, ctx->id, outbuf); + } + + lua_settop(L, 0); + + return res; +} + +static inline void +rspamd_fuzzy_redis_session_free_args(struct rspamd_fuzzy_redis_session *session) +{ + guint i; + + if (session->argv) { + for (i = 0; i < session->nargs; i++) { + g_free(session->argv[i]); + } + + g_free(session->argv); + g_free(session->argv_lens); + } +} +static void +rspamd_fuzzy_redis_session_dtor(struct rspamd_fuzzy_redis_session *session, + gboolean is_fatal) +{ + redisAsyncContext *ac; + + + if (session->ctx) { + ac = session->ctx; + session->ctx = NULL; + rspamd_redis_pool_release_connection(session->backend->pool, + ac, + is_fatal ? 
RSPAMD_REDIS_RELEASE_FATAL : RSPAMD_REDIS_RELEASE_DEFAULT); + } + + ev_timer_stop(session->event_loop, &session->timeout); + rspamd_fuzzy_redis_session_free_args(session); + + REF_RELEASE(session->backend); + rspamd_upstream_unref(session->up); + g_free(session); +} + +static void +rspamd_fuzzy_backend_redis_dtor(struct rspamd_fuzzy_backend_redis *backend) +{ + if (!backend->terminated && backend->conf_ref != -1) { + luaL_unref(backend->L, LUA_REGISTRYINDEX, backend->conf_ref); + } + + if (backend->id) { + g_free(backend->id); + } + + g_free(backend); +} + +void * +rspamd_fuzzy_backend_init_redis(struct rspamd_fuzzy_backend *bk, + const ucl_object_t *obj, struct rspamd_config *cfg, GError **err) +{ + struct rspamd_fuzzy_backend_redis *backend; + const ucl_object_t *elt; + gboolean ret = FALSE; + guchar id_hash[rspamd_cryptobox_HASHBYTES]; + rspamd_cryptobox_hash_state_t st; + lua_State *L = (lua_State *) cfg->lua_state; + gint conf_ref = -1; + + backend = g_malloc0(sizeof(*backend)); + + backend->timeout = REDIS_DEFAULT_TIMEOUT; + backend->redis_object = REDIS_DEFAULT_OBJECT; + backend->L = L; + + ret = rspamd_lua_try_load_redis(L, obj, cfg, &conf_ref); + + /* Now try global redis settings */ + if (!ret) { + elt = ucl_object_lookup(cfg->cfg_ucl_obj, "redis"); + + if (elt) { + const ucl_object_t *specific_obj; + + specific_obj = ucl_object_lookup_any(elt, "fuzzy", "fuzzy_storage", + NULL); + + if (specific_obj) { + ret = rspamd_lua_try_load_redis(L, specific_obj, cfg, &conf_ref); + } + else { + ret = rspamd_lua_try_load_redis(L, elt, cfg, &conf_ref); + } + } + } + + if (!ret) { + msg_err_config("cannot init redis backend for fuzzy storage"); + g_free(backend); + + return NULL; + } + + elt = ucl_object_lookup(obj, "prefix"); + if (elt == NULL || ucl_object_type(elt) != UCL_STRING) { + backend->redis_object = REDIS_DEFAULT_OBJECT; + } + else { + backend->redis_object = ucl_object_tostring(elt); + } + + backend->conf_ref = conf_ref; + + /* Check some common table 
values */ + lua_rawgeti(L, LUA_REGISTRYINDEX, conf_ref); + + lua_pushstring(L, "timeout"); + lua_gettable(L, -2); + if (lua_type(L, -1) == LUA_TNUMBER) { + backend->timeout = lua_tonumber(L, -1); + } + lua_pop(L, 1); + + lua_pushstring(L, "db"); + lua_gettable(L, -2); + if (lua_type(L, -1) == LUA_TSTRING) { + backend->dbname = rspamd_mempool_strdup(cfg->cfg_pool, + lua_tostring(L, -1)); + } + lua_pop(L, 1); + + lua_pushstring(L, "username"); + lua_gettable(L, -2); + if (lua_type(L, -1) == LUA_TSTRING) { + backend->username = rspamd_mempool_strdup(cfg->cfg_pool, + lua_tostring(L, -1)); + } + lua_pop(L, 1); + + lua_pushstring(L, "password"); + lua_gettable(L, -2); + if (lua_type(L, -1) == LUA_TSTRING) { + backend->password = rspamd_mempool_strdup(cfg->cfg_pool, + lua_tostring(L, -1)); + } + lua_pop(L, 1); + + lua_settop(L, 0); + + REF_INIT_RETAIN(backend, rspamd_fuzzy_backend_redis_dtor); + backend->pool = cfg->redis_pool; + rspamd_cryptobox_hash_init(&st, NULL, 0); + rspamd_cryptobox_hash_update(&st, backend->redis_object, + strlen(backend->redis_object)); + + if (backend->dbname) { + rspamd_cryptobox_hash_update(&st, backend->dbname, + strlen(backend->dbname)); + } + + if (backend->username) { + rspamd_cryptobox_hash_update(&st, backend->username, + strlen(backend->username)); + } + + if (backend->password) { + rspamd_cryptobox_hash_update(&st, backend->password, + strlen(backend->password)); + } + + rspamd_cryptobox_hash_final(&st, id_hash); + backend->id = rspamd_encode_base32(id_hash, sizeof(id_hash), RSPAMD_BASE32_DEFAULT); + + return backend; +} + +static void +rspamd_fuzzy_redis_timeout(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_fuzzy_redis_session *session = + (struct rspamd_fuzzy_redis_session *) w->data; + redisAsyncContext *ac; + static char errstr[128]; + + if (session->ctx) { + ac = session->ctx; + session->ctx = NULL; + ac->err = REDIS_ERR_IO; + /* Should be safe as in hiredis it is char[128] */ + rspamd_snprintf(errstr, sizeof(errstr), "%s", 
strerror(ETIMEDOUT)); + ac->errstr = errstr; + + /* This will cause session closing */ + rspamd_redis_pool_release_connection(session->backend->pool, + ac, RSPAMD_REDIS_RELEASE_FATAL); + } +} + +static void rspamd_fuzzy_redis_check_callback(redisAsyncContext *c, gpointer r, + gpointer priv); + +struct _rspamd_fuzzy_shingles_helper { + guchar digest[64]; + guint found; +}; + +static gint +rspamd_fuzzy_backend_redis_shingles_cmp(const void *a, const void *b) +{ + const struct _rspamd_fuzzy_shingles_helper *sha = a, + *shb = b; + + return memcmp(sha->digest, shb->digest, sizeof(sha->digest)); +} + +static void +rspamd_fuzzy_redis_shingles_callback(redisAsyncContext *c, gpointer r, + gpointer priv) +{ + struct rspamd_fuzzy_redis_session *session = priv; + redisReply *reply = r, *cur; + struct rspamd_fuzzy_reply rep; + GString *key; + struct _rspamd_fuzzy_shingles_helper *shingles, *prev = NULL, *sel = NULL; + guint i, found = 0, max_found = 0, cur_found = 0; + + ev_timer_stop(session->event_loop, &session->timeout); + memset(&rep, 0, sizeof(rep)); + + if (c->err == 0 && reply != NULL) { + rspamd_upstream_ok(session->up); + + if (reply->type == REDIS_REPLY_ARRAY && + reply->elements == RSPAMD_SHINGLE_SIZE) { + shingles = g_alloca(sizeof(struct _rspamd_fuzzy_shingles_helper) * + RSPAMD_SHINGLE_SIZE); + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + cur = reply->element[i]; + + if (cur->type == REDIS_REPLY_STRING) { + shingles[i].found = 1; + memcpy(shingles[i].digest, cur->str, MIN(64, cur->len)); + found++; + } + else { + memset(shingles[i].digest, 0, sizeof(shingles[i].digest)); + shingles[i].found = 0; + } + } + + if (found > RSPAMD_SHINGLE_SIZE / 2) { + /* Now sort to find the most frequent element */ + qsort(shingles, RSPAMD_SHINGLE_SIZE, + sizeof(struct _rspamd_fuzzy_shingles_helper), + rspamd_fuzzy_backend_redis_shingles_cmp); + + prev = &shingles[0]; + + for (i = 1; i < RSPAMD_SHINGLE_SIZE; i++) { + if (!shingles[i].found) { + continue; + } + + if 
(memcmp(shingles[i].digest, prev->digest, 64) == 0) { + cur_found++; + + if (cur_found > max_found) { + max_found = cur_found; + sel = &shingles[i]; + } + } + else { + cur_found = 1; + prev = &shingles[i]; + } + } + + if (max_found > RSPAMD_SHINGLE_SIZE / 2) { + session->prob = ((float) max_found) / RSPAMD_SHINGLE_SIZE; + rep.v1.prob = session->prob; + + g_assert(sel != NULL); + + /* Prepare new check command */ + rspamd_fuzzy_redis_session_free_args(session); + session->nargs = 5; + session->argv = g_malloc(sizeof(gchar *) * session->nargs); + session->argv_lens = g_malloc(sizeof(gsize) * session->nargs); + + key = g_string_new(session->backend->redis_object); + g_string_append_len(key, sel->digest, sizeof(sel->digest)); + session->argv[0] = g_strdup("HMGET"); + session->argv_lens[0] = 5; + session->argv[1] = key->str; + session->argv_lens[1] = key->len; + session->argv[2] = g_strdup("V"); + session->argv_lens[2] = 1; + session->argv[3] = g_strdup("F"); + session->argv_lens[3] = 1; + session->argv[4] = g_strdup("C"); + session->argv_lens[4] = 1; + g_string_free(key, FALSE); /* Do not free underlying array */ + memcpy(session->found_digest, sel->digest, + sizeof(session->cmd->digest)); + + g_assert(session->ctx != NULL); + if (redisAsyncCommandArgv(session->ctx, + rspamd_fuzzy_redis_check_callback, + session, session->nargs, + (const gchar **) session->argv, + session->argv_lens) != REDIS_OK) { + + if (session->callback.cb_check) { + memset(&rep, 0, sizeof(rep)); + session->callback.cb_check(&rep, session->cbdata); + } + + rspamd_fuzzy_redis_session_dtor(session, TRUE); + } + else { + /* Add timeout */ + session->timeout.data = session; + ev_now_update_if_cheap((struct ev_loop *) session->event_loop); + ev_timer_init(&session->timeout, + rspamd_fuzzy_redis_timeout, + session->backend->timeout, 0.0); + ev_timer_start(session->event_loop, &session->timeout); + } + + return; + } + } + } + else if (reply->type == REDIS_REPLY_ERROR) { + msg_err_redis_session("fuzzy 
backend redis error: \"%s\"", + reply->str); + } + + if (session->callback.cb_check) { + session->callback.cb_check(&rep, session->cbdata); + } + } + else { + if (session->callback.cb_check) { + session->callback.cb_check(&rep, session->cbdata); + } + + if (c->errstr) { + msg_err_redis_session("error getting shingles: %s", c->errstr); + rspamd_upstream_fail(session->up, FALSE, c->errstr); + } + } + + rspamd_fuzzy_redis_session_dtor(session, FALSE); +} + +static void +rspamd_fuzzy_backend_check_shingles(struct rspamd_fuzzy_redis_session *session) +{ + struct rspamd_fuzzy_reply rep; + const struct rspamd_fuzzy_shingle_cmd *shcmd; + GString *key; + guint i, init_len; + + rspamd_fuzzy_redis_session_free_args(session); + /* First of all check digest */ + session->nargs = RSPAMD_SHINGLE_SIZE + 1; + session->argv = g_malloc(sizeof(gchar *) * session->nargs); + session->argv_lens = g_malloc(sizeof(gsize) * session->nargs); + shcmd = (const struct rspamd_fuzzy_shingle_cmd *) session->cmd; + + session->argv[0] = g_strdup("MGET"); + session->argv_lens[0] = 4; + init_len = strlen(session->backend->redis_object); + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + + key = g_string_sized_new(init_len + 2 + 2 + sizeof("18446744073709551616")); + rspamd_printf_gstring(key, "%s_%d_%uL", session->backend->redis_object, + i, shcmd->sgl.hashes[i]); + session->argv[i + 1] = key->str; + session->argv_lens[i + 1] = key->len; + g_string_free(key, FALSE); /* Do not free underlying array */ + } + + session->shingles_checked = TRUE; + + g_assert(session->ctx != NULL); + + if (redisAsyncCommandArgv(session->ctx, rspamd_fuzzy_redis_shingles_callback, + session, session->nargs, + (const gchar **) session->argv, session->argv_lens) != REDIS_OK) { + msg_err("cannot execute redis command on %s: %s", + rspamd_inet_address_to_string_pretty(rspamd_upstream_addr_cur(session->up)), + session->ctx->errstr); + + if (session->callback.cb_check) { + memset(&rep, 0, sizeof(rep)); + 
session->callback.cb_check(&rep, session->cbdata); + } + + rspamd_fuzzy_redis_session_dtor(session, TRUE); + } + else { + /* Add timeout */ + session->timeout.data = session; + ev_now_update_if_cheap((struct ev_loop *) session->event_loop); + ev_timer_init(&session->timeout, + rspamd_fuzzy_redis_timeout, + session->backend->timeout, 0.0); + ev_timer_start(session->event_loop, &session->timeout); + } +} + +static void +rspamd_fuzzy_redis_check_callback(redisAsyncContext *c, gpointer r, + gpointer priv) +{ + struct rspamd_fuzzy_redis_session *session = priv; + redisReply *reply = r, *cur; + struct rspamd_fuzzy_reply rep; + gulong value; + guint found_elts = 0; + + ev_timer_stop(session->event_loop, &session->timeout); + memset(&rep, 0, sizeof(rep)); + + if (c->err == 0 && reply != NULL) { + rspamd_upstream_ok(session->up); + + if (reply->type == REDIS_REPLY_ARRAY && reply->elements >= 2) { + cur = reply->element[0]; + + if (cur->type == REDIS_REPLY_STRING) { + value = strtoul(cur->str, NULL, 10); + rep.v1.value = value; + found_elts++; + } + + cur = reply->element[1]; + + if (cur->type == REDIS_REPLY_STRING) { + value = strtoul(cur->str, NULL, 10); + rep.v1.flag = value; + found_elts++; + } + + if (found_elts >= 2) { + rep.v1.prob = session->prob; + memcpy(rep.digest, session->found_digest, sizeof(rep.digest)); + } + + rep.ts = 0; + + if (reply->elements > 2) { + cur = reply->element[2]; + + if (cur->type == REDIS_REPLY_STRING) { + rep.ts = strtoul(cur->str, NULL, 10); + } + } + } + else if (reply->type == REDIS_REPLY_ERROR) { + msg_err_redis_session("fuzzy backend redis error: \"%s\"", + reply->str); + } + + if (found_elts < 2) { + if (session->cmd->shingles_count > 0 && !session->shingles_checked) { + /* We also need to check all shingles here */ + rspamd_fuzzy_backend_check_shingles(session); + /* Do not free session */ + return; + } + else { + if (session->callback.cb_check) { + session->callback.cb_check(&rep, session->cbdata); + } + } + } + else { + if 
(session->callback.cb_check) { + session->callback.cb_check(&rep, session->cbdata); + } + } + } + else { + if (session->callback.cb_check) { + session->callback.cb_check(&rep, session->cbdata); + } + + if (c->errstr) { + msg_err_redis_session("error getting hashes on %s: %s", + rspamd_inet_address_to_string_pretty(rspamd_upstream_addr_cur(session->up)), + c->errstr); + rspamd_upstream_fail(session->up, FALSE, c->errstr); + } + } + + rspamd_fuzzy_redis_session_dtor(session, FALSE); +} + +void rspamd_fuzzy_backend_check_redis(struct rspamd_fuzzy_backend *bk, + const struct rspamd_fuzzy_cmd *cmd, + rspamd_fuzzy_check_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + struct rspamd_fuzzy_redis_session *session; + struct upstream *up; + struct upstream_list *ups; + rspamd_inet_addr_t *addr; + struct rspamd_fuzzy_reply rep; + GString *key; + + g_assert(backend != NULL); + + ups = rspamd_redis_get_servers(backend, "read_servers"); + if (!ups) { + if (cb) { + memset(&rep, 0, sizeof(rep)); + cb(&rep, ud); + } + + return; + } + + session = g_malloc0(sizeof(*session)); + session->backend = backend; + REF_RETAIN(session->backend); + + session->callback.cb_check = cb; + session->cbdata = ud; + session->command = RSPAMD_FUZZY_REDIS_COMMAND_CHECK; + session->cmd = cmd; + session->prob = 1.0; + memcpy(rep.digest, session->cmd->digest, sizeof(rep.digest)); + memcpy(session->found_digest, session->cmd->digest, sizeof(rep.digest)); + session->event_loop = rspamd_fuzzy_backend_event_base(bk); + + /* First of all check digest */ + session->nargs = 5; + session->argv = g_malloc(sizeof(gchar *) * session->nargs); + session->argv_lens = g_malloc(sizeof(gsize) * session->nargs); + + key = g_string_new(backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + session->argv[0] = g_strdup("HMGET"); + session->argv_lens[0] = 5; + session->argv[1] = key->str; + session->argv_lens[1] = key->len; + session->argv[2] = 
g_strdup("V"); + session->argv_lens[2] = 1; + session->argv[3] = g_strdup("F"); + session->argv_lens[3] = 1; + session->argv[4] = g_strdup("C"); + session->argv_lens[4] = 1; + g_string_free(key, FALSE); /* Do not free underlying array */ + + up = rspamd_upstream_get(ups, + RSPAMD_UPSTREAM_ROUND_ROBIN, + NULL, + 0); + + session->up = rspamd_upstream_ref(up); + addr = rspamd_upstream_addr_next(up); + g_assert(addr != NULL); + session->ctx = rspamd_redis_pool_connect(backend->pool, + backend->dbname, + backend->username, backend->password, + rspamd_inet_address_to_string(addr), + rspamd_inet_address_get_port(addr)); + + if (session->ctx == NULL) { + rspamd_upstream_fail(up, TRUE, strerror(errno)); + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + memset(&rep, 0, sizeof(rep)); + cb(&rep, ud); + } + } + else { + if (redisAsyncCommandArgv(session->ctx, rspamd_fuzzy_redis_check_callback, + session, session->nargs, + (const gchar **) session->argv, session->argv_lens) != REDIS_OK) { + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + memset(&rep, 0, sizeof(rep)); + cb(&rep, ud); + } + } + else { + /* Add timeout */ + session->timeout.data = session; + ev_now_update_if_cheap((struct ev_loop *) session->event_loop); + ev_timer_init(&session->timeout, + rspamd_fuzzy_redis_timeout, + session->backend->timeout, 0.0); + ev_timer_start(session->event_loop, &session->timeout); + } + } +} + +static void +rspamd_fuzzy_redis_count_callback(redisAsyncContext *c, gpointer r, + gpointer priv) +{ + struct rspamd_fuzzy_redis_session *session = priv; + redisReply *reply = r; + gulong nelts; + + ev_timer_stop(session->event_loop, &session->timeout); + + if (c->err == 0 && reply != NULL) { + rspamd_upstream_ok(session->up); + + if (reply->type == REDIS_REPLY_INTEGER) { + if (session->callback.cb_count) { + session->callback.cb_count(reply->integer, session->cbdata); + } + } + else if (reply->type == REDIS_REPLY_STRING) { + nelts = strtoul(reply->str, NULL, 10); 
+ + if (session->callback.cb_count) { + session->callback.cb_count(nelts, session->cbdata); + } + } + else { + if (reply->type == REDIS_REPLY_ERROR) { + msg_err_redis_session("fuzzy backend redis error: \"%s\"", + reply->str); + } + if (session->callback.cb_count) { + session->callback.cb_count(0, session->cbdata); + } + } + } + else { + if (session->callback.cb_count) { + session->callback.cb_count(0, session->cbdata); + } + + if (c->errstr) { + msg_err_redis_session("error getting count on %s: %s", + rspamd_inet_address_to_string_pretty(rspamd_upstream_addr_cur(session->up)), + c->errstr); + rspamd_upstream_fail(session->up, FALSE, c->errstr); + } + } + + rspamd_fuzzy_redis_session_dtor(session, FALSE); +} + +void rspamd_fuzzy_backend_count_redis(struct rspamd_fuzzy_backend *bk, + rspamd_fuzzy_count_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + struct rspamd_fuzzy_redis_session *session; + struct upstream *up; + struct upstream_list *ups; + rspamd_inet_addr_t *addr; + GString *key; + + g_assert(backend != NULL); + + ups = rspamd_redis_get_servers(backend, "read_servers"); + if (!ups) { + if (cb) { + cb(0, ud); + } + + return; + } + + session = g_malloc0(sizeof(*session)); + session->backend = backend; + REF_RETAIN(session->backend); + + session->callback.cb_count = cb; + session->cbdata = ud; + session->command = RSPAMD_FUZZY_REDIS_COMMAND_COUNT; + session->event_loop = rspamd_fuzzy_backend_event_base(bk); + + session->nargs = 2; + session->argv = g_malloc(sizeof(gchar *) * 2); + session->argv_lens = g_malloc(sizeof(gsize) * 2); + key = g_string_new(backend->redis_object); + g_string_append(key, "_count"); + session->argv[0] = g_strdup("GET"); + session->argv_lens[0] = 3; + session->argv[1] = key->str; + session->argv_lens[1] = key->len; + g_string_free(key, FALSE); /* Do not free underlying array */ + + up = rspamd_upstream_get(ups, + RSPAMD_UPSTREAM_ROUND_ROBIN, + NULL, + 0); + + session->up = 
rspamd_upstream_ref(up); + addr = rspamd_upstream_addr_next(up); + g_assert(addr != NULL); + session->ctx = rspamd_redis_pool_connect(backend->pool, + backend->dbname, + backend->username, backend->password, + rspamd_inet_address_to_string(addr), + rspamd_inet_address_get_port(addr)); + + if (session->ctx == NULL) { + rspamd_upstream_fail(up, TRUE, strerror(errno)); + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + cb(0, ud); + } + } + else { + if (redisAsyncCommandArgv(session->ctx, rspamd_fuzzy_redis_count_callback, + session, session->nargs, + (const gchar **) session->argv, session->argv_lens) != REDIS_OK) { + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + cb(0, ud); + } + } + else { + /* Add timeout */ + session->timeout.data = session; + ev_now_update_if_cheap((struct ev_loop *) session->event_loop); + ev_timer_init(&session->timeout, + rspamd_fuzzy_redis_timeout, + session->backend->timeout, 0.0); + ev_timer_start(session->event_loop, &session->timeout); + } + } +} + +static void +rspamd_fuzzy_redis_version_callback(redisAsyncContext *c, gpointer r, + gpointer priv) +{ + struct rspamd_fuzzy_redis_session *session = priv; + redisReply *reply = r; + gulong nelts; + + ev_timer_stop(session->event_loop, &session->timeout); + + if (c->err == 0 && reply != NULL) { + rspamd_upstream_ok(session->up); + + if (reply->type == REDIS_REPLY_INTEGER) { + if (session->callback.cb_version) { + session->callback.cb_version(reply->integer, session->cbdata); + } + } + else if (reply->type == REDIS_REPLY_STRING) { + nelts = strtoul(reply->str, NULL, 10); + + if (session->callback.cb_version) { + session->callback.cb_version(nelts, session->cbdata); + } + } + else { + if (reply->type == REDIS_REPLY_ERROR) { + msg_err_redis_session("fuzzy backend redis error: \"%s\"", + reply->str); + } + if (session->callback.cb_version) { + session->callback.cb_version(0, session->cbdata); + } + } + } + else { + if (session->callback.cb_version) { + 
session->callback.cb_version(0, session->cbdata); + } + + if (c->errstr) { + msg_err_redis_session("error getting version on %s: %s", + rspamd_inet_address_to_string_pretty(rspamd_upstream_addr_cur(session->up)), + c->errstr); + rspamd_upstream_fail(session->up, FALSE, c->errstr); + } + } + + rspamd_fuzzy_redis_session_dtor(session, FALSE); +} + +void rspamd_fuzzy_backend_version_redis(struct rspamd_fuzzy_backend *bk, + const gchar *src, + rspamd_fuzzy_version_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + struct rspamd_fuzzy_redis_session *session; + struct upstream *up; + struct upstream_list *ups; + rspamd_inet_addr_t *addr; + GString *key; + + g_assert(backend != NULL); + + ups = rspamd_redis_get_servers(backend, "read_servers"); + if (!ups) { + if (cb) { + cb(0, ud); + } + + return; + } + + session = g_malloc0(sizeof(*session)); + session->backend = backend; + REF_RETAIN(session->backend); + + session->callback.cb_version = cb; + session->cbdata = ud; + session->command = RSPAMD_FUZZY_REDIS_COMMAND_VERSION; + session->event_loop = rspamd_fuzzy_backend_event_base(bk); + + session->nargs = 2; + session->argv = g_malloc(sizeof(gchar *) * 2); + session->argv_lens = g_malloc(sizeof(gsize) * 2); + key = g_string_new(backend->redis_object); + g_string_append(key, src); + session->argv[0] = g_strdup("GET"); + session->argv_lens[0] = 3; + session->argv[1] = key->str; + session->argv_lens[1] = key->len; + g_string_free(key, FALSE); /* Do not free underlying array */ + + up = rspamd_upstream_get(ups, + RSPAMD_UPSTREAM_ROUND_ROBIN, + NULL, + 0); + + session->up = rspamd_upstream_ref(up); + addr = rspamd_upstream_addr_next(up); + g_assert(addr != NULL); + session->ctx = rspamd_redis_pool_connect(backend->pool, + backend->dbname, + backend->username, backend->password, + rspamd_inet_address_to_string(addr), + rspamd_inet_address_get_port(addr)); + + if (session->ctx == NULL) { + rspamd_upstream_fail(up, FALSE, 
strerror(errno)); + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + cb(0, ud); + } + } + else { + if (redisAsyncCommandArgv(session->ctx, rspamd_fuzzy_redis_version_callback, + session, session->nargs, + (const gchar **) session->argv, session->argv_lens) != REDIS_OK) { + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + cb(0, ud); + } + } + else { + /* Add timeout */ + session->timeout.data = session; + ev_now_update_if_cheap((struct ev_loop *) session->event_loop); + ev_timer_init(&session->timeout, + rspamd_fuzzy_redis_timeout, + session->backend->timeout, 0.0); + ev_timer_start(session->event_loop, &session->timeout); + } + } +} +/* + * Return the identifier stored in the redis backend state (backend->id). + * + * bk: generic backend handle, not used here + * subr_ud: struct rspamd_fuzzy_backend_redis, must not be NULL (asserted) + */ +const gchar * +rspamd_fuzzy_backend_id_redis(struct rspamd_fuzzy_backend *bk, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + g_assert(backend != NULL); + + return backend->id; +} +/* + * Expire subroutine of the redis backend. It currently does no work beyond + * asserting that the backend state is present; key lifetimes are requested + * from redis itself (EXPIRE/SETEX) when updates are written. + * + * bk: generic backend handle, not used here + * subr_ud: struct rspamd_fuzzy_backend_redis, must not be NULL (asserted) + */ +void rspamd_fuzzy_backend_expire_redis(struct rspamd_fuzzy_backend *bk, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + + g_assert(backend != NULL); +} +/* + * Queue on session->ctx all redis commands needed for one update command. + * + * Command arguments are stored in session->argv / session->argv_lens starting + * at slot *shift, so the arrays must already be large enough for them. + * + * bk: generic backend handle, used to obtain the expire time + * session: update session owning the redis context and the argument arrays + * io_cmd: update to translate (plain command or command with shingles) + * shift: in/out index of the first free argv slot; advanced only on success + * + * Returns TRUE when every command was queued, FALSE as soon as + * redisAsyncCommandArgv() fails (*shift is left unchanged in that case). + */ +static gboolean +rspamd_fuzzy_update_append_command(struct rspamd_fuzzy_backend *bk, + struct rspamd_fuzzy_redis_session *session, + struct fuzzy_peer_cmd *io_cmd, guint *shift) +{ + GString *key, *value; + guint cur_shift = *shift; + guint i, klen; + struct rspamd_fuzzy_cmd *cmd; + + if (io_cmd->is_shingle) { + cmd = &io_cmd->cmd.shingle.basic; + } + else { + cmd = &io_cmd->cmd.normal; + } + + if (cmd->cmd == FUZZY_WRITE) { + /* + * For each normal hash addition we do 5 redis commands: + * HSET <key> F <flag> (HSETNX instead when the weak flag is set) + * HSETNX <key> C <time> + * HINCRBY <key> V <weight> + * EXPIRE <key> <expire> + * INCR <prefix>_count + * Where <key> is <prefix> || <digest> + */ + + /* HSET */ + klen = strlen(session->backend->redis_object) + + sizeof(cmd->digest) + 1; + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + value = 
g_string_sized_new(sizeof("4294967296")); + rspamd_printf_gstring(value, "%d", cmd->flag); + + if (cmd->version & RSPAMD_FUZZY_FLAG_WEAK) { + session->argv[cur_shift] = g_strdup("HSETNX"); + session->argv_lens[cur_shift++] = sizeof("HSETNX") - 1; + } + else { + session->argv[cur_shift] = g_strdup("HSET"); + session->argv_lens[cur_shift++] = sizeof("HSET") - 1; + } + + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = g_strdup("F"); + session->argv_lens[cur_shift++] = sizeof("F") - 1; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 4, + (const gchar **) &session->argv[cur_shift - 4], + &session->argv_lens[cur_shift - 4]) != REDIS_OK) { + + return FALSE; + } + + /* HSETNX */ + klen = strlen(session->backend->redis_object) + + sizeof(cmd->digest) + 1; + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + value = g_string_sized_new(sizeof("18446744073709551616")); + rspamd_printf_gstring(value, "%L", (gint64) rspamd_get_calendar_ticks()); + session->argv[cur_shift] = g_strdup("HSETNX"); + session->argv_lens[cur_shift++] = sizeof("HSETNX") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = g_strdup("C"); + session->argv_lens[cur_shift++] = sizeof("C") - 1; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 4, + (const gchar **) &session->argv[cur_shift - 4], + &session->argv_lens[cur_shift - 4]) != REDIS_OK) { + + return FALSE; + } + + /* HINCRBY */ + key = g_string_sized_new(klen); + g_string_append(key, 
session->backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + value = g_string_sized_new(sizeof("4294967296")); + rspamd_printf_gstring(value, "%d", cmd->value); + session->argv[cur_shift] = g_strdup("HINCRBY"); + session->argv_lens[cur_shift++] = sizeof("HINCRBY") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = g_strdup("V"); + session->argv_lens[cur_shift++] = sizeof("V") - 1; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 4, + (const gchar **) &session->argv[cur_shift - 4], + &session->argv_lens[cur_shift - 4]) != REDIS_OK) { + + return FALSE; + } + + /* EXPIRE */ + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + value = g_string_sized_new(sizeof("4294967296")); + rspamd_printf_gstring(value, "%d", + (gint) rspamd_fuzzy_backend_get_expire(bk)); + session->argv[cur_shift] = g_strdup("EXPIRE"); + session->argv_lens[cur_shift++] = sizeof("EXPIRE") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 3, + (const gchar **) &session->argv[cur_shift - 3], + &session->argv_lens[cur_shift - 3]) != REDIS_OK) { + + return FALSE; + } + + /* INCR */ + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append(key, "_count"); + session->argv[cur_shift] = g_strdup("INCR"); + session->argv_lens[cur_shift++] = sizeof("INCR") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + 
g_string_free(key, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 2, + (const gchar **) &session->argv[cur_shift - 2], + &session->argv_lens[cur_shift - 2]) != REDIS_OK) { + + return FALSE; + } + } + else if (cmd->cmd == FUZZY_DEL) { + /* DEL */ + klen = strlen(session->backend->redis_object) + + sizeof(cmd->digest) + 1; + + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + session->argv[cur_shift] = g_strdup("DEL"); + session->argv_lens[cur_shift++] = sizeof("DEL") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + g_string_free(key, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 2, + (const gchar **) &session->argv[cur_shift - 2], + &session->argv_lens[cur_shift - 2]) != REDIS_OK) { + + return FALSE; + } + + /* DECR */ + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append(key, "_count"); + session->argv[cur_shift] = g_strdup("DECR"); + session->argv_lens[cur_shift++] = sizeof("DECR") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + g_string_free(key, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 2, + (const gchar **) &session->argv[cur_shift - 2], + &session->argv_lens[cur_shift - 2]) != REDIS_OK) { + + return FALSE; + } + } + else if (cmd->cmd == FUZZY_REFRESH) { + /* + * Issue refresh command by just EXPIRE command + * EXPIRE <key> <expire> + * Where <key> is <prefix> || <digest> + */ + + klen = strlen(session->backend->redis_object) + + sizeof(cmd->digest) + 1; + + /* EXPIRE */ + key = g_string_sized_new(klen); + g_string_append(key, session->backend->redis_object); + g_string_append_len(key, cmd->digest, sizeof(cmd->digest)); + value = g_string_sized_new(sizeof("4294967296")); + rspamd_printf_gstring(value, "%d", + (gint) rspamd_fuzzy_backend_get_expire(bk)); + 
session->argv[cur_shift] = g_strdup("EXPIRE"); + session->argv_lens[cur_shift++] = sizeof("EXPIRE") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 3, + (const gchar **) &session->argv[cur_shift - 3], + &session->argv_lens[cur_shift - 3]) != REDIS_OK) { + + return FALSE; + } + } + else if (cmd->cmd == FUZZY_DUP) { + /* Ignore */ + } + else { + g_assert_not_reached(); + } + + if (io_cmd->is_shingle) { + if (cmd->cmd == FUZZY_WRITE) { + klen = strlen(session->backend->redis_object) + + 64 + 1; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + guchar *hval; + /* + * For each command with shingles we additionally emit 32 commands: + * SETEX <prefix>_<number>_<value> <expire> <digest> + */ + + /* SETEX */ + key = g_string_sized_new(klen); + rspamd_printf_gstring(key, "%s_%d_%uL", + session->backend->redis_object, + i, + io_cmd->cmd.shingle.sgl.hashes[i]); + value = g_string_sized_new(sizeof("4294967296")); + rspamd_printf_gstring(value, "%d", + (gint) rspamd_fuzzy_backend_get_expire(bk)); + hval = g_malloc(sizeof(io_cmd->cmd.shingle.basic.digest)); + memcpy(hval, io_cmd->cmd.shingle.basic.digest, + sizeof(io_cmd->cmd.shingle.basic.digest)); + session->argv[cur_shift] = g_strdup("SETEX"); + session->argv_lens[cur_shift++] = sizeof("SETEX") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + session->argv[cur_shift] = hval; + session->argv_lens[cur_shift++] = sizeof(io_cmd->cmd.shingle.basic.digest); + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 4, + (const gchar **) &session->argv[cur_shift - 4], + &session->argv_lens[cur_shift - 
4]) != REDIS_OK) { + + return FALSE; + } + } + } + else if (cmd->cmd == FUZZY_DEL) { + klen = strlen(session->backend->redis_object) + + 64 + 1; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + key = g_string_sized_new(klen); + rspamd_printf_gstring(key, "%s_%d_%uL", + session->backend->redis_object, + i, + io_cmd->cmd.shingle.sgl.hashes[i]); + session->argv[cur_shift] = g_strdup("DEL"); + session->argv_lens[cur_shift++] = sizeof("DEL") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + g_string_free(key, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 2, + (const gchar **) &session->argv[cur_shift - 2], + &session->argv_lens[cur_shift - 2]) != REDIS_OK) { + + return FALSE; + } + } + } + else if (cmd->cmd == FUZZY_REFRESH) { + klen = strlen(session->backend->redis_object) + + 64 + 1; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + /* + * For each command with shingles we additionally emit 32 commands: + * EXPIRE <prefix>_<number>_<value> <expire> + */ + + /* Expire */ + key = g_string_sized_new(klen); + rspamd_printf_gstring(key, "%s_%d_%uL", + session->backend->redis_object, + i, + io_cmd->cmd.shingle.sgl.hashes[i]); + value = g_string_sized_new(sizeof("18446744073709551616")); + rspamd_printf_gstring(value, "%d", + (gint) rspamd_fuzzy_backend_get_expire(bk)); + session->argv[cur_shift] = g_strdup("EXPIRE"); + session->argv_lens[cur_shift++] = sizeof("EXPIRE") - 1; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + session->argv[cur_shift] = value->str; + session->argv_lens[cur_shift++] = value->len; + g_string_free(key, FALSE); + g_string_free(value, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 3, + (const gchar **) &session->argv[cur_shift - 3], + &session->argv_lens[cur_shift - 3]) != REDIS_OK) { + + return FALSE; + } + } + } + else if (cmd->cmd == FUZZY_DUP) { + /* Ignore */ + } + else { + g_assert_not_reached(); + } + } + + *shift = 
cur_shift; + + return TRUE; +} +/* + * Reply handler for the EXEC command that closes an update transaction. + * + * Stops the request timer. An array reply is treated as success and the + * counters collected in the session (nadded, ndeleted, nextended, nignored) + * are passed to cb_update. Any other reply type, a NULL reply or a + * connection error is reported as failure with zeroed counters; a connection + * error string additionally marks the upstream as failed. + * The session is destroyed before returning. + * + * c: hiredis async context the reply arrived on + * r: redisReply pointer (may be NULL) + * priv: struct rspamd_fuzzy_redis_session of the request + */ +static void +rspamd_fuzzy_redis_update_callback(redisAsyncContext *c, gpointer r, + gpointer priv) +{ + struct rspamd_fuzzy_redis_session *session = priv; + redisReply *reply = r; + + ev_timer_stop(session->event_loop, &session->timeout); + + if (c->err == 0 && reply != NULL) { + rspamd_upstream_ok(session->up); + + if (reply->type == REDIS_REPLY_ARRAY) { + /* TODO: check all replies somehow */ + if (session->callback.cb_update) { + session->callback.cb_update(TRUE, + session->nadded, + session->ndeleted, + session->nextended, + session->nignored, + session->cbdata); + } + } + else { + if (reply->type == REDIS_REPLY_ERROR) { + msg_err_redis_session("fuzzy backend redis error: \"%s\"", + reply->str); + } + if (session->callback.cb_update) { + session->callback.cb_update(FALSE, 0, 0, 0, 0, session->cbdata); + } + } + } + else { + if (session->callback.cb_update) { + session->callback.cb_update(FALSE, 0, 0, 0, 0, session->cbdata); + } + + if (c->errstr) { + msg_err_redis_session("error sending update to redis %s: %s", + rspamd_inet_address_to_string_pretty(rspamd_upstream_addr_cur(session->up)), + c->errstr); + rspamd_upstream_fail(session->up, FALSE, c->errstr); + } + } + + rspamd_fuzzy_redis_session_dtor(session, FALSE); +} +/* + * Apply a batch of updates to redis as a single MULTI ... EXEC transaction. + * + * An upstream is taken from the "write_servers" list. The commands of every + * update are queued one by one, followed by an INCR of the key made of the + * redis prefix and src, and finally EXEC, whose reply is handled by + * rspamd_fuzzy_redis_update_callback. + * + * bk: generic backend handle (event loop and expire time source) + * updates: GArray of struct fuzzy_peer_cmd + * src: suffix appended to the redis prefix to form the key that is + * incremented once per batch + * cb: optional callback, called as cb(success, nadded, ndeleted, + * nextended, nignored, ud); when the transaction cannot be started + * it is called with FALSE and zero counters + * ud: opaque pointer passed back to cb + * subr_ud: struct rspamd_fuzzy_backend_redis, must not be NULL (asserted) + */ +void rspamd_fuzzy_backend_update_redis(struct rspamd_fuzzy_backend *bk, + GArray *updates, const gchar *src, + rspamd_fuzzy_update_cb cb, void *ud, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + struct rspamd_fuzzy_redis_session *session; + struct upstream *up; + struct upstream_list *ups; + rspamd_inet_addr_t *addr; + guint i; + GString *key; + struct fuzzy_peer_cmd *io_cmd; + struct rspamd_fuzzy_cmd *cmd = NULL; + guint nargs, cur_shift; + + g_assert(backend != NULL); + + ups = rspamd_redis_get_servers(backend, "write_servers"); + if (!ups) { + if (cb) { + cb(FALSE, 0, 0, 0, 0, ud); + } + + return; + } + + session = g_malloc0(sizeof(*session)); + session->backend = backend; + 
REF_RETAIN(session->backend); + + /* + * For each normal hash addition we do 5 redis commands: + * HSET <key> F <flag> **OR** HSETNX <key> F <flag> when flag is weak + * HSETNX <key> C <time> + * HINCRBY <key> V <weight> + * EXPIRE <key> <expire> + * INCR <prefix>_count + * + * Where <key> is <prefix> || <digest> + * + * For each command with shingles we additionally emit RSPAMD_SHINGLE_SIZE commands: + * SETEX <prefix>_<number>_<value> <expire> <digest> + * + * For each delete command we emit: + * DEL <key> + * DECR <prefix>_count + * + * For each delete command with shingles we also emit RSPAMD_SHINGLE_SIZE commands: + * DEL <prefix>_<number>_<value> + * + * For each refresh command we emit: + * EXPIRE <key> <expire> + * and, with shingles, RSPAMD_SHINGLE_SIZE more commands: + * EXPIRE <prefix>_<number>_<value> <expire> + */ +/* Fixed part of the transaction: MULTI (1 arg), INCR <prefix><src> (2 args), EXEC (1 arg) */ + nargs = 4; + + for (i = 0; i < updates->len; i++) { + io_cmd = &g_array_index(updates, struct fuzzy_peer_cmd, i); + + if (io_cmd->is_shingle) { + cmd = &io_cmd->cmd.shingle.basic; + } + else { + cmd = &io_cmd->cmd.normal; + } + + if (cmd->cmd == FUZZY_WRITE) { +/* HSET 4, HSETNX 4, HINCRBY 4, EXPIRE 3, INCR 2 */ nargs += 17; + session->nadded++; + + if (io_cmd->is_shingle) { +/* SETEX takes 4 args per shingle */ nargs += RSPAMD_SHINGLE_SIZE * 4; + } + } + else if (cmd->cmd == FUZZY_DEL) { +/* DEL 2, DECR 2 */ nargs += 4; + session->ndeleted++; + + if (io_cmd->is_shingle) { +/* DEL takes 2 args per shingle */ nargs += RSPAMD_SHINGLE_SIZE * 2; + } + } + else if (cmd->cmd == FUZZY_REFRESH) { +/* EXPIRE 3 */ nargs += 3; + session->nextended++; + + if (io_cmd->is_shingle) { +/* EXPIRE takes 3 args per shingle */ nargs += RSPAMD_SHINGLE_SIZE * 3; + } + } + else { + session->nignored++; + } + } + + /* Now we need to create a new request */ + session->callback.cb_update = cb; + session->cbdata = ud; + session->command = RSPAMD_FUZZY_REDIS_COMMAND_UPDATES; +/* cmd points at the last command of the batch (NULL when the batch is empty) */ session->cmd = cmd; + session->prob = 1.0f; + session->event_loop = rspamd_fuzzy_backend_event_base(bk); + + /* Allocate zeroed argument vectors sized for the whole transaction */ + session->nargs = nargs; + session->argv = g_malloc0(sizeof(gchar *) * session->nargs); + session->argv_lens = g_malloc0(sizeof(gsize) * session->nargs); + + up = rspamd_upstream_get(ups, + RSPAMD_UPSTREAM_MASTER_SLAVE, + NULL, + 0); + + session->up = rspamd_upstream_ref(up); + addr = rspamd_upstream_addr_next(up); + g_assert(addr != NULL); + session->ctx 
= rspamd_redis_pool_connect(backend->pool, + backend->dbname, + backend->username, backend->password, + rspamd_inet_address_to_string(addr), + rspamd_inet_address_get_port(addr)); + + if (session->ctx == NULL) { + rspamd_upstream_fail(up, TRUE, strerror(errno)); + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + if (cb) { + cb(FALSE, 0, 0, 0, 0, ud); + } + } + else { + /* Start with MULTI command */ + session->argv[0] = g_strdup("MULTI"); + session->argv_lens[0] = 5; + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 1, + (const gchar **) session->argv, + session->argv_lens) != REDIS_OK) { + + if (cb) { + cb(FALSE, 0, 0, 0, 0, ud); + } + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + return; + } + + /* Now split the rest of commands in packs and emit them command by command */ + cur_shift = 1; + + for (i = 0; i < updates->len; i++) { + io_cmd = &g_array_index(updates, struct fuzzy_peer_cmd, i); + + if (!rspamd_fuzzy_update_append_command(bk, session, io_cmd, + &cur_shift)) { + if (cb) { + cb(FALSE, 0, 0, 0, 0, ud); + } + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + return; + } + } + + /* Now INCR command for the source */ + key = g_string_new(backend->redis_object); + g_string_append(key, src); + session->argv[cur_shift] = g_strdup("INCR"); + session->argv_lens[cur_shift++] = 4; + session->argv[cur_shift] = key->str; + session->argv_lens[cur_shift++] = key->len; + g_string_free(key, FALSE); + + if (redisAsyncCommandArgv(session->ctx, NULL, NULL, + 2, + (const gchar **) &session->argv[cur_shift - 2], + &session->argv_lens[cur_shift - 2]) != REDIS_OK) { + + if (cb) { + cb(FALSE, 0, 0, 0, 0, ud); + } + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + return; + } + + /* Finally we call EXEC with a specific callback */ + session->argv[cur_shift] = g_strdup("EXEC"); + session->argv_lens[cur_shift] = 4; + + if (redisAsyncCommandArgv(session->ctx, + rspamd_fuzzy_redis_update_callback, session, + 1, + (const gchar **) &session->argv[cur_shift], + 
&session->argv_lens[cur_shift]) != REDIS_OK) { + + if (cb) { + cb(FALSE, 0, 0, 0, 0, ud); + } + rspamd_fuzzy_redis_session_dtor(session, TRUE); + + return; + } + else { + /* Add timeout */ + session->timeout.data = session; + ev_now_update_if_cheap((struct ev_loop *) session->event_loop); + ev_timer_init(&session->timeout, + rspamd_fuzzy_redis_timeout, + session->backend->timeout, 0.0); + ev_timer_start(session->event_loop, &session->timeout); + } + } +} +/* + * Drop the caller's reference to the redis backend state. + * + * When other references are still held (refcount > 1) the backend is first + * flagged as terminated; see the note inside the function for the reason. + * + * bk: generic backend handle, not used here + * subr_ud: struct rspamd_fuzzy_backend_redis, must not be NULL (asserted) + */ +void rspamd_fuzzy_backend_close_redis(struct rspamd_fuzzy_backend *bk, + void *subr_ud) +{ + struct rspamd_fuzzy_backend_redis *backend = subr_ud; + + g_assert(backend != NULL); + + /* + * XXX: we leak the lua registry element here to avoid crashing + * due to the chicken-and-egg problem between lua state termination and + * redis pool termination. + * Here, we assume that the redis pool is destroyed AFTER lua_state, + * so all pending connections will release their references, but due to + * the `terminated` hack they will not try to access Lua stuff. + * This is enabled only if we have connections pending (i.e. refcount > 1) + */ + if (backend->ref.refcount > 1) { + backend->terminated = true; + } + REF_RELEASE(backend); +} diff --git a/src/libserver/fuzzy_backend/fuzzy_backend_redis.h b/src/libserver/fuzzy_backend/fuzzy_backend_redis.h new file mode 100644 index 0000000..3cfa162 --- /dev/null +++ b/src/libserver/fuzzy_backend/fuzzy_backend_redis.h @@ -0,0 +1,67 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_FUZZY_BACKEND_REDIS_H_ +#define SRC_LIBSERVER_FUZZY_BACKEND_REDIS_H_ + +#include "config.h" +#include "fuzzy_backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Subroutines for fuzzy_backend: redis implementation. + * + * In every routine below `bk` is the generic backend handle and `subr_ud` + * is the redis backend state (struct rspamd_fuzzy_backend_redis). + * + * rspamd_fuzzy_backend_init_redis: presumably builds that state from the + * configuration object and returns it for later use as `subr_ud`; errors + * are reported through `err` - TODO confirm against the implementation. + */ +void *rspamd_fuzzy_backend_init_redis(struct rspamd_fuzzy_backend *bk, + const ucl_object_t *obj, + struct rspamd_config *cfg, + GError **err); +/* + * Asynchronously look up cmd->digest on one of the read servers. + * cb (if set) receives a zeroed reply when the lookup cannot be started. + */ +void rspamd_fuzzy_backend_check_redis(struct rspamd_fuzzy_backend *bk, + const struct rspamd_fuzzy_cmd *cmd, + rspamd_fuzzy_check_cb cb, void *ud, + void *subr_ud); +/* + * Write a batch of updates (GArray of struct fuzzy_peer_cmd) to one of the + * write servers as a single MULTI/EXEC transaction and increment the key + * made of the redis prefix and src. + * cb (if set) is called as cb(success, nadded, ndeleted, nextended, nignored, ud). + */ +void rspamd_fuzzy_backend_update_redis(struct rspamd_fuzzy_backend *bk, + GArray *updates, const gchar *src, + rspamd_fuzzy_update_cb cb, void *ud, + void *subr_ud); +/* + * Asynchronously GET the <prefix>_count key from one of the read servers. + * cb (if set) receives the parsed number, or 0 on failure. + */ +void rspamd_fuzzy_backend_count_redis(struct rspamd_fuzzy_backend *bk, + rspamd_fuzzy_count_cb cb, void *ud, + void *subr_ud); +/* + * Asynchronously GET the key made of the redis prefix and src from one of + * the read servers. cb (if set) receives the parsed number, or 0 on failure. + */ +void rspamd_fuzzy_backend_version_redis(struct rspamd_fuzzy_backend *bk, + const gchar *src, + rspamd_fuzzy_version_cb cb, void *ud, + void *subr_ud); +/* Return the identifier stored in the redis backend state. */ +const gchar *rspamd_fuzzy_backend_id_redis(struct rspamd_fuzzy_backend *bk, + void *subr_ud); +/* Expire subroutine; the redis implementation currently does no work. */ +void rspamd_fuzzy_backend_expire_redis(struct rspamd_fuzzy_backend *bk, + void *subr_ud); +/* + * Drop the caller's reference to the redis backend state, flagging it as + * terminated first when other references are still held. + */ +void rspamd_fuzzy_backend_close_redis(struct rspamd_fuzzy_backend *bk, + void *subr_ud); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_FUZZY_BACKEND_REDIS_H_ */ diff --git a/src/libserver/fuzzy_backend/fuzzy_backend_sqlite.c b/src/libserver/fuzzy_backend/fuzzy_backend_sqlite.c new file mode 100644 index 0000000..9ec448e --- /dev/null +++ b/src/libserver/fuzzy_backend/fuzzy_backend_sqlite.c @@ -0,0 +1,1029 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "fuzzy_backend.h" +#include "fuzzy_backend_sqlite.h" +#include "unix-std.h" + +#include <sqlite3.h> +#include "libutil/sqlite_utils.h" + +struct rspamd_fuzzy_backend_sqlite { + sqlite3 *db; + char *path; + gchar id[MEMPOOL_UID_LEN]; + gsize count; + gsize expired; + rspamd_mempool_t *pool; +}; + +static const gdouble sql_sleep_time = 0.1; +static const guint max_retries = 10; + +#define msg_err_fuzzy_backend(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + backend->pool->tag.tagname, backend->pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_warn_fuzzy_backend(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + backend->pool->tag.tagname, backend->pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_info_fuzzy_backend(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + backend->pool->tag.tagname, backend->pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_fuzzy_backend(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_fuzzy_sqlite_log_id, backend->pool->tag.tagname, backend->pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(fuzzy_sqlite) + +static const char *create_tables_sql = + "BEGIN;" + "CREATE TABLE IF NOT EXISTS digests(" + " id INTEGER PRIMARY KEY," + " flag INTEGER NOT NULL," + " digest TEXT NOT NULL," + " value INTEGER," + " time INTEGER);" + "CREATE TABLE IF NOT EXISTS shingles(" + " value INTEGER NOT NULL," + " number INTEGER NOT NULL," + " digest_id INTEGER REFERENCES digests(id) ON DELETE CASCADE " + " ON UPDATE CASCADE);" + "CREATE TABLE IF NOT EXISTS sources(" + " name TEXT UNIQUE," + " version INTEGER," + " last INTEGER);" + "CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);" + "CREATE INDEX IF NOT EXISTS t ON digests(time);" + "CREATE INDEX IF NOT EXISTS dgst_id ON shingles(digest_id);" + "CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);" + "COMMIT;"; +#if 0 +static const char *create_index_sql = + "BEGIN;" + "CREATE UNIQUE INDEX IF NOT EXISTS d ON digests(digest);" + "CREATE INDEX IF NOT EXISTS t ON digests(time);" + "CREATE INDEX IF NOT EXISTS dgst_id ON shingles(digest_id);" + "CREATE UNIQUE INDEX IF NOT EXISTS s ON shingles(value, number);" + "COMMIT;"; +#endif +enum rspamd_fuzzy_statement_idx { + RSPAMD_FUZZY_BACKEND_TRANSACTION_START = 0, + RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT, + RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK, + RSPAMD_FUZZY_BACKEND_INSERT, + RSPAMD_FUZZY_BACKEND_UPDATE, + RSPAMD_FUZZY_BACKEND_UPDATE_FLAG, + RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE, + RSPAMD_FUZZY_BACKEND_CHECK, + RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE, + RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID, + RSPAMD_FUZZY_BACKEND_DELETE, + RSPAMD_FUZZY_BACKEND_COUNT, + RSPAMD_FUZZY_BACKEND_EXPIRE, + RSPAMD_FUZZY_BACKEND_VACUUM, + RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED, + RSPAMD_FUZZY_BACKEND_ADD_SOURCE, + RSPAMD_FUZZY_BACKEND_VERSION, + RSPAMD_FUZZY_BACKEND_SET_VERSION, + RSPAMD_FUZZY_BACKEND_MAX +}; +static 
struct rspamd_fuzzy_stmts { + enum rspamd_fuzzy_statement_idx idx; + const gchar *sql; + const gchar *args; + sqlite3_stmt *stmt; + gint result; +} prepared_stmts[RSPAMD_FUZZY_BACKEND_MAX] = + { + {.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_START, + .sql = "BEGIN TRANSACTION;", + .args = "", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT, + .sql = "COMMIT;", + .args = "", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK, + .sql = "ROLLBACK;", + .args = "", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_INSERT, + .sql = "INSERT INTO digests(flag, digest, value, time) VALUES" + "(?1, ?2, ?3, strftime('%s','now'));", + .args = "SDI", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_UPDATE, + .sql = "UPDATE digests SET value = value + ?1, time = strftime('%s','now') WHERE " + "digest==?2;", + .args = "ID", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_UPDATE_FLAG, + .sql = "UPDATE digests SET value = ?1, flag = ?2, time = strftime('%s','now') WHERE " + "digest==?3;", + .args = "IID", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE, + .sql = "INSERT OR REPLACE INTO shingles(value, number, digest_id) " + "VALUES (?1, ?2, ?3);", + .args = "III", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_CHECK, + .sql = "SELECT value, time, flag FROM digests WHERE digest==?1;", + .args = "D", + .stmt = NULL, + .result = SQLITE_ROW}, + {.idx = RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE, + .sql = "SELECT digest_id FROM shingles WHERE value=?1 AND number=?2", + .args = "IS", + .stmt = NULL, + .result = SQLITE_ROW}, + {.idx = RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID, + .sql = "SELECT digest, value, time, flag FROM digests WHERE id=?1", + .args = "I", + .stmt = NULL, + .result = SQLITE_ROW}, + {.idx = RSPAMD_FUZZY_BACKEND_DELETE, + .sql = "DELETE FROM digests WHERE 
digest==?1;", + .args = "D", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_COUNT, + .sql = "SELECT COUNT(*) FROM digests;", + .args = "", + .stmt = NULL, + .result = SQLITE_ROW}, + {.idx = RSPAMD_FUZZY_BACKEND_EXPIRE, + .sql = "DELETE FROM digests WHERE id IN (SELECT id FROM digests WHERE time < ?1 LIMIT ?2);", + .args = "II", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_VACUUM, + .sql = "VACUUM;", + .args = "", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED, + .sql = "DELETE FROM shingles WHERE value=?1 AND number=?2;", + .args = "II", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_ADD_SOURCE, + .sql = "INSERT OR IGNORE INTO sources(name, version, last) VALUES (?1, ?2, ?3);", + .args = "TII", + .stmt = NULL, + .result = SQLITE_DONE}, + {.idx = RSPAMD_FUZZY_BACKEND_VERSION, + .sql = "SELECT version FROM sources WHERE name=?1;", + .args = "T", + .stmt = NULL, + .result = SQLITE_ROW}, + {.idx = RSPAMD_FUZZY_BACKEND_SET_VERSION, + .sql = "INSERT OR REPLACE INTO sources (name, version, last) VALUES (?3, ?1, ?2);", + .args = "IIT", + .stmt = NULL, + .result = SQLITE_DONE}, +}; + +static GQuark +rspamd_fuzzy_backend_sqlite_quark(void) +{ + return g_quark_from_static_string("fuzzy-backend-sqlite"); +} + +static gboolean +rspamd_fuzzy_backend_sqlite_prepare_stmts(struct rspamd_fuzzy_backend_sqlite *bk, GError **err) +{ + int i; + + for (i = 0; i < RSPAMD_FUZZY_BACKEND_MAX; i++) { + if (prepared_stmts[i].stmt != NULL) { + /* Skip already prepared statements */ + continue; + } + if (sqlite3_prepare_v2(bk->db, prepared_stmts[i].sql, -1, + &prepared_stmts[i].stmt, NULL) != SQLITE_OK) { + g_set_error(err, rspamd_fuzzy_backend_sqlite_quark(), + -1, "Cannot initialize prepared sql `%s`: %s", + prepared_stmts[i].sql, sqlite3_errmsg(bk->db)); + + return FALSE; + } + } + + return TRUE; +} + +static int +rspamd_fuzzy_backend_sqlite_cleanup_stmt(struct 
rspamd_fuzzy_backend_sqlite *backend, + int idx) +{ + sqlite3_stmt *stmt; + + if (idx < 0 || idx >= RSPAMD_FUZZY_BACKEND_MAX) { + + return -1; + } + + msg_debug_fuzzy_backend("resetting `%s`", prepared_stmts[idx].sql); + stmt = prepared_stmts[idx].stmt; + sqlite3_clear_bindings(stmt); + sqlite3_reset(stmt); + + return SQLITE_OK; +} + +static int +rspamd_fuzzy_backend_sqlite_run_stmt(struct rspamd_fuzzy_backend_sqlite *backend, + gboolean auto_cleanup, + int idx, ...) +{ + int retcode; + va_list ap; + sqlite3_stmt *stmt; + int i; + const char *argtypes; + guint retries = 0; + struct timespec ts; + + if (idx < 0 || idx >= RSPAMD_FUZZY_BACKEND_MAX) { + + return -1; + } + + stmt = prepared_stmts[idx].stmt; + g_assert((int) prepared_stmts[idx].idx == idx); + + if (stmt == NULL) { + if ((retcode = sqlite3_prepare_v2(backend->db, prepared_stmts[idx].sql, -1, + &prepared_stmts[idx].stmt, NULL)) != SQLITE_OK) { + msg_err_fuzzy_backend("Cannot initialize prepared sql `%s`: %s", + prepared_stmts[idx].sql, sqlite3_errmsg(backend->db)); + + return retcode; + } + stmt = prepared_stmts[idx].stmt; + } + + msg_debug_fuzzy_backend("executing `%s` %s auto cleanup", + prepared_stmts[idx].sql, auto_cleanup ? 
"with" : "without"); + argtypes = prepared_stmts[idx].args; + sqlite3_clear_bindings(stmt); + sqlite3_reset(stmt); + va_start(ap, idx); + + for (i = 0; argtypes[i] != '\0'; i++) { + switch (argtypes[i]) { + case 'T': + sqlite3_bind_text(stmt, i + 1, va_arg(ap, const char *), -1, + SQLITE_STATIC); + break; + case 'I': + sqlite3_bind_int64(stmt, i + 1, va_arg(ap, gint64)); + break; + case 'S': + sqlite3_bind_int(stmt, i + 1, va_arg(ap, gint)); + break; + case 'D': + /* Special case for digests variable */ + sqlite3_bind_text(stmt, i + 1, va_arg(ap, const char *), 64, + SQLITE_STATIC); + break; + } + } + + va_end(ap); + +retry: + retcode = sqlite3_step(stmt); + + if (retcode == prepared_stmts[idx].result) { + retcode = SQLITE_OK; + } + else { + if ((retcode == SQLITE_BUSY || + retcode == SQLITE_LOCKED) && + retries++ < max_retries) { + double_to_ts(sql_sleep_time, &ts); + nanosleep(&ts, NULL); + goto retry; + } + + msg_debug_fuzzy_backend("failed to execute query %s: %d, %s", prepared_stmts[idx].sql, + retcode, sqlite3_errmsg(backend->db)); + } + + if (auto_cleanup) { + sqlite3_clear_bindings(stmt); + sqlite3_reset(stmt); + } + + return retcode; +} + +static void +rspamd_fuzzy_backend_sqlite_close_stmts(struct rspamd_fuzzy_backend_sqlite *bk) +{ + int i; + + for (i = 0; i < RSPAMD_FUZZY_BACKEND_MAX; i++) { + if (prepared_stmts[i].stmt != NULL) { + sqlite3_finalize(prepared_stmts[i].stmt); + prepared_stmts[i].stmt = NULL; + } + } + + return; +} + +static gboolean +rspamd_fuzzy_backend_sqlite_run_sql(const gchar *sql, struct rspamd_fuzzy_backend_sqlite *bk, + GError **err) +{ + guint retries = 0; + struct timespec ts; + gint ret; + + do { + ret = sqlite3_exec(bk->db, sql, NULL, NULL, NULL); + double_to_ts(sql_sleep_time, &ts); + } while (ret == SQLITE_BUSY && retries++ < max_retries && + nanosleep(&ts, NULL) == 0); + + if (ret != SQLITE_OK) { + g_set_error(err, rspamd_fuzzy_backend_sqlite_quark(), + -1, "Cannot execute raw sql `%s`: %s", + sql, sqlite3_errmsg(bk->db)); 
+ return FALSE; + } + + return TRUE; +} + +static struct rspamd_fuzzy_backend_sqlite * +rspamd_fuzzy_backend_sqlite_open_db(const gchar *path, GError **err) +{ + struct rspamd_fuzzy_backend_sqlite *bk; + rspamd_cryptobox_hash_state_t st; + guchar hash_out[rspamd_cryptobox_HASHBYTES]; + + g_assert(path != NULL); + + bk = g_malloc0(sizeof(*bk)); + bk->path = g_strdup(path); + bk->expired = 0; + bk->pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "fuzzy_backend", 0); + bk->db = rspamd_sqlite3_open_or_create(bk->pool, bk->path, + create_tables_sql, 1, err); + + if (bk->db == NULL) { + rspamd_fuzzy_backend_sqlite_close(bk); + + return NULL; + } + + if (!rspamd_fuzzy_backend_sqlite_prepare_stmts(bk, err)) { + rspamd_fuzzy_backend_sqlite_close(bk); + + return NULL; + } + + /* Set id for the backend */ + rspamd_cryptobox_hash_init(&st, NULL, 0); + rspamd_cryptobox_hash_update(&st, path, strlen(path)); + rspamd_cryptobox_hash_final(&st, hash_out); + rspamd_snprintf(bk->id, sizeof(bk->id), "%xs", hash_out); + memcpy(bk->pool->tag.uid, bk->id, sizeof(bk->pool->tag.uid)); + + return bk; +} + +struct rspamd_fuzzy_backend_sqlite * +rspamd_fuzzy_backend_sqlite_open(const gchar *path, + gboolean vacuum, + GError **err) +{ + struct rspamd_fuzzy_backend_sqlite *backend; + + if (path == NULL) { + g_set_error(err, rspamd_fuzzy_backend_sqlite_quark(), + ENOENT, "Path has not been specified"); + return NULL; + } + + /* Open database */ + if ((backend = rspamd_fuzzy_backend_sqlite_open_db(path, err)) == NULL) { + return NULL; + } + + if (rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, RSPAMD_FUZZY_BACKEND_COUNT) == SQLITE_OK) { + backend->count = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_COUNT].stmt, 0); + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_COUNT); + + return backend; +} + +static gint +rspamd_fuzzy_backend_sqlite_int64_cmp(const void *a, const void *b) +{ + gint64 ia = *(gint64 *) a, ib = *(gint64 *) b; + + 
return (ia > ib) - (ia < ib); /* three-way compare: a gint64 difference does not fit into the gint result */ +} + +struct rspamd_fuzzy_reply +rspamd_fuzzy_backend_sqlite_check(struct rspamd_fuzzy_backend_sqlite *backend, + const struct rspamd_fuzzy_cmd *cmd, gint64 expire) +{ + struct rspamd_fuzzy_reply rep; + const struct rspamd_fuzzy_shingle_cmd *shcmd; + int rc; + gint64 timestamp; + gint64 shingle_values[RSPAMD_SHINGLE_SIZE], i, sel_id, cur_id, + cur_cnt, max_cnt; + + memset(&rep, 0, sizeof(rep)); + memcpy(rep.digest, cmd->digest, sizeof(rep.digest)); + + if (backend == NULL) { + return rep; + } + + /* Try direct match first of all */ + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_START); + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_CHECK, + cmd->digest); + + if (rc == SQLITE_OK) { + timestamp = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 1); + if (time(NULL) - timestamp > expire) { + /* Expire element */ + msg_debug_fuzzy_backend("requested hash has been expired"); + } + else { + rep.v1.value = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 0); + rep.v1.prob = 1.0; + rep.v1.flag = sqlite3_column_int( + prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, 2); + } + } + else if (cmd->shingles_count > 0) { + /* Fuzzy match */ + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK); + shcmd = (const struct rspamd_fuzzy_shingle_cmd *) cmd; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE, + shcmd->sgl.hashes[i], (gint) i); /* 'S' is fetched with va_arg(ap, gint), while i is gint64 */ + if (rc == SQLITE_OK) { + shingle_values[i] = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE].stmt, + 0); + } + else { + shingle_values[i] = -1; + } + msg_debug_fuzzy_backend("looking for shingle %L -> %L: %d", i, + shcmd->sgl.hashes[i], rc); + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, + RSPAMD_FUZZY_BACKEND_CHECK_SHINGLE); + + qsort(shingle_values, 
RSPAMD_SHINGLE_SIZE, sizeof(gint64), + rspamd_fuzzy_backend_sqlite_int64_cmp); + sel_id = -1; + cur_id = -1; + cur_cnt = 0; + max_cnt = 0; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + if (shingle_values[i] == -1) { + continue; + } + + /* We have some value here, so we need to check it */ + if (shingle_values[i] == cur_id) { + cur_cnt++; + } + else { /* the previous run of equal ids has ended: account it before switching */ + if (cur_cnt >= max_cnt) { + max_cnt = cur_cnt; + sel_id = cur_id; + } + cur_id = shingle_values[i]; + cur_cnt = 0; + } + } + if (cur_cnt > max_cnt) { /* the last run is never closed inside the loop */ + max_cnt = cur_cnt; + sel_id = cur_id; + } + + if (sel_id != -1) { + /* We have some id selected here */ + rep.v1.prob = (float) max_cnt / (float) RSPAMD_SHINGLE_SIZE; + + if (rep.v1.prob > 0.5) { + msg_debug_fuzzy_backend( + "found fuzzy hash with probability %.2f", + rep.v1.prob); + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID, sel_id); + if (rc == SQLITE_OK) { + timestamp = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt, + 2); + if (time(NULL) - timestamp > expire) { + /* Expire element */ + msg_debug_fuzzy_backend( + "requested hash has been expired"); + rep.v1.prob = 0.0; + } + else { + rep.ts = timestamp; + memcpy(rep.digest, sqlite3_column_blob(prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt, 0), sizeof(rep.digest)); + rep.v1.value = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt, + 1); + rep.v1.flag = sqlite3_column_int( + prepared_stmts[RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID].stmt, + 3); + } + } + } + else { + /* Otherwise we assume that as error */ + rep.v1.value = 0; + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, + RSPAMD_FUZZY_BACKEND_GET_DIGEST_BY_ID); + } + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK); + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT); + + return rep; +} + +gboolean +rspamd_fuzzy_backend_sqlite_prepare_update(struct 
rspamd_fuzzy_backend_sqlite *backend, + const gchar *source) +{ + gint rc; + + if (backend == NULL) { + return FALSE; + } + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_START); + + if (rc != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot start transaction for updates: %s", + sqlite3_errmsg(backend->db)); + return FALSE; + } + + return TRUE; +} + +gboolean +rspamd_fuzzy_backend_sqlite_add(struct rspamd_fuzzy_backend_sqlite *backend, + const struct rspamd_fuzzy_cmd *cmd) +{ + int rc, i; + gint64 id, flag; + const struct rspamd_fuzzy_shingle_cmd *shcmd; + + if (backend == NULL) { + return FALSE; + } + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_CHECK, + cmd->digest); + + if (rc == SQLITE_OK) { + /* Check flag */ + flag = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_CHECK].stmt, + 2); + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK); + + if (flag == cmd->flag) { + /* We need to increase weight */ + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_UPDATE, + (gint64) cmd->value, + cmd->digest); + if (rc != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot update hash to %d -> " + "%*xs: %s", + (gint) cmd->flag, + (gint) sizeof(cmd->digest), cmd->digest, + sqlite3_errmsg(backend->db)); + } + } + else { + /* We need to relearn actually */ + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_UPDATE_FLAG, + (gint64) cmd->value, + (gint64) cmd->flag, + cmd->digest); + + if (rc != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot update hash to %d -> " + "%*xs: %s", + (gint) cmd->flag, + (gint) sizeof(cmd->digest), cmd->digest, + sqlite3_errmsg(backend->db)); + } + } + } + else { + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK); + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_INSERT, + (gint) cmd->flag, + cmd->digest, + (gint64) 
cmd->value); + + if (rc == SQLITE_OK) { + if (cmd->shingles_count > 0) { + id = sqlite3_last_insert_rowid(backend->db); + shcmd = (const struct rspamd_fuzzy_shingle_cmd *) cmd; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_INSERT_SHINGLE, + shcmd->sgl.hashes[i], (gint64) i, id); + msg_debug_fuzzy_backend("add shingle %d -> %L: %L", + i, + shcmd->sgl.hashes[i], + id); + + if (rc != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot add shingle %d -> " + "%L: %L: %s", + i, + shcmd->sgl.hashes[i], + id, sqlite3_errmsg(backend->db)); + } + } + } + } + else { + msg_warn_fuzzy_backend("cannot add hash to %d -> " + "%*xs: %s", + (gint) cmd->flag, + (gint) sizeof(cmd->digest), cmd->digest, + sqlite3_errmsg(backend->db)); + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, + RSPAMD_FUZZY_BACKEND_INSERT); + } + + return (rc == SQLITE_OK); +} + +gboolean +rspamd_fuzzy_backend_sqlite_finish_update(struct rspamd_fuzzy_backend_sqlite *backend, + const gchar *source, gboolean version_bump) +{ + gint rc = SQLITE_OK, wal_frames, wal_checkpointed, ver; + + /* Get and update version */ + if (version_bump) { + ver = rspamd_fuzzy_backend_sqlite_version(backend, source); + ++ver; + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_SET_VERSION, + (gint64) ver, (gint64) time(NULL), source); + } + + if (rc == SQLITE_OK) { + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT); + + if (rc != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot commit updates: %s", + sqlite3_errmsg(backend->db)); + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK); + return FALSE; + } + else { + if (!rspamd_sqlite3_sync(backend->db, &wal_frames, &wal_checkpointed)) { + msg_warn_fuzzy_backend("cannot commit checkpoint: %s", + sqlite3_errmsg(backend->db)); + } + else if (wal_checkpointed > 0) { + 
msg_info_fuzzy_backend("total number of frames in the wal file: " + "%d, checkpointed: %d", + wal_frames, wal_checkpointed); + } + } + } + else { + msg_warn_fuzzy_backend("cannot update version for %s: %s", source, + sqlite3_errmsg(backend->db)); + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK); + return FALSE; + } + + return TRUE; +} + +gboolean +rspamd_fuzzy_backend_sqlite_del(struct rspamd_fuzzy_backend_sqlite *backend, + const struct rspamd_fuzzy_cmd *cmd) +{ + int rc = -1; + + if (backend == NULL) { + return FALSE; + } + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_CHECK, + cmd->digest); + + if (rc == SQLITE_OK) { + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK); + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_DELETE, + cmd->digest); + if (rc != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot update hash to %d -> " + "%*xs: %s", + (gint) cmd->flag, + (gint) sizeof(cmd->digest), cmd->digest, + sqlite3_errmsg(backend->db)); + } + } + else { + /* Hash is missing */ + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_CHECK); + } + + return (rc == SQLITE_OK); +} + +gboolean +rspamd_fuzzy_backend_sqlite_sync(struct rspamd_fuzzy_backend_sqlite *backend, + gint64 expire, + gboolean clean_orphaned) +{ + struct orphaned_shingle_elt { + gint64 value; + gint64 number; + }; + + /* Do not do more than 5k ops per step */ + const guint64 max_changes = 5000; + gboolean ret = FALSE; + gint64 expire_lim, expired; + gint rc, i, orphaned_cnt = 0; + GError *err = NULL; + static const gchar orphaned_shingles[] = "SELECT shingles.value,shingles.number " + "FROM shingles " + "LEFT JOIN digests ON " + "shingles.digest_id=digests.id WHERE " + "digests.id IS NULL;"; + sqlite3_stmt *stmt; + GArray *orphaned; + struct orphaned_shingle_elt orphaned_elt, *pelt; + + + if (backend == NULL) { + return FALSE; + } + + /* 
Perform expire */ + if (expire > 0) { + expire_lim = time(NULL) - expire; + + if (expire_lim > 0) { + ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_START); + + if (ret == SQLITE_OK) { + + rc = rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_EXPIRE, expire_lim, max_changes); + + if (rc == SQLITE_OK) { + expired = sqlite3_changes(backend->db); + + if (expired > 0) { + backend->expired += expired; + msg_info_fuzzy_backend("expired %L hashes", expired); + } + } + else { + msg_warn_fuzzy_backend( + "cannot execute expired statement: %s", + sqlite3_errmsg(backend->db)); + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, + RSPAMD_FUZZY_BACKEND_EXPIRE); + + ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT); + + if (ret != SQLITE_OK) { + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK); + } + } + if (ret != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot expire db: %s", + sqlite3_errmsg(backend->db)); + } + } + } + + /* Cleanup database */ + if (clean_orphaned) { + ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_START); + + if (ret == SQLITE_OK) { + if ((rc = sqlite3_prepare_v2(backend->db, + orphaned_shingles, + -1, + &stmt, + NULL)) != SQLITE_OK) { + msg_warn_fuzzy_backend("cannot cleanup shingles: %s", + sqlite3_errmsg(backend->db)); + } + else { + orphaned = g_array_new(FALSE, + FALSE, + sizeof(struct orphaned_shingle_elt)); + + while (sqlite3_step(stmt) == SQLITE_ROW) { + orphaned_elt.value = sqlite3_column_int64(stmt, 0); + orphaned_elt.number = sqlite3_column_int64(stmt, 1); + g_array_append_val(orphaned, orphaned_elt); + + if (orphaned->len > max_changes) { + break; + } + } + + sqlite3_finalize(stmt); + orphaned_cnt = orphaned->len; + + if (orphaned_cnt > 0) { + msg_info_fuzzy_backend( + "going to delete %ud orphaned shingles", + orphaned_cnt); + 
/* Need to delete orphaned elements */ + for (i = 0; i < (gint) orphaned_cnt; i++) { + pelt = &g_array_index(orphaned, + struct orphaned_shingle_elt, + i); + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_DELETE_ORPHANED, + pelt->value, pelt->number); + } + } + + + g_array_free(orphaned, TRUE); + } + + ret = rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_COMMIT); + + if (ret == SQLITE_OK) { + msg_info_fuzzy_backend( + "deleted %ud orphaned shingles", + orphaned_cnt); + } + else { + msg_warn_fuzzy_backend( + "cannot synchronize fuzzy backend: %e", + err); + rspamd_fuzzy_backend_sqlite_run_stmt(backend, TRUE, + RSPAMD_FUZZY_BACKEND_TRANSACTION_ROLLBACK); + } + } + } + + return (ret == SQLITE_OK); /* ret holds an SQLite result code (SQLITE_OK is 0), convert it to a real gboolean */ +} + + +void rspamd_fuzzy_backend_sqlite_close(struct rspamd_fuzzy_backend_sqlite *backend) +{ + if (backend != NULL) { + if (backend->db != NULL) { + rspamd_fuzzy_backend_sqlite_close_stmts(backend); + sqlite3_close(backend->db); + } + + if (backend->path != NULL) { + g_free(backend->path); + } + + if (backend->pool) { + rspamd_mempool_delete(backend->pool); + } + + g_free(backend); + } +} + + +gsize rspamd_fuzzy_backend_sqlite_count(struct rspamd_fuzzy_backend_sqlite *backend) +{ + if (backend) { + if (rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_COUNT) == SQLITE_OK) { + backend->count = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_COUNT].stmt, 0); + } + + rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_COUNT); + + return backend->count; + } + + return 0; +} + +gint rspamd_fuzzy_backend_sqlite_version(struct rspamd_fuzzy_backend_sqlite *backend, + const gchar *source) +{ + gint ret = 0; + + if (backend) { + if (rspamd_fuzzy_backend_sqlite_run_stmt(backend, FALSE, + RSPAMD_FUZZY_BACKEND_VERSION, source) == SQLITE_OK) { + ret = sqlite3_column_int64( + prepared_stmts[RSPAMD_FUZZY_BACKEND_VERSION].stmt, 0); + } + + 
rspamd_fuzzy_backend_sqlite_cleanup_stmt(backend, RSPAMD_FUZZY_BACKEND_VERSION); + } + + return ret; +} + +gsize rspamd_fuzzy_backend_sqlite_expired(struct rspamd_fuzzy_backend_sqlite *backend) +{ + return backend != NULL ? backend->expired : 0; +} + +const gchar * +rspamd_fuzzy_sqlite_backend_id(struct rspamd_fuzzy_backend_sqlite *backend) +{ + return backend != NULL ? backend->id : 0; +} diff --git a/src/libserver/fuzzy_backend/fuzzy_backend_sqlite.h b/src/libserver/fuzzy_backend/fuzzy_backend_sqlite.h new file mode 100644 index 0000000..766f7c9 --- /dev/null +++ b/src/libserver/fuzzy_backend/fuzzy_backend_sqlite.h @@ -0,0 +1,107 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef FUZZY_BACKEND_H_ +#define FUZZY_BACKEND_H_ + +#include "config.h" +#include "fuzzy_wire.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_fuzzy_backend_sqlite; + +/** + * Open fuzzy backend + * @param path file to open (legacy file will be converted automatically) + * @param err error pointer + * @return backend structure or NULL + */ +struct rspamd_fuzzy_backend_sqlite *rspamd_fuzzy_backend_sqlite_open(const gchar *path, + gboolean vacuum, + GError **err); + +/** + * Check specified fuzzy in the backend + * @param backend + * @param cmd + * @return reply with probability and weight + */ +struct rspamd_fuzzy_reply rspamd_fuzzy_backend_sqlite_check( + struct rspamd_fuzzy_backend_sqlite *backend, + const struct rspamd_fuzzy_cmd *cmd, + gint64 expire); + +/** + * Prepare storage for updates (by starting transaction) + */ +gboolean rspamd_fuzzy_backend_sqlite_prepare_update(struct rspamd_fuzzy_backend_sqlite *backend, + const gchar *source); + +/** + * Add digest to the database + * @param backend + * @param cmd + * @return + */ +gboolean rspamd_fuzzy_backend_sqlite_add(struct rspamd_fuzzy_backend_sqlite *backend, + const struct rspamd_fuzzy_cmd *cmd); + +/** + * Delete digest from the database + * @param backend + * @param cmd + * @return + */ +gboolean rspamd_fuzzy_backend_sqlite_del( + struct rspamd_fuzzy_backend_sqlite *backend, + const struct rspamd_fuzzy_cmd *cmd); + +/** + * Commit updates to storage + */ +gboolean rspamd_fuzzy_backend_sqlite_finish_update(struct rspamd_fuzzy_backend_sqlite *backend, + const gchar *source, gboolean version_bump); + +/** + * Sync storage + * @param backend + * @return + */ +gboolean rspamd_fuzzy_backend_sqlite_sync(struct rspamd_fuzzy_backend_sqlite *backend, + gint64 expire, + gboolean clean_orphaned); + +/** + * Close storage + * @param backend + */ +void rspamd_fuzzy_backend_sqlite_close(struct rspamd_fuzzy_backend_sqlite *backend); + +gsize rspamd_fuzzy_backend_sqlite_count(struct 
rspamd_fuzzy_backend_sqlite *backend); + +gint rspamd_fuzzy_backend_sqlite_version(struct rspamd_fuzzy_backend_sqlite *backend, const gchar *source); + +gsize rspamd_fuzzy_backend_sqlite_expired(struct rspamd_fuzzy_backend_sqlite *backend); + +const gchar *rspamd_fuzzy_sqlite_backend_id(struct rspamd_fuzzy_backend_sqlite *backend); + +#ifdef __cplusplus +} +#endif + +#endif /* FUZZY_BACKEND_H_ */ diff --git a/src/libserver/fuzzy_wire.h b/src/libserver/fuzzy_wire.h new file mode 100644 index 0000000..c2f93b8 --- /dev/null +++ b/src/libserver/fuzzy_wire.h @@ -0,0 +1,154 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_FUZZY_STORAGE_H +#define RSPAMD_FUZZY_STORAGE_H + +#include "config.h" +#include "rspamd.h" +#include "shingles.h" +#include "cryptobox.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RSPAMD_FUZZY_VERSION 4 +#define RSPAMD_FUZZY_KEYLEN 8 + +#define RSPAMD_FUZZY_FLAG_WEAK (1u << 7u) +/* Use lower 4 bits for the version */ +#define RSPAMD_FUZZY_VERSION_MASK 0x0fu +/* Commands for fuzzy storage */ +#define FUZZY_CHECK 0 +#define FUZZY_WRITE 1 +#define FUZZY_DEL 2 +#define FUZZY_STAT 3 +#define FUZZY_PING 4 +#define FUZZY_CLIENT_MAX 4 +/* Internal commands */ +#define FUZZY_REFRESH 100 /* Update expire */ +#define FUZZY_DUP 101 /* Skip duplicate in update queue */ + +/** + * The epoch of the fuzzy client + */ +enum rspamd_fuzzy_epoch { + RSPAMD_FUZZY_EPOCH10, /**< 1.0+ encryption */ + RSPAMD_FUZZY_EPOCH11, /**< 1.7+ extended reply */ + RSPAMD_FUZZY_EPOCH_MAX +}; + +RSPAMD_PACKED(rspamd_fuzzy_cmd) +{ + guint8 version; + guint8 cmd; + guint8 shingles_count; + guint8 flag; + gint32 value; + guint32 tag; + gchar digest[rspamd_cryptobox_HASHBYTES]; +}; + +RSPAMD_PACKED(rspamd_fuzzy_shingle_cmd) +{ + struct rspamd_fuzzy_cmd basic; + struct rspamd_shingle sgl; +}; + +RSPAMD_PACKED(rspamd_fuzzy_reply_v1) +{ + gint32 value; + guint32 flag; + guint32 tag; + float prob; +}; + +RSPAMD_PACKED(rspamd_fuzzy_reply) +{ + struct rspamd_fuzzy_reply_v1 v1; + gchar digest[rspamd_cryptobox_HASHBYTES]; + guint32 ts; + guchar reserved[12]; +}; + +RSPAMD_PACKED(rspamd_fuzzy_encrypted_req_hdr) +{ + guchar magic[4]; + guchar key_id[RSPAMD_FUZZY_KEYLEN]; + guchar pubkey[32]; + guchar nonce[rspamd_cryptobox_MAX_NONCEBYTES]; + guchar mac[rspamd_cryptobox_MAX_MACBYTES]; +}; + +RSPAMD_PACKED(rspamd_fuzzy_encrypted_cmd) +{ + struct rspamd_fuzzy_encrypted_req_hdr hdr; + struct rspamd_fuzzy_cmd cmd; +}; + +RSPAMD_PACKED(rspamd_fuzzy_encrypted_shingle_cmd) +{ + struct rspamd_fuzzy_encrypted_req_hdr hdr; + struct rspamd_fuzzy_shingle_cmd cmd; +}; + 
+RSPAMD_PACKED(rspamd_fuzzy_encrypted_rep_hdr) +{ + guchar nonce[rspamd_cryptobox_MAX_NONCEBYTES]; + guchar mac[rspamd_cryptobox_MAX_MACBYTES]; +}; + +RSPAMD_PACKED(rspamd_fuzzy_encrypted_reply) +{ + struct rspamd_fuzzy_encrypted_rep_hdr hdr; + struct rspamd_fuzzy_reply rep; +}; + +static const guchar fuzzy_encrypted_magic[4] = {'r', 's', 'f', 'e'}; + +enum rspamd_fuzzy_extension_type { + RSPAMD_FUZZY_EXT_SOURCE_DOMAIN = 'd', + RSPAMD_FUZZY_EXT_SOURCE_IP4 = '4', + RSPAMD_FUZZY_EXT_SOURCE_IP6 = '6', +}; + +struct rspamd_fuzzy_cmd_extension { + enum rspamd_fuzzy_extension_type ext; + guint length; + struct rspamd_fuzzy_cmd_extension *next; + guchar *payload; +}; + +struct rspamd_fuzzy_stat_entry { + const gchar *name; + guint64 fuzzy_cnt; +}; + +RSPAMD_PACKED(fuzzy_peer_cmd) +{ + gint32 is_shingle; + union { + struct rspamd_fuzzy_cmd normal; + struct rspamd_fuzzy_shingle_cmd shingle; + } cmd; +}; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx new file mode 100644 index 0000000..5861d45 --- /dev/null +++ b/src/libserver/html/html.cxx @@ -0,0 +1,2393 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "config.h" +#include "util.h" +#include "message.h" +#include "html.h" +#include "html_tags.h" +#include "html_block.hxx" +#include "html.hxx" +#include "libserver/css/css_value.hxx" +#include "libserver/css/css.hxx" +#include "libserver/task.h" +#include "libserver/cfg_file.h" + +#include "url.h" +#include "contrib/libucl/khash.h" +#include "libmime/images.h" +#include "libutil/cxx/utf8_util.h" + +#include "html_tag_defs.hxx" +#include "html_entities.hxx" +#include "html_tag.hxx" +#include "html_url.hxx" + +#include <frozen/unordered_map.h> +#include <frozen/string.h> +#include <fmt/core.h> + +#include <unicode/uversion.h> + +namespace rspamd::html { + +static const guint max_tags = 8192; /* Ignore tags if this maximum is reached */ + +static const html_tags_storage html_tags_defs; + +auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>( + { + {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME}, + {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, + {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, + {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, + {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR}, + {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, + {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE}, + {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS}, + {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH}, + {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT}, + {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE}, + {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL}, + {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT}, + {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID}, + {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + }); + +#define msg_debug_html(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_html_log_id, "html", pool->tag.uid, \ + __FUNCTION__, \ + __VA_ARGS__) + +INIT_LOG_MODULE(html) + +/* + * This function is expected to be called on a closing tag to fill up all tags + * and return the current parent (meaning unclosed) tag + */ +static auto +html_check_balance(struct html_content *hc, + struct html_tag *tag, + goffset tag_start_offset, + goffset tag_end_offset) -> html_tag * +{ + /* As agreed, the closing tag has the last opening at the parent ptr */ + auto *opening_tag = tag->parent; + + auto calculate_content_length = [tag_start_offset, tag_end_offset](html_tag *t) { + auto opening_content_offset = t->content_offset; + + if (t->flags & (CM_EMPTY)) { + /* Attach closing tag just at the opening tag */ + t->closing.start = t->tag_start; + t->closing.end = t->content_offset; + } + else { + + if (opening_content_offset <= tag_start_offset) { + t->closing.start = tag_start_offset; + t->closing.end = tag_end_offset; + } + else { + + t->closing.start = t->content_offset; + t->closing.end = tag_end_offset; + } + } + }; + + auto balance_tag = [&]() -> html_tag * { + auto it = tag->parent; + auto found_pair = false; + + for (; it != nullptr; it = it->parent) { + if (it->id == tag->id && !(it->flags & FL_CLOSED)) { + found_pair = true; + break; + } + } + + /* + * If we have found a closing pair, then we need to close all tags and + * return the top-most tag + */ + if (found_pair) { + for (it = tag->parent; it != nullptr; it = it->parent) { + it->flags |= FL_CLOSED; + /* Insert a virtual closing tag for all tags that are not closed */ + calculate_content_length(it); + if (it->id == tag->id && !(it->flags & FL_CLOSED)) { + break; + } + } + + return it; + } + else { + /* + * We have not found a pair, so this closing tag is bogus and should + * be ignored completely. + * Unfortunately, it also means that we need to insert another tag, + * as the current closing tag is unusable for that purposes. 
+ * + * We assume that callee will recognise that and reconstruct the + * tag at the tag_end_closing state, so we return nullptr... + */ + } + + /* Tag must be ignored and reconstructed */ + return nullptr; + }; + + if (opening_tag) { + + if (opening_tag->id == tag->id) { + opening_tag->flags |= FL_CLOSED; + + calculate_content_length(opening_tag); + /* All good */ + return opening_tag->parent; + } + else { + return balance_tag(); + } + } + else { + /* + * We have no opening tag + * There are two possibilities: + * + * 1) We have some block tag in hc->all_tags; + * 2) We have no tags + */ + + if (hc->all_tags.empty()) { + hc->all_tags.push_back(std::make_unique<html_tag>()); + auto *vtag = hc->all_tags.back().get(); + vtag->id = Tag_HTML; + vtag->flags = FL_VIRTUAL; + vtag->tag_start = 0; + vtag->content_offset = 0; + calculate_content_length(vtag); + + if (!hc->root_tag) { + hc->root_tag = vtag; + } + else { + vtag->parent = hc->root_tag; + } + + tag->parent = vtag; + + /* Recursively call with a virtual <html> tag inserted */ + return html_check_balance(hc, tag, tag_start_offset, tag_end_offset); + } + } + + return nullptr; +} + +auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type> +{ + auto known_component_it = html_components_map.find(st); + + if (known_component_it != html_components_map.end()) { + return known_component_it->second; + } + else { + return std::nullopt; + } +} + +enum tag_parser_state { + parse_start = 0, + parse_name, + parse_attr_name, + parse_equal, + parse_start_dquote, + parse_dqvalue, + parse_end_dquote, + parse_start_squote, + parse_sqvalue, + parse_end_squote, + parse_value, + spaces_before_eq, + spaces_after_eq, + spaces_after_param, + ignore_bad_tag, + tag_end, + slash_after_value, + slash_in_unquoted_value, +}; +struct tag_content_parser_state { + tag_parser_state cur_state = parse_start; + std::string buf; + std::optional<html_component_type> cur_component; + + void reset() + { + cur_state 
= parse_start; + buf.clear(); + cur_component = std::nullopt; + } +}; + +static inline void +html_parse_tag_content(rspamd_mempool_t *pool, + struct html_content *hc, + struct html_tag *tag, + const char *in, + struct tag_content_parser_state &parser_env) +{ + auto state = parser_env.cur_state; + + /* + * Stores tag component if it doesn't exist, performing copy of the + * value + decoding of the entities + * Parser env is set to clear the current html attribute fields (saved_p and + * cur_component) + */ + auto store_component_value = [&]() -> void { + if (parser_env.cur_component) { + + if (parser_env.buf.empty()) { + tag->components.emplace_back(parser_env.cur_component.value(), + std::string_view{}); + } + else { + /* We need to copy buf to a persistent storage */ + auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); + + if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID || + parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { + /* Lowercase */ + rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size()); + } + else { + memcpy(s, parser_env.buf.data(), parser_env.buf.size()); + } + + auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size()); + tag->components.emplace_back(parser_env.cur_component.value(), + std::string_view{s, sz}); + } + } + + parser_env.buf.clear(); + parser_env.cur_component = std::nullopt; + }; + + auto store_component_name = [&]() -> bool { + decode_html_entitles_inplace(parser_env.buf); + auto known_component_it = html_components_map.find(std::string_view{parser_env.buf}); + parser_env.buf.clear(); + + if (known_component_it != html_components_map.end()) { + parser_env.cur_component = known_component_it->second; + + return true; + } + else { + parser_env.cur_component = std::nullopt; + } + + return false; + }; + + auto store_value_character = [&](bool lc) -> void { + auto c = lc ? 
g_ascii_tolower(*in) : *in; + + if (c == '\0') { + /* Replace with u0FFD */ + parser_env.buf.append((const char *) u8"\uFFFD"); + } + else { + parser_env.buf.push_back(c); + } + }; + + switch (state) { + case parse_start: + if (!g_ascii_isalpha(*in) && !g_ascii_isspace(*in)) { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = ignore_bad_tag; + tag->id = N_TAGS; + tag->flags |= FL_BROKEN; + } + else if (g_ascii_isalpha(*in)) { + state = parse_name; + store_value_character(true); + } + break; + + case parse_name: + if ((g_ascii_isspace(*in) || *in == '>' || *in == '/')) { + if (*in == '/') { + tag->flags |= FL_CLOSED; + } + + if (parser_env.buf.empty()) { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + tag->id = N_TAGS; + tag->flags |= FL_BROKEN; + state = ignore_bad_tag; + } + else { + decode_html_entitles_inplace(parser_env.buf); + const auto *tag_def = rspamd::html::html_tags_defs.by_name(parser_env.buf); + + if (tag_def == nullptr) { + hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS; + /* Assign -hash to match closing tag if needed */ + auto nhash = static_cast<std::int32_t>(std::hash<std::string>{}(parser_env.buf)); + /* Always negative */ + tag->id = static_cast<tag_id_t>(nhash | G_MININT32); + } + else { + tag->id = tag_def->id; + tag->flags = tag_def->flags; + } + + parser_env.buf.clear(); + + state = spaces_after_param; + } + } + else { + store_value_character(true); + } + break; + + case parse_attr_name: + if (*in == '=') { + if (!parser_env.buf.empty()) { + store_component_name(); + } + state = parse_equal; + } + else if (g_ascii_isspace(*in)) { + store_component_name(); + state = spaces_before_eq; + } + else if (*in == '/') { + store_component_name(); + store_component_value(); + state = slash_after_value; + } + else if (*in == '>') { + store_component_name(); + store_component_value(); + state = tag_end; + } + else { + if (*in == '"' || *in == '\'' || *in == '<') { + /* Should never be in attribute names but ignored */ + tag->flags |= FL_BROKEN; + } 
+ + store_value_character(true); + } + + break; + + case spaces_before_eq: + if (*in == '=') { + state = parse_equal; + } + else if (!g_ascii_isspace(*in)) { + /* + * HTML defines that crap could still be restored and + * calculated somehow... So we have to follow this stupid behaviour + */ + /* + * TODO: estimate what insane things do email clients in each case + */ + if (*in == '>') { + /* + * Attribute name followed by end of tag + * Should be okay (empty attribute). The rest is handled outside + * this automata. + */ + store_component_value(); + state = tag_end; + } + else if (*in == '"' || *in == '\'' || *in == '<') { + /* Attribute followed by quote... Missing '=' ? Dunno, need to test */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + tag->flags |= FL_BROKEN; + store_component_value(); + store_value_character(true); + state = spaces_after_param; + } + else { + /* Empty attribute */ + store_component_value(); + store_value_character(true); + state = spaces_after_param; + } + } + break; + + case spaces_after_eq: + if (*in == '"') { + state = parse_start_dquote; + } + else if (*in == '\'') { + state = parse_start_squote; + } + else if (!g_ascii_isspace(*in)) { + store_value_character(true); + state = parse_value; + } + break; + + case parse_equal: + if (g_ascii_isspace(*in)) { + state = spaces_after_eq; + } + else if (*in == '"') { + state = parse_start_dquote; + } + else if (*in == '\'') { + state = parse_start_squote; + } + else { + store_value_character(true); + state = parse_value; + } + break; + + case parse_start_dquote: + if (*in == '"') { + state = spaces_after_param; + } + else { + store_value_character(false); + state = parse_dqvalue; + } + break; + + case parse_start_squote: + if (*in == '\'') { + state = spaces_after_param; + } + else { + store_value_character(false); + state = parse_sqvalue; + } + break; + + case parse_dqvalue: + if (*in == '"') { + store_component_value(); + state = parse_end_dquote; + } + else { + store_value_character(false); + 
} + break; + + case parse_sqvalue: + if (*in == '\'') { + store_component_value(); + state = parse_end_squote; + } + else { + store_value_character(false); + } + + break; + + case parse_value: + if (*in == '/') { + state = slash_in_unquoted_value; + } + else if (g_ascii_isspace(*in) || *in == '>' || *in == '"') { + store_component_value(); + state = spaces_after_param; + } + else { + store_value_character(false); + } + break; + + case parse_end_dquote: + case parse_end_squote: + if (g_ascii_isspace(*in)) { + state = spaces_after_param; + } + else if (*in == '/') { + store_component_value(); + store_value_character(true); + state = slash_after_value; + } + else { + /* No space, proceed immediately to the attribute name */ + state = parse_attr_name; + store_component_value(); + store_value_character(true); + } + break; + + case spaces_after_param: + if (!g_ascii_isspace(*in)) { + if (*in == '/') { + state = slash_after_value; + } + else if (*in == '=') { + /* Attributes cannot start with '=' */ + tag->flags |= FL_BROKEN; + store_value_character(true); + state = parse_attr_name; + } + else { + store_value_character(true); + state = parse_attr_name; + } + } + break; + case slash_after_value: + if (*in == '>') { + tag->flags |= FL_CLOSED; + state = tag_end; + } + else if (!g_ascii_isspace(*in)) { + tag->flags |= FL_BROKEN; + state = parse_attr_name; + } + break; + case slash_in_unquoted_value: + if (*in == '>') { + /* That slash was in fact closing tag slash, woohoo */ + tag->flags |= FL_CLOSED; + state = tag_end; + store_component_value(); + } + else { + /* Welcome to the world of html, revert state and save missing / */ + parser_env.buf.push_back('/'); + store_value_character(false); + state = parse_value; + } + break; + case ignore_bad_tag: + case tag_end: + break; + } + + parser_env.cur_state = state; +} + +static inline auto +html_is_absolute_url(std::string_view st) -> bool +{ + auto alnum_pos = std::find_if(std::begin(st), std::end(st), + [](auto c) { return 
!g_ascii_isalnum(c); }); + + if (alnum_pos != std::end(st) && alnum_pos != std::begin(st)) { + if (*alnum_pos == ':') { + if (st.substr(0, std::distance(std::begin(st), alnum_pos)) == "mailto") { + return true; + } + + std::advance(alnum_pos, 1); + if (alnum_pos != std::end(st)) { + /* Include even malformed urls */ + if (*alnum_pos == '/' || *alnum_pos == '\\') { + return true; + } + } + } + } + + return false; +} + +static auto +html_process_url_tag(rspamd_mempool_t *pool, + struct html_tag *tag, + struct html_content *hc) -> std::optional<struct rspamd_url *> +{ + auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + + if (found_href_maybe) { + /* Check base url */ + auto &href_value = found_href_maybe.value(); + + if (hc && hc->base_url) { + /* + * Relative url cannot start from the following: + * schema:// + * data: + * slash + */ + + if (!html_is_absolute_url(href_value)) { + + if (href_value.size() >= sizeof("data:") && + g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) { + /* Image data url, never insert as url */ + return std::nullopt; + } + + /* Assume relative url */ + auto need_slash = false; + + auto orig_len = href_value.size(); + auto len = orig_len + hc->base_url->urllen; + + if (hc->base_url->datalen == 0) { + need_slash = true; + len++; + } + + auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1); + auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, + "%*s%s%*s", + (int) hc->base_url->urllen, hc->base_url->string, + need_slash ? 
"/" : "", + (gint) orig_len, href_value.data()); + href_value = {buf, nlen}; + } + else if (href_value.size() > 2 && href_value[0] == '/' && href_value[1] != '/') { + /* Relative to the hostname */ + auto orig_len = href_value.size(); + auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen + + 3 /* for :// */; + auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1); + auto nlen = (std::size_t) rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s", + (int) hc->base_url->protocollen, hc->base_url->string, + (int) hc->base_url->hostlen, rspamd_url_host_unsafe(hc->base_url), + (gint) orig_len, href_value.data()); + href_value = {buf, nlen}; + } + } + + auto url = html_process_url(pool, href_value).value_or(nullptr); + + if (url) { + if (tag->id != Tag_A) { + /* Mark special tags special */ + url->flags |= RSPAMD_URL_FLAG_SPECIAL; + } + + if (std::holds_alternative<std::monostate>(tag->extra)) { + tag->extra = url; + } + + return url; + } + + return std::nullopt; + } + + return std::nullopt; +} + +struct rspamd_html_url_query_cbd { + rspamd_mempool_t *pool; + khash_t(rspamd_url_hash) * url_set; + struct rspamd_url *url; + GPtrArray *part_urls; +}; + +static gboolean +html_url_query_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_html_url_query_cbd *cbd = + (struct rspamd_html_url_query_cbd *) ud; + rspamd_mempool_t *pool; + + pool = cbd->pool; + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + + msg_debug_html("found url %s in query of url" + " %*s", + url->string, + cbd->url->querylen, rspamd_url_query_unsafe(cbd->url)); + + url->flags |= RSPAMD_URL_FLAG_QUERY; + + if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) { + g_ptr_array_add(cbd->part_urls, url); + } + + return TRUE; +} + +static void +html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls) +{ 
+ if (url->querylen > 0) { + struct rspamd_html_url_query_cbd qcbd; + + qcbd.pool = pool; + qcbd.url_set = url_set; + qcbd.url = url; + qcbd.part_urls = part_urls; + + rspamd_url_find_multiple(pool, + rspamd_url_query_unsafe(url), url->querylen, + RSPAMD_URL_FIND_ALL, NULL, + html_url_query_callback, &qcbd); + } + + if (part_urls) { + g_ptr_array_add(part_urls, url); + } +} + +static auto +html_process_data_image(rspamd_mempool_t *pool, + struct html_image *img, + std::string_view input) -> void +{ + /* + * Here, we do very basic processing of the data: + * detect if we have something like: `` + * We only parse base64 encoded data. + * We ignore content type so far + */ + struct rspamd_image *parsed_image; + const gchar *semicolon_pos = input.data(), + *end = input.data() + input.size(); + + if ((semicolon_pos = (const gchar *) memchr(semicolon_pos, ';', end - semicolon_pos)) != NULL) { + if (end - semicolon_pos > sizeof("base64,")) { + if (memcmp(semicolon_pos + 1, "base64,", sizeof("base64,") - 1) == 0) { + const gchar *data_pos = semicolon_pos + sizeof("base64,"); + gchar *decoded; + gsize encoded_len = end - data_pos, decoded_len; + rspamd_ftok_t inp; + + decoded_len = (encoded_len / 4 * 3) + 12; + decoded = rspamd_mempool_alloc_buffer(pool, decoded_len); + rspamd_cryptobox_base64_decode(data_pos, encoded_len, + reinterpret_cast<guchar *>(decoded), &decoded_len); + inp.begin = decoded; + inp.len = decoded_len; + + parsed_image = rspamd_maybe_process_image(pool, &inp); + + if (parsed_image) { + msg_debug_html("detected %s image of size %ud x %ud in data url", + rspamd_image_type_str(parsed_image->type), + parsed_image->width, parsed_image->height); + img->embedded_image = parsed_image; + } + } + } + else { + /* Nothing useful */ + return; + } + } +} + +static void +html_process_img_tag(rspamd_mempool_t *pool, + struct html_tag *tag, + struct html_content *hc, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls) +{ + struct html_image *img; + + img = 
rspamd_mempool_alloc0_type(pool, struct html_image); + img->tag = tag; + + for (const auto ¶m: tag->components) { + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { + /* Check base url */ + const auto &href_value = param.value; + + if (href_value.size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value.data(); + fstr.len = href_value.size(); + img->src = rspamd_mempool_ftokdup(pool, &fstr); + + if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->src += sizeof("cid:") - 1; + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; + } + else { + if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + } + else { + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { + + std::string_view cpy{href_value}; + auto maybe_url = html_process_url(pool, cpy); + + if (maybe_url) { + img->url = maybe_url.value(); + struct rspamd_url *existing; + + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, + img->url); + + if (existing && existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. 
However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); + } + } + } + } + } + } + } + + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { + unsigned long val; + + rspamd_strtoul(param.value.data(), param.value.size(), &val); + img->height = val; + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { + unsigned long val; + + rspamd_strtoul(param.value.data(), param.value.size(), &val); + img->width = val; + } + + /* TODO: rework to css at some time */ + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { + if (img->height == 0) { + auto style_st = param.value; + auto pos = rspamd_substring_search_caseless(style_st.data(), + style_st.size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_st.substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; + } + } + } + } + if (img->width == 0) { + auto style_st = param.value; + auto pos = rspamd_substring_search_caseless(style_st.data(), + style_st.size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_st.substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; + } + } + } + } + } + } + + if (img->embedded_image) { + if (img->height == 0) { + img->height = img->embedded_image->height; + } + if (img->width == 0) { + img->width = 
img->embedded_image->width; + } + } + + hc->images.push_back(img); + + if (std::holds_alternative<std::monostate>(tag->extra)) { + tag->extra = img; + } +} + +static auto +html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, + struct html_content *hc, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls) -> void +{ + auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); + + if (found_rel_maybe) { + if (found_rel_maybe.value() == "icon") { + html_process_img_tag(pool, tag, hc, url_set, part_urls); + } + } +} + +static auto +html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, + struct html_content *hc) -> void +{ + std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; + bool hidden = false; + + for (const auto ¶m: tag->components) { + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { + maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { + maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { + tag->block = rspamd::css::parse_css_declaration(pool, param.value); + } + + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) { + hidden = true; + } + } + + if (!tag->block) { + tag->block = html_block::undefined_html_block_pool(pool); + } + + if (hidden) { + tag->block->set_display(false); + } + + if (maybe_fgcolor) { + tag->block->set_fgcolor(maybe_fgcolor->to_color().value()); + } + + if (maybe_bgcolor) { + tag->block->set_bgcolor(maybe_bgcolor->to_color().value()); + } +} + +static inline auto +html_append_parsed(struct html_content *hc, + std::string_view data, + bool transparent, + std::size_t input_len, + std::string &dest) -> std::size_t +{ + auto cur_offset = dest.size(); + + if (dest.size() > input_len) { + /* Impossible case, refuse to append */ + return 0; 
+ } + + if (data.size() > 0) { + /* Handle multiple spaces at the begin */ + + if (cur_offset > 0) { + auto last = dest.back(); + if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) { + dest.append(" "); + data = {data.data() + 1, data.size() - 1}; + cur_offset++; + } + } + + if (data.find('\0') != std::string_view::npos) { + auto replace_zero_func = [](const auto &input, auto &output) { + const auto last = input.cend(); + for (auto it = input.cbegin(); it != last; ++it) { + if (*it == '\0') { + output.append((const char *) u8"\uFFFD"); + } + else { + output.push_back(*it); + } + } + }; + + dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD")); + replace_zero_func(data, dest); + hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS; + } + else { + dest.append(data); + } + } + + auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset, + dest.size() - cur_offset, true); + + dest.resize(nlen + cur_offset); + + if (transparent) { + /* Replace all visible characters with spaces */ + auto start = std::next(dest.begin(), cur_offset); + std::replace_if( + start, std::end(dest), [](const auto c) { + return !g_ascii_isspace(c); + }, + ' '); + } + + return nlen; +} + +static auto +html_process_displayed_href_tag(rspamd_mempool_t *pool, + struct html_content *hc, + std::string_view data, + const struct html_tag *cur_tag, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + goffset dest_offset) -> void +{ + + if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) { + auto *url = std::get<rspamd_url *>(cur_tag->extra); + + html_check_displayed_url(pool, + exceptions, url_set, + data, + dest_offset, + url); + } +} + +static auto +html_append_tag_content(rspamd_mempool_t *pool, + const gchar *start, gsize len, + struct html_content *hc, + html_tag *tag, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set) -> goffset +{ + auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false; + goffset next_tag_offset = tag->closing.end, 
+ initial_parsed_offset = hc->parsed.size(), + initial_invisible_offset = hc->invisible.size(); + + auto calculate_final_tag_offsets = [&]() -> void { + if (is_visible) { + tag->content_offset = initial_parsed_offset; + tag->closing.start = hc->parsed.size(); + } + else { + tag->content_offset = initial_invisible_offset; + tag->closing.start = hc->invisible.size(); + } + }; + + if (tag->closing.end == -1) { + if (tag->closing.start != -1) { + next_tag_offset = tag->closing.start; + tag->closing.end = tag->closing.start; + } + else { + next_tag_offset = tag->content_offset; + tag->closing.end = tag->content_offset; + } + } + if (tag->closing.start == -1) { + tag->closing.start = tag->closing.end; + } + + auto append_margin = [&](char c) -> void { + /* We do care about visible margins only */ + if (is_visible) { + if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') { + if (hc->parsed.back() == ' ') { + /* We also strip extra spaces at the end, but limiting the start */ + auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset); + auto first = std::find_if(hc->parsed.rbegin(), last, + [](auto ch) -> auto { + return ch != ' '; + }); + hc->parsed.erase(first.base(), hc->parsed.end()); + g_assert(hc->parsed.size() >= initial_parsed_offset); + } + hc->parsed.push_back(c); + } + } + }; + + if (tag->id == Tag_BR || tag->id == Tag_HR) { + + if (!(tag->flags & FL_IGNORE)) { + hc->parsed.append("\n"); + } + + auto ret = tag->content_offset; + calculate_final_tag_offsets(); + + return ret; + } + else if ((tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) || (tag->flags & CM_HEAD)) { + auto ret = tag->closing.end; + calculate_final_tag_offsets(); + + return ret; + } + + if ((tag->flags & (FL_COMMENT | FL_XML | FL_IGNORE | CM_HEAD))) { + is_visible = false; + } + else { + if (!tag->block) { + is_visible = true; + } + else if (!tag->block->is_visible()) { + if (!tag->block->is_transparent()) { + is_visible = false; + } + else { 
+ if (tag->block->has_display() && + tag->block->display == css::css_display_value::DISPLAY_HIDDEN) { + is_visible = false; + } + else { + is_transparent = true; + } + } + } + else { + if (tag->block->display == css::css_display_value::DISPLAY_BLOCK) { + is_block = true; + } + else if (tag->block->display == css::css_display_value::DISPLAY_TABLE_ROW) { + is_spaces = true; + } + } + } + + if (is_block) { + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); + } + + goffset cur_offset = tag->content_offset; + + for (auto *cld: tag->children) { + auto enclosed_start = cld->tag_start; + goffset initial_part_len = enclosed_start - cur_offset; + + if (initial_part_len > 0) { + if (is_visible) { + html_append_parsed(hc, + {start + cur_offset, std::size_t(initial_part_len)}, + is_transparent, len, hc->parsed); + } + else { + html_append_parsed(hc, + {start + cur_offset, std::size_t(initial_part_len)}, + is_transparent, len, hc->invisible); + } + } + + auto next_offset = html_append_tag_content(pool, start, len, + hc, cld, exceptions, url_set); + + /* Do not allow shifting back */ + if (next_offset > cur_offset) { + cur_offset = next_offset; + } + } + + if (cur_offset < tag->closing.start) { + goffset final_part_len = tag->closing.start - cur_offset; + + if (final_part_len > 0) { + if (is_visible) { + html_append_parsed(hc, + {start + cur_offset, std::size_t(final_part_len)}, + is_transparent, + len, + hc->parsed); + } + else { + html_append_parsed(hc, + {start + cur_offset, std::size_t(final_part_len)}, + is_transparent, + len, + hc->invisible); + } + } + } + if (is_block) { + append_margin('\n'); + } + else if (is_spaces) { + append_margin(' '); + } + + if (is_visible) { + if (tag->id == Tag_A) { + auto written_len = hc->parsed.size() - initial_parsed_offset; + html_process_displayed_href_tag(pool, hc, + {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)}, + tag, exceptions, + url_set, initial_parsed_offset); + } + else if (tag->id == 
Tag_IMG) { + /* Process ALT if presented */ + auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT); + + if (maybe_alt) { + if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { + /* Add a space */ + hc->parsed += ' '; + } + + hc->parsed.append(maybe_alt.value()); + + if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { + /* Add a space */ + hc->parsed += ' '; + } + } + } + } + else { + /* Invisible stuff */ + if (std::holds_alternative<rspamd_url *>(tag->extra)) { + auto *url_enclosed = std::get<rspamd_url *>(tag->extra); + + /* + * TODO: when hash is fixed to include flags we need to remove and add + * url to the hash set + */ + if (url_enclosed) { + url_enclosed->flags |= RSPAMD_URL_FLAG_INVISIBLE; + } + } + } + + calculate_final_tag_offsets(); + + return next_tag_offset; +} + +auto html_process_input(struct rspamd_task *task, + GByteArray *in, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + std::uint16_t *cur_url_order) -> html_content * +{ + const gchar *p, *c, *end, *start; + guchar t; + auto closing = false; + guint obrace = 0, ebrace = 0; + struct rspamd_url *url = nullptr; + gint href_offset = -1; + auto overflow_input = false; + struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag; + struct tag_content_parser_state content_parser_env; + auto process_size = in->len; + + + enum { + parse_start = 0, + content_before_start, + tag_begin, + sgml_tag, + xml_tag, + compound_tag, + comment_tag, + comment_content, + sgml_content, + tag_content, + tag_end_opening, + tag_end_closing, + html_text_content, + xml_tag_end, + tag_raw_text, + tag_raw_text_less_than, + tags_limit_overflow, + } state = parse_start; + + enum class html_document_state { + doctype, + head, + body + } html_document_state = html_document_state::doctype; + + g_assert(in != NULL); + g_assert(task != NULL); + + auto *pool = task->task_pool; + auto cur_url_part_order = 
0u; + + auto *hc = new html_content; + rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc); + + if (task->cfg && in->len > task->cfg->max_html_len) { + msg_notice_task("html input is too big: %z, limit is %z", + in->len, + task->cfg->max_html_len); + process_size = task->cfg->max_html_len; + overflow_input = true; + } + + auto new_tag = [&](int flags = 0) -> struct html_tag * + { + + if (hc->all_tags.size() > rspamd::html::max_tags) { + hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; + + return nullptr; + } + + hc->all_tags.emplace_back(std::make_unique<html_tag>()); + auto *ntag = hc->all_tags.back().get(); + ntag->tag_start = c - start; + ntag->flags = flags; + + if (cur_tag && !(cur_tag->flags & (CM_EMPTY | FL_CLOSED)) && cur_tag != &cur_closing_tag) { + parent_tag = cur_tag; + } + + if (flags & FL_XML) { + return ntag; + } + + return ntag; + }; + + auto process_opening_tag = [&]() { + if (cur_tag->id > Tag_UNKNOWN) { + if (cur_tag->flags & CM_UNIQUE) { + if (!hc->tags_seen[cur_tag->id]) { + /* Duplicate tag has been found */ + hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS; + } + } + hc->tags_seen[cur_tag->id] = true; + } + + /* Shift to the first unclosed tag */ + auto *pt = parent_tag; + while (pt && (pt->flags & FL_CLOSED)) { + pt = pt->parent; + } + + if (pt) { + g_assert(cur_tag != pt); + cur_tag->parent = pt; + g_assert(cur_tag->parent != &cur_closing_tag); + parent_tag = pt; + parent_tag->children.push_back(cur_tag); + } + else { + if (hc->root_tag) { + if (cur_tag != hc->root_tag) { + cur_tag->parent = hc->root_tag; + g_assert(cur_tag->parent != cur_tag); + hc->root_tag->children.push_back(cur_tag); + parent_tag = hc->root_tag; + } + } + else { + if (cur_tag->id == Tag_HTML) { + hc->root_tag = cur_tag; + } + else { + /* Insert a fake html tag */ + hc->all_tags.emplace_back(std::make_unique<html_tag>()); + auto *top_tag = hc->all_tags.back().get(); + top_tag->tag_start = 0; + top_tag->flags = FL_VIRTUAL; + top_tag->id = 
Tag_HTML; + top_tag->content_offset = 0; + top_tag->children.push_back(cur_tag); + cur_tag->parent = top_tag; + g_assert(cur_tag->parent != cur_tag); + hc->root_tag = top_tag; + parent_tag = top_tag; + } + } + } + + if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) { + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + + if (maybe_url.has_value()) { + url = maybe_url.value(); + + if (url_set != NULL) { + struct rspamd_url *maybe_existing = + rspamd_url_set_add_or_return(url_set, maybe_url.value()); + if (maybe_existing == maybe_url.value()) { + if (cur_url_order) { + url->order = (*cur_url_order)++; + } + url->part_order = cur_url_part_order++; + html_process_query_url(pool, url, url_set, + part_urls); + } + else { + url = maybe_existing; + /* Replace extra as well */ + cur_tag->extra = maybe_existing; + /* Increase count to avoid odd checks failure */ + url->count++; + } + } + if (part_urls) { + g_ptr_array_add(part_urls, url); + } + + href_offset = hc->parsed.size(); + } + } + else if (cur_tag->id == Tag_BASE) { + /* + * Base is allowed only within head tag but HTML is retarded + */ + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + + if (maybe_url) { + msg_debug_html("got valid base tag"); + cur_tag->extra = maybe_url.value(); + cur_tag->flags |= FL_HREF; + + if (hc->base_url == nullptr) { + hc->base_url = maybe_url.value(); + } + else { + msg_debug_html("ignore redundant base tag"); + } + } + else { + msg_debug_html("got invalid base tag!"); + } + } + + if (cur_tag->id == Tag_IMG) { + html_process_img_tag(pool, cur_tag, hc, url_set, + part_urls); + } + else if (cur_tag->id == Tag_LINK) { + html_process_link_tag(pool, cur_tag, hc, url_set, + part_urls); + } + + if (!(cur_tag->flags & CM_EMPTY)) { + html_process_block_tag(pool, cur_tag, hc); + } + else { + /* Implicitly close */ + cur_tag->flags |= FL_CLOSED; + } + + if (cur_tag->flags & FL_CLOSED) { + cur_tag->closing.end = cur_tag->content_offset; + 
cur_tag->closing.start = cur_tag->tag_start; + + cur_tag = parent_tag; + } + }; + + p = (const char *) in->data; + c = p; + end = p + process_size; + start = c; + + while (p < end) { + t = *p; + + switch (state) { + case parse_start: + if (t == '<') { + state = tag_begin; + } + else { + /* We have no starting tag, so assume that it's content */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_START; + cur_tag = new_tag(); + html_document_state = html_document_state::body; + + if (cur_tag) { + cur_tag->id = Tag_HTML; + hc->root_tag = cur_tag; + state = content_before_start; + } + else { + state = tags_limit_overflow; + } + } + break; + case content_before_start: + if (t == '<') { + state = tag_begin; + } + else { + p++; + } + break; + case tag_begin: + switch (t) { + case '<': + c = p; + p++; + closing = FALSE; + break; + case '!': + cur_tag = new_tag(FL_XML | FL_CLOSED); + if (cur_tag) { + state = sgml_tag; + } + else { + state = tags_limit_overflow; + } + p++; + break; + case '?': + cur_tag = new_tag(FL_XML | FL_CLOSED); + if (cur_tag) { + state = xml_tag; + } + else { + state = tags_limit_overflow; + } + hc->flags |= RSPAMD_HTML_FLAG_XML; + p++; + break; + case '/': + closing = TRUE; + /* We fill fake closing tag to fill it with the content parser */ + cur_closing_tag.clear(); + /* + * For closing tags, we need to find some corresponding opening tag. + * However, at this point we have not even parsed a name, so we + * can not assume anything about balancing, etc. + * + * So we need to ensure that: + * 1) We have some opening tag in the chain cur_tag->parent... + * 2) cur_tag is nullptr - okay, html is just brain damaged + * 3) cur_tag must NOT be equal to cur_closing tag. It means that + * we had some poor closing tag but we still need to find an opening + * tag... Somewhere... 
+ */ + + if (cur_tag == &cur_closing_tag) { + if (parent_tag != &cur_closing_tag) { + cur_closing_tag.parent = parent_tag; + } + else { + cur_closing_tag.parent = nullptr; + } + } + else if (cur_tag && cur_tag->flags & FL_CLOSED) { + /* Cur tag is already closed, we should find something else */ + auto *tmp = cur_tag; + while (tmp) { + tmp = tmp->parent; + + if (tmp == nullptr || !(tmp->flags & FL_CLOSED)) { + break; + } + } + + cur_closing_tag.parent = tmp; + } + else { + cur_closing_tag.parent = cur_tag; + } + + cur_tag = &cur_closing_tag; + p++; + break; + case '>': + /* Empty tag */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = html_text_content; + continue; + default: + if (g_ascii_isalpha(t)) { + state = tag_content; + content_parser_env.reset(); + + if (!closing) { + cur_tag = new_tag(); + } + + if (cur_tag) { + state = tag_content; + } + else { + state = tags_limit_overflow; + } + } + else { + /* Wrong bad tag */ + state = html_text_content; + } + break; + } + + break; + + case sgml_tag: + switch (t) { + case '[': + state = compound_tag; + obrace = 1; + ebrace = 0; + p++; + break; + case '-': + cur_tag->flags |= FL_COMMENT; + state = comment_tag; + p++; + break; + default: + state = sgml_content; + break; + } + + break; + + case xml_tag: + if (t == '?') { + state = xml_tag_end; + } + else if (t == '>') { + /* Misformed xml tag */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = tag_end_opening; + continue; + } + /* We efficiently ignore xml tags */ + p++; + break; + + case xml_tag_end: + if (t == '>') { + state = tag_end_opening; + cur_tag->content_offset = p - start + 1; + continue; + } + else { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + } + p++; + break; + + case compound_tag: + if (t == '[') { + obrace++; + } + else if (t == ']') { + ebrace++; + } + else if (t == '>' && obrace == ebrace) { + state = tag_end_opening; + cur_tag->content_offset = p - start + 1; + continue; + } + p++; + break; + + case comment_tag: + if (t != '-') { + 
hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = tag_end_opening; + } + else { + p++; + ebrace = 0; + /* + * https://www.w3.org/TR/2012/WD-html5-20120329/syntax.html#syntax-comments + * ... the text must not start with a single + * U+003E GREATER-THAN SIGN character (>), + * nor start with a "-" (U+002D) character followed by + * a U+003E GREATER-THAN SIGN (>) character, + * nor contain two consecutive U+002D HYPHEN-MINUS + * characters (--), nor end with a "-" (U+002D) character. + */ + /* p has just been advanced and may be equal to end, so check bounds before reading */ + if (p + 1 < end && p[0] == '-' && p[1] == '>') { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + p++; + state = tag_end_opening; + } + else if (p < end && *p == '>') { + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + state = tag_end_opening; + } + else { + state = comment_content; + } + } + break; + + case comment_content: + if (t == '-') { + ebrace++; + } + else if (t == '>' && ebrace >= 2) { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + continue; + } + else { + ebrace = 0; + } + + p++; + break; + + case html_text_content: + if (t != '<') { + p++; + } + else { + state = tag_begin; + } + break; + + case tag_raw_text: + if (t == '<') { + c = p; + state = tag_raw_text_less_than; + } + p++; + break; + case tag_raw_text_less_than: + if (t == '/') { + /* Here are special things: we look for obrace and then ensure + * that if there is any closing brace nearby + * (we look maximum at 30 characters). We also need to ensure + * that we have no special characters, such as punctuation marks and + * so on. + * Basically, we validate the input to be sane. + * Since closing tags must not have attributes, these assumptions + * seems to be reasonable enough for our toy parser. 
+ */ + gint cur_lookahead = 1; + gint max_lookahead = MIN(end - p, 30); + bool valid_closing_tag = true; + + if (p + 1 < end && !g_ascii_isalpha(p[1])) { + valid_closing_tag = false; + } + else { + while (cur_lookahead < max_lookahead) { + gchar tt = p[cur_lookahead]; + if (tt == '>') { + break; + } + else if (tt < '\n' || tt == ',') { + valid_closing_tag = false; + break; + } + cur_lookahead++; + } + + if (cur_lookahead == max_lookahead) { + valid_closing_tag = false; + } + } + + if (valid_closing_tag) { + /* Shift back */ + p = c; + state = tag_begin; + } + else { + p++; + state = tag_raw_text; + } + } + else { + p++; + state = tag_raw_text; + } + break; + case sgml_content: + /* TODO: parse DOCTYPE here */ + if (t == '>') { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + } + else { + p++; + } + break; + + case tag_content: + html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env); + + if (t == '>') { + if (content_parser_env.cur_state != parse_dqvalue && content_parser_env.cur_state != parse_sqvalue) { + /* We have a closing element */ + if (closing) { + cur_tag->closing.start = c - start; + cur_tag->closing.end = p - start + 1; + + closing = FALSE; + state = tag_end_closing; + } + else { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + } + } + else { + /* + * We are in the parse_quoted value state but got + * an unescaped `>` character. + * HTML is written for monkeys, so there are two possibilities: + * 1) We have missing ending quote + * 2) We have unescaped `>` character + * How to distinguish between those possibilities? + * Well, the idea is to do some lookahead and try to find a + * quote. If we can find a quote, we just pretend as we have + * not seen `>` character. Otherwise, we pretend that it is an + * unquoted stuff. This logic is quite fragile but I really + * don't know any better options... + */ + auto end_quote = content_parser_env.cur_state == parse_sqvalue ? 
'\'' : '"'; + if (memchr(p, end_quote, end - p) != nullptr) { + /* Unencoded `>` */ + p++; + continue; + } + else { + if (closing) { + cur_tag->closing.start = c - start; + cur_tag->closing.end = p - start + 1; + + closing = FALSE; + state = tag_end_closing; + } + else { + cur_tag->content_offset = p - start + 1; + state = tag_end_opening; + } + } + } + continue; + } + p++; + break; + + case tag_end_opening: + content_parser_env.reset(); + state = html_text_content; + + if (cur_tag) { + if (cur_tag->id == Tag_STYLE || cur_tag->id == Tag_NOSCRIPT || cur_tag->id == Tag_SCRIPT) { + state = tag_raw_text; + } + if (html_document_state == html_document_state::doctype) { + if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) { + html_document_state = html_document_state::head; + cur_tag->flags |= FL_IGNORE; + } + else if (cur_tag->id != Tag_HTML) { + html_document_state = html_document_state::body; + } + } + else if (html_document_state == html_document_state::head) { + if (!(cur_tag->flags & (CM_EMPTY | CM_HEAD))) { + if (parent_tag && (parent_tag->id == Tag_HEAD || !(parent_tag->flags & CM_HEAD))) { + /* + * As by standard, we have to close the HEAD tag + * and switch to the body state + */ + parent_tag->flags |= FL_CLOSED; + parent_tag->closing.start = cur_tag->tag_start; + parent_tag->closing.end = cur_tag->content_offset; + + html_document_state = html_document_state::body; + } + else if (cur_tag->id == Tag_BODY) { + html_document_state = html_document_state::body; + } + else { + /* + * For propagation in something like + * <title><p><a>ololo</a></p></title> - should be unprocessed + */ + cur_tag->flags |= CM_HEAD; + } + } + } + + process_opening_tag(); + } + + p++; + c = p; + break; + case tag_end_closing: { + if (cur_tag) { + + if (cur_tag->flags & CM_EMPTY) { + /* Ignore closing empty tags */ + cur_tag->flags |= FL_IGNORE; + } + if (html_document_state == html_document_state::doctype) { + } + else if (html_document_state == html_document_state::head) { + if 
(cur_tag->id == Tag_HEAD) { + html_document_state = html_document_state::body; + } + } + + /* cur_tag here is a closing tag */ + auto *next_cur_tag = html_check_balance(hc, cur_tag, + c - start, p - start + 1); + + if (cur_tag->id == Tag_STYLE && allow_css) { + auto *opening_tag = cur_tag->parent; + + if (opening_tag && opening_tag->id == Tag_STYLE && + (int) opening_tag->content_offset < opening_tag->closing.start) { + auto ret_maybe = rspamd::css::parse_css(pool, + {start + opening_tag->content_offset, + opening_tag->closing.start - opening_tag->content_offset}, + std::move(hc->css_style)); + + if (!ret_maybe.has_value()) { + if (ret_maybe.error().is_fatal()) { + auto err_str = fmt::format( + "cannot parse css (error code: {}): {}", + static_cast<int>(ret_maybe.error().type), + ret_maybe.error().description.value_or("unknown error")); + msg_info_pool("%*s", (int) err_str.size(), err_str.data()); + } + } + else { + hc->css_style = ret_maybe.value(); + } + } + } + + if (next_cur_tag != nullptr) { + cur_tag = next_cur_tag; + } + else { + /* + * Here, we handle cases like <p>lala</b>... 
+ * So the tag </b> is bogus and unpaired + * However, we need to exclude it from the output of <p> tag + * To do that, we create a fake opening tag and insert that to + * the current opening tag + */ + auto *cur_opening_tag = cur_tag->parent; + + while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) { + cur_opening_tag = cur_opening_tag->parent; + } + + if (!cur_opening_tag) { + cur_opening_tag = hc->root_tag; + } + + auto &&vtag = std::make_unique<html_tag>(); + vtag->id = cur_tag->id; + vtag->flags = FL_VIRTUAL | FL_CLOSED | cur_tag->flags; + vtag->tag_start = cur_tag->closing.start; + vtag->content_offset = p - start + 1; + vtag->closing = cur_tag->closing; + vtag->parent = cur_opening_tag; + g_assert(vtag->parent != &cur_closing_tag); + cur_opening_tag->children.push_back(vtag.get()); + hc->all_tags.emplace_back(std::move(vtag)); + cur_tag = cur_opening_tag; + parent_tag = cur_tag->parent; + g_assert(cur_tag->parent != &cur_closing_tag); + } + } /* if cur_tag != nullptr */ + state = html_text_content; + p++; + c = p; + break; + } + case tags_limit_overflow: + msg_warn_pool("tags limit of %d tags is reached at the position %d;" + " ignoring the rest of the HTML content", + (int) hc->all_tags.size(), (int) (p - start)); + c = p; + p = end; + break; + } + } + + if (cur_tag && !(cur_tag->flags & FL_CLOSED) && cur_tag != &cur_closing_tag) { + cur_closing_tag.parent = cur_tag; + cur_closing_tag.id = cur_tag->id; + cur_tag = &cur_closing_tag; + html_check_balance(hc, cur_tag, + end - start, end - start); + } + + /* Propagate styles */ + hc->traverse_block_tags([&hc, &pool](const html_tag *tag) -> bool { + if (hc->css_style && tag->id > Tag_UNKNOWN && tag->id < Tag_MAX) { + auto *css_block = hc->css_style->check_tag_block(tag); + + if (css_block) { + if (tag->block) { + tag->block->set_block(*css_block); + } + else { + tag->block = css_block; + } + } + } + if (tag->block) { + if (!tag->block->has_display()) { + /* If we have no display field, we can check it 
by tag */ + if (tag->flags & CM_HEAD) { + tag->block->set_display(css::css_display_value::DISPLAY_HIDDEN, + html_block::set); + } + else if (tag->flags & (CM_BLOCK | CM_TABLE)) { + tag->block->set_display(css::css_display_value::DISPLAY_BLOCK, + html_block::implicit); + } + else if (tag->flags & CM_ROW) { + tag->block->set_display(css::css_display_value::DISPLAY_TABLE_ROW, + html_block::implicit); + } + else { + tag->block->set_display(css::css_display_value::DISPLAY_INLINE, + html_block::implicit); + } + } + + tag->block->compute_visibility(); + + for (const auto *cld_tag: tag->children) { + + if (cld_tag->block) { + cld_tag->block->propagate_block(*tag->block); + } + else { + cld_tag->block = rspamd_mempool_alloc0_type(pool, html_block); + *cld_tag->block = *tag->block; + } + } + } + return true; + }, + html_content::traverse_type::PRE_ORDER); + + /* Leftover before content */ + switch (state) { + case tag_end_opening: + if (cur_tag != nullptr) { + process_opening_tag(); + } + break; + default: + /* Do nothing */ + break; + } + + if (!hc->all_tags.empty() && hc->root_tag) { + html_append_tag_content(pool, start, end - start, hc, hc->root_tag, + exceptions, url_set); + } + + /* Leftover after content */ + switch (state) { + case tags_limit_overflow: + html_append_parsed(hc, {c, (std::size_t)(end - c)}, + false, end - start, hc->parsed); + break; + default: + /* Do nothing */ + break; + } + + if (overflow_input) { + /* + * Append the rest of the input as raw html, this might work as + * further algorithms can skip words when there are too many. + * It is still unclear about urls though... 
+ */ + html_append_parsed(hc, {end, in->len - process_size}, false, + end - start, hc->parsed); + } + + if (!hc->parsed.empty()) { + /* Trim extra spaces at the end if needed */ + if (g_ascii_isspace(hc->parsed.back())) { + auto last_it = std::end(hc->parsed); + + /* Allow last newline */ + if (hc->parsed.back() == '\n') { + --last_it; + } + + hc->parsed.erase(std::find_if(hc->parsed.rbegin(), hc->parsed.rend(), + [](auto ch) -> auto { + return !g_ascii_isspace(ch); + }) + .base(), + last_it); + } + } + + return hc; +} + +/* + * Returns the first image in hc.images that has RSPAMD_HTML_FLAG_IMAGE_EMBEDDED + * set and a non-null src equal to cid, or std::nullopt if there is no such image + */ +static auto +html_find_image_by_cid(const html_content &hc, std::string_view cid) + -> std::optional<const html_image *> +{ + for (const auto *html_image: hc.images) { + /* Filter embedded images */ + if (html_image->flags & RSPAMD_HTML_FLAG_IMAGE_EMBEDDED && + html_image->src != nullptr) { + if (cid == html_image->src) { + return html_image; + } + } + } + + return std::nullopt; +} + +/* + * Dumps the tags tree as a flat string. Each tag that is not marked as + * FL_VIRTUAL or FL_IGNORE is written as a run of plus signs (one per depth + * level, the root being level 1), then the tag name (or "xml" for FL_XML + * tags), then a semicolon. Only tags that are written increase the depth + * of their children. The result is empty when there is no root tag. + */ +auto html_debug_structure(const html_content &hc) -> std::string +{ + std::string output; + + if (hc.root_tag) { + auto rec_functor = [&](const html_tag *t, int level, auto rec_functor) -> void { + std::string pluses(level, '+'); + + if (!(t->flags & (FL_VIRTUAL | FL_IGNORE))) { + if (t->flags & FL_XML) { + output += fmt::format("{}xml;", pluses); + } + else { + output += fmt::format("{}{};", pluses, + html_tags_defs.name_by_id_safe(t->id)); + } + level++; + } + for (const auto *cld: t->children) { + rec_functor(cld, level, rec_functor); + } + }; + + rec_functor(hc.root_tag, 1, rec_functor); + } + + return output; +} + +/* + * Looks the name up in html_tags_defs and returns the tag id, + * or std::nullopt when the lookup finds nothing + */ +auto html_tag_by_name(const std::string_view &name) + -> std::optional<tag_id_t> +{ + const auto *td = rspamd::html::html_tags_defs.by_name(name); + + if (td != nullptr) { + return td->id; + } + + return std::nullopt; +} + +/* + * Returns a view into hc->parsed (or into hc->invisible when the tag has + * a block that is not visible), starting at content_offset and clamped to + * the end of that buffer; the view is empty when content_offset is not + * inside the buffer + */ +auto html_tag::get_content(const struct html_content *hc) const -> std::string_view +{ + const std::string *dest = &hc->parsed; + + if (block && !block->is_visible()) { + dest = &hc->invisible; + } + const auto clen = 
get_content_length(); + if (content_offset < dest->size()) { + if (dest->size() - content_offset >= clen) { + return std::string_view{*dest}.substr(content_offset, clen); + } + else { + return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset); + } + } + + return std::string_view{}; +} + +}// namespace rspamd::html + +/* + * C entry point: forwards all arguments to rspamd::html::html_process_input + * and returns the resulting html_content as an opaque pointer + */ +void * +rspamd_html_process_part_full(struct rspamd_task *task, + GByteArray *in, GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + uint16_t *cur_url_order) +{ + return rspamd::html::html_process_input(task, in, exceptions, url_set, + part_urls, allow_css, cur_url_order); +} + +/* + * Simplified entry point: processes the input using a zero-filled task on + * the stack that carries only the memory pool, with no exceptions list, + * no url set, no part urls and with css processing disabled + */ +void * +rspamd_html_process_part(rspamd_mempool_t *pool, + GByteArray *in) +{ + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + uint16_t order = 0; + + return rspamd_html_process_part_full(&fake_task, in, NULL, + NULL, NULL, FALSE, &order); +} + +/* C wrapper around rspamd::html::decode_html_entitles_inplace */ +guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len) +{ + return rspamd::html::decode_html_entitles_inplace(s, len); +} + +/* Returns the tag id for the name, or -1 when the lookup finds nothing */ +gint rspamd_html_tag_by_name(const gchar *name) +{ + const auto *td = rspamd::html::html_tags_defs.by_name(name); + + if (td != nullptr) { + return td->id; + } + + return -1; +} + +/* + * Asserts that ptr is not NULL; returns the tags_seen entry for the tag + * name, or FALSE when the name cannot be resolved to an id + */ +gboolean +rspamd_html_tag_seen(void *ptr, const gchar *tagname) +{ + gint id; + auto *hc = rspamd::html::html_content::from_ptr(ptr); + + g_assert(hc != NULL); + + id = rspamd_html_tag_by_name(tagname); + + if (id != -1) { + return hc->tags_seen[id]; + } + + return FALSE; +} + +/* + * Returns the tag name for ids strictly between Tag_UNKNOWN and Tag_MAX + * that have a definition, nullptr otherwise + */ +const gchar * +rspamd_html_tag_by_id(gint id) +{ + if (id > Tag_UNKNOWN && id < Tag_MAX) { + const auto *td = rspamd::html::html_tags_defs.by_id(id); + + if (td != nullptr) { + return td->name.c_str(); + } + } + + return nullptr; +} + +/* + * Returns the name of the html_tag pointed to by p; the name length is + * stored in *len when len is not NULL + */ +const gchar * +rspamd_html_tag_name(void *p, gsize *len) +{ + auto *tag = reinterpret_cast<rspamd::html::html_tag *>(p); + auto tname = 
rspamd::html::html_tags_defs.name_by_id_safe(tag->id); + + if (len) { + *len = tname.size(); + } + + return tname.data(); +} + +/* + * Returns the embedded image whose src equals the given content id, + * or nullptr when there is no such image or no html content at all + */ +struct html_image * +rspamd_html_find_embedded_image(void *html_content, + const char *cid, gsize cid_len) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + /* Tolerate a missing content in the same way as rspamd_html_get_tags_count */ + if (!hc) { + return nullptr; + } + + auto maybe_img = rspamd::html::html_find_image_by_cid(*hc, {cid, cid_len}); + + if (maybe_img) { + return (html_image *) maybe_img.value(); + } + + return nullptr; +} + +/* + * Points dest at the parsed text owned by the html content (no copy is made); + * returns false and leaves dest untouched when either argument is NULL + */ +bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + if (!hc || !dest) { + return false; + } + + dest->begin = hc->parsed.data(); + dest->len = hc->parsed.size(); + + return true; +} + +/* Returns the number of tags stored in the html content, 0 when it is NULL */ +gsize rspamd_html_get_tags_count(void *html_content) +{ + auto *hc = rspamd::html::html_content::from_ptr(html_content); + + if (!hc) { + return 0; + } + + return hc->all_tags.size(); +}
\ No newline at end of file diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h new file mode 100644 index 0000000..2d34f2a --- /dev/null +++ b/src/libserver/html/html.h @@ -0,0 +1,137 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_H +#define RSPAMD_HTML_H + +#include "config.h" +#include "libutil/mem_pool.h" +#include "libserver/url.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * HTML content flags + */ +#define RSPAMD_HTML_FLAG_BAD_START (1 << 0) +#define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1) +#define RSPAMD_HTML_FLAG_XML (1 << 2) +#define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3) +#define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4) +#define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5) +#define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6) +#define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7) +#define RSPAMD_HTML_FLAG_HAS_ZEROS (1 << 8) + +/* + * Image flags + */ +#define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0) +#define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1) +#define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2) + + +struct rspamd_image; + +/* + * Image found in HTML content; flags is tested against the + * RSPAMD_HTML_FLAG_IMAGE_* bits defined above + */ +struct html_image { + guint height; + guint width; + guint flags; + gchar *src; + struct rspamd_url *url; + struct rspamd_image *embedded_image; + void *tag; +}; + + +/* Forward declaration */ +struct rspamd_task; + +/* + * Decode HTML entities in text. Text is modified in place. 
+ */ +guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len); + +void *rspamd_html_process_part(rspamd_mempool_t *pool, + GByteArray *in); + +void *rspamd_html_process_part_full(struct rspamd_task *task, + GByteArray *in, GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + uint16_t *cur_url_order); + +/* + * Returns true if a specified tag has been seen in a part + */ +gboolean rspamd_html_tag_seen(void *ptr, const gchar *tagname); + +/** + * Returns name for the specified tag id + * @param id + * @return + */ +const gchar *rspamd_html_tag_by_id(gint id); + +/** + * Returns HTML tag id by name + * @param name + * @return + */ +gint rspamd_html_tag_by_name(const gchar *name); + +/** + * Gets a name for a tag + * @param tag + * @param len + * @return + */ +const gchar *rspamd_html_tag_name(void *tag, gsize *len); + +/** + * Find HTML image by content id + * @param html_content + * @param cid + * @param cid_len + * @return + */ +struct html_image *rspamd_html_find_embedded_image(void *html_content, + const char *cid, gsize cid_len); + +/** + * Stores parsed content in ftok_t structure + * @param html_content + * @param dest + * @return + */ +bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest); + +/** + * Returns number of tags in the html content + * @param html_content + * @return + */ +gsize rspamd_html_get_tags_count(void *html_content); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx new file mode 100644 index 0000000..3320fd6 --- /dev/null +++ b/src/libserver/html/html.hxx @@ -0,0 +1,146 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_HXX +#define RSPAMD_HTML_HXX +#pragma once + +#include "config.h" +#include "libserver/url.h" +#include "libserver/html/html_tag.hxx" +#include "libserver/html/html.h" +#include "libserver/html/html_tags.h" + + +#include <vector> +#include <memory> +#include <string> +#include "function2/function2.hpp" + +namespace rspamd::css { +/* Forward declaration */ +class css_style_sheet; +}// namespace rspamd::css + +namespace rspamd::html { + +struct html_block; + +struct html_content { + struct rspamd_url *base_url = nullptr; + struct html_tag *root_tag = nullptr; + gint flags = 0; + std::vector<bool> tags_seen; + std::vector<html_image *> images; + std::vector<std::unique_ptr<struct html_tag>> all_tags; + std::string parsed; + std::string invisible; + std::shared_ptr<css::css_style_sheet> css_style; + + /* Preallocate and reserve all internal structures */ + html_content() + { + tags_seen.resize(Tag_MAX, false); + all_tags.reserve(128); + parsed.reserve(256); + } + + static void html_content_dtor(void *ptr) + { + delete html_content::from_ptr(ptr); + } + + static auto from_ptr(void *ptr) -> html_content * + { + return static_cast<html_content *>(ptr); + } + + enum class traverse_type { + PRE_ORDER, + POST_ORDER + }; + auto traverse_block_tags(fu2::function<bool(const html_tag *)> &&func, + traverse_type how = traverse_type::PRE_ORDER) const -> bool + { + + if (root_tag == nullptr) { + return false; + } + + auto rec_functor_pre_order = [&](const html_tag *root, auto &&rec) -> bool { + if (func(root)) { + + for (const auto *c: 
root->children) { + if (!rec(c, rec)) { + return false; + } + } + + return true; + } + return false; + }; + auto rec_functor_post_order = [&](const html_tag *root, auto &&rec) -> bool { + for (const auto *c: root->children) { + if (!rec(c, rec)) { + return false; + } + } + + return func(root); + }; + + switch (how) { + case traverse_type::PRE_ORDER: + return rec_functor_pre_order(root_tag, rec_functor_pre_order); + case traverse_type::POST_ORDER: + return rec_functor_post_order(root_tag, rec_functor_post_order); + default: + RSPAMD_UNREACHABLE; + } + } + + auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool + { + for (const auto &tag: all_tags) { + if (!(tag->flags & (FL_XML | FL_VIRTUAL))) { + if (!func(tag.get())) { + return false; + } + } + } + + return true; + } + +private: + ~html_content() = default; +}; + + +auto html_tag_by_name(const std::string_view &name) -> std::optional<tag_id_t>; +auto html_process_input(struct rspamd_task *task, + GByteArray *in, + GList **exceptions, + khash_t(rspamd_url_hash) * url_set, + GPtrArray *part_urls, + bool allow_css, + std::uint16_t *cur_url_order) -> html_content *; +auto html_debug_structure(const html_content &hc) -> std::string; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_HXX diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx new file mode 100644 index 0000000..f9b5184 --- /dev/null +++ b/src/libserver/html/html_block.hxx @@ -0,0 +1,358 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_HTML_BLOCK_HXX +#define RSPAMD_HTML_BLOCK_HXX +#pragma once + +#include "libserver/css/css_value.hxx" +#include <cmath> + +namespace rspamd::html { + +/* + * Block tag definition + */ +struct html_block { + rspamd::css::css_color fg_color; + rspamd::css::css_color bg_color; + std::int16_t height; + std::int16_t width; + rspamd::css::css_display_value display; + std::int8_t font_size; + + unsigned fg_color_mask : 2; + unsigned bg_color_mask : 2; + unsigned height_mask : 2; + unsigned width_mask : 2; + unsigned font_mask : 2; + unsigned display_mask : 2; + unsigned visibility_mask : 2; + + constexpr static const auto unset = 0; + constexpr static const auto inherited = 1; + constexpr static const auto implicit = 1; + constexpr static const auto set = 3; + constexpr static const auto invisible_flag = 1; + constexpr static const auto transparent_flag = 2; + + /* Helpers to set mask when setting the elements */ + auto set_fgcolor(const rspamd::css::css_color &c, int how = html_block::set) -> void + { + fg_color = c; + fg_color_mask = how; + } + auto set_bgcolor(const rspamd::css::css_color &c, int how = html_block::set) -> void + { + bg_color = c; + bg_color_mask = how; + } + auto set_height(float h, bool is_percent = false, int how = html_block::set) -> void + { + h = is_percent ? (-h) : h; + if (h < INT16_MIN) { + /* Negative numbers encode percents... */ + height = -100; + } + else if (h > INT16_MAX) { + height = INT16_MAX; + } + else { + height = h; + } + height_mask = how; + } + + auto set_width(float w, bool is_percent = false, int how = html_block::set) -> void + { + w = is_percent ? 
(-w) : w; + if (w < INT16_MIN) { + width = INT16_MIN; + } + else if (w > INT16_MAX) { + width = INT16_MAX; + } + else { + width = w; + } + width_mask = how; + } + + auto set_display(bool v, int how = html_block::set) -> void + { + if (v) { + display = rspamd::css::css_display_value::DISPLAY_INLINE; + } + else { + display = rspamd::css::css_display_value::DISPLAY_HIDDEN; + } + display_mask = how; + } + + auto set_display(rspamd::css::css_display_value v, int how = html_block::set) -> void + { + display = v; + display_mask = how; + } + + auto set_font_size(float fs, bool is_percent = false, int how = html_block::set) -> void + { + fs = is_percent ? (-fs) : fs; + if (fs < INT8_MIN) { + font_size = -100; + } + else if (fs > INT8_MAX) { + font_size = INT8_MAX; + } + else { + font_size = fs; + } + font_mask = how; + } + +private: + template<typename T, typename MT> + static constexpr auto simple_prop(MT mask_val, MT other_mask, T &our_val, + T other_val) -> MT + { + if (other_mask && other_mask > mask_val) { + our_val = other_val; + mask_val = html_block::inherited; + } + + return mask_val; + } + + /* Sizes propagation logic + * We can have multiple cases: + * 1) Our size is > 0 and we can use it as is + * 2) Parent size is > 0 and our size is undefined, so propagate parent + * 3) Parent size is < 0 and our size is undefined - propagate parent + * 4) Parent size is > 0 and our size is < 0 - multiply parent by abs(ours) + * 5) Parent size is undefined and our size is < 0 - tricky stuff, assume some defaults + */ + template<typename T, typename MT> + static constexpr auto size_prop(MT mask_val, MT other_mask, T &our_val, + T other_val, T default_val) -> MT + { + if (mask_val) { + /* We have our value */ + if (our_val < 0) { + if (other_mask > 0) { + if (other_val >= 0) { + our_val = other_val * (-our_val / 100.0); + } + else { + our_val *= (-other_val / 100.0); + } + } + else { + /* Parent value is not defined and our value is relative */ + our_val = default_val * 
(-our_val / 100.0); + } + } + else if (other_mask && other_mask > mask_val) { + our_val = other_val; + mask_val = html_block::inherited; + } + } + else { + /* We propagate parent if defined */ + if (other_mask && other_mask > mask_val) { + our_val = other_val; + mask_val = html_block::inherited; + } + /* Otherwise do nothing */ + } + + return mask_val; + } + +public: + /** + * Propagate values from the block if they are not defined by the current block + * @param other + * @return + */ + auto propagate_block(const html_block &other) -> void + { + fg_color_mask = html_block::simple_prop(fg_color_mask, other.fg_color_mask, + fg_color, other.fg_color); + bg_color_mask = html_block::simple_prop(bg_color_mask, other.bg_color_mask, + bg_color, other.bg_color); + display_mask = html_block::simple_prop(display_mask, other.display_mask, + display, other.display); + + height_mask = html_block::size_prop(height_mask, other.height_mask, + height, other.height, static_cast<std::int16_t>(800)); + width_mask = html_block::size_prop(width_mask, other.width_mask, + width, other.width, static_cast<std::int16_t>(1024)); + font_mask = html_block::size_prop(font_mask, other.font_mask, + font_size, other.font_size, static_cast<std::int8_t>(10)); + } + + /* + * Set block overriding all inherited values + */ + auto set_block(const html_block &other) -> void + { + constexpr auto set_value = [](auto mask_val, auto other_mask, auto &our_val, + auto other_val) constexpr -> int { + if (other_mask && mask_val != html_block::set) { + our_val = other_val; + mask_val = other_mask; + } + + return mask_val; + }; + + fg_color_mask = set_value(fg_color_mask, other.fg_color_mask, fg_color, other.fg_color); + bg_color_mask = set_value(bg_color_mask, other.bg_color_mask, bg_color, other.bg_color); + display_mask = set_value(display_mask, other.display_mask, display, other.display); + height_mask = set_value(height_mask, other.height_mask, height, other.height); + width_mask = set_value(width_mask, 
other.width_mask, width, other.width); + font_mask = set_value(font_mask, other.font_mask, font_size, other.font_size); + } + + auto compute_visibility(void) -> void + { + if (display_mask) { + if (display == css::css_display_value::DISPLAY_HIDDEN) { + visibility_mask = html_block::invisible_flag; + + return; + } + } + + if (font_mask) { + if (font_size == 0) { + visibility_mask = html_block::invisible_flag; + + return; + } + } + + auto is_similar_colors = [](const rspamd::css::css_color &fg, const rspamd::css::css_color &bg) -> bool { + constexpr const auto min_visible_diff = 0.1f; + auto diff_r = ((float) fg.r - bg.r); + auto diff_g = ((float) fg.g - bg.g); + auto diff_b = ((float) fg.b - bg.b); + auto ravg = ((float) fg.r + bg.r) / 2.0f; + + /* Square diffs */ + diff_r *= diff_r; + diff_g *= diff_g; + diff_b *= diff_b; + + auto diff = std::sqrt(2.0f * diff_r + 4.0f * diff_g + 3.0f * diff_b + + (ravg * (diff_r - diff_b) / 256.0f)) / + 256.0f; + + return diff < min_visible_diff; + }; + /* Check if we have both bg/fg colors */ + if (fg_color_mask && bg_color_mask) { + if (fg_color.alpha < 10) { + /* Too transparent */ + visibility_mask = html_block::transparent_flag; + + return; + } + + if (bg_color.alpha > 10) { + if (is_similar_colors(fg_color, bg_color)) { + visibility_mask = html_block::transparent_flag; + return; + } + } + } + else if (fg_color_mask) { + /* Merely fg color */ + if (fg_color.alpha < 10) { + /* Too transparent */ + visibility_mask = html_block::transparent_flag; + + return; + } + + /* Implicit fg color */ + if (is_similar_colors(fg_color, rspamd::css::css_color::white())) { + visibility_mask = html_block::transparent_flag; + return; + } + } + else if (bg_color_mask) { + if (bg_color.alpha > 10) { + if (is_similar_colors(rspamd::css::css_color::black(), bg_color)) { + visibility_mask = html_block::transparent_flag; + return; + } + } + } + + visibility_mask = html_block::unset; + } + + constexpr auto is_visible(void) const -> bool + { + return 
visibility_mask == html_block::unset; + } + + constexpr auto is_transparent(void) const -> bool + { + return visibility_mask == html_block::transparent_flag; + } + + constexpr auto has_display(int how = html_block::set) const -> bool + { + return display_mask >= how; + } + + /** + * Returns a default html block for root HTML element + * @return + */ + static auto default_html_block(void) -> html_block + { + return html_block{.fg_color = rspamd::css::css_color::black(), + .bg_color = rspamd::css::css_color::white(), + .height = 0, + .width = 0, + .display = rspamd::css::css_display_value::DISPLAY_INLINE, + .font_size = 12, + .fg_color_mask = html_block::inherited, + .bg_color_mask = html_block::inherited, + .height_mask = html_block::unset, + .width_mask = html_block::unset, + .font_mask = html_block::unset, + .display_mask = html_block::inherited, + .visibility_mask = html_block::unset}; + } + /** + * Produces html block with no defined values allocated from the pool + * @param pool + * @return + */ + static auto undefined_html_block_pool(rspamd_mempool_t *pool) -> html_block * + { + auto *bl = rspamd_mempool_alloc0_type(pool, html_block); + + return bl; + } +}; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_BLOCK_HXX diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx new file mode 100644 index 0000000..c642536 --- /dev/null +++ b/src/libserver/html/html_entities.cxx @@ -0,0 +1,2644 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "html_entities.hxx"
+
+#include <string>
+#include <utility>
+#include <vector>
+#include "contrib/ankerl/unordered_dense.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include "libutil/cxx/util.hxx"
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+namespace rspamd::html {
+/*
+ * One row of the static table of named HTML entities.
+ * Rows are built positionally by the ENTITY_DEF/ENTITY_DEF_HEUR macros,
+ * so the order of the fields must not be changed.
+ */
+struct html_entity_def {
+	const char *name; /* entity name without '&' and ';', e.g. "szlig" */
+	const char *replacement; /* replacement text as UTF-8 bytes, e.g. "\xc3\x9f" */
+	unsigned code; /* numeric code of the entity, e.g. 223 for "szlig" */
+	bool allow_heuristic; /* NOTE(review): presumably permits lenient matching of this entity - confirm against the decoder */
+};
+/*
+ * Table row constructors. Note that the macro arguments are ordered as
+ * (name, code, replacement) while the struct stores the replacement before
+ * the code. ENTITY_DEF sets allow_heuristic to false, ENTITY_DEF_HEUR
+ * sets it to true.
+ */
+#define ENTITY_DEF(name, code, replacement) \
+	html_entity_def \
+	{ \
+		(name), (replacement), (code), false \
+	}
+#define ENTITY_DEF_HEUR(name, code, replacement) \
+	html_entity_def \
+	{ \
+		(name), (replacement), (code), true \
+	}
+
+static const html_entity_def html_entities_array[] = {
+	ENTITY_DEF_HEUR("szlig", 223, "\xc3\x9f"),
+	ENTITY_DEF("prime", 8242, "\xe2\x80\xb2"),
+	ENTITY_DEF("lnsim", 8934, "\xe2\x8b\xa6"),
+	ENTITY_DEF("nvDash", 8877, "\xe2\x8a\xad"),
+	ENTITY_DEF("isinsv", 8947, "\xe2\x8b\xb3"),
+	ENTITY_DEF("notin", 8713, "\xe2\x88\x89"),
+	ENTITY_DEF("becaus", 8757, "\xe2\x88\xb5"),
+	ENTITY_DEF("Leftrightarrow", 8660, "\xe2\x87\x94"),
+	ENTITY_DEF("EmptySmallSquare", 9723, "\xe2\x97\xbb"),
+	ENTITY_DEF("SquareUnion", 8852, "\xe2\x8a\x94"),
+	ENTITY_DEF("subdot", 10941, "\xe2\xaa\xbd"),
+	ENTITY_DEF("Dstrok", 272, "\xc4\x90"),
+	ENTITY_DEF("rrarr", 8649, "\xe2\x87\x89"),
+	ENTITY_DEF("rArr", 8658, "\xe2\x87\x92"),
+	ENTITY_DEF_HEUR("Aacute", 193, "\xc3\x81"),
+	ENTITY_DEF("kappa", 954, "\xce\xba"),
+	ENTITY_DEF("Iopf", 120128, "\xf0\x9d\x95\x80"),
+	ENTITY_DEF("hyphen", 8208, "\xe2\x80\x90"),
+	ENTITY_DEF("rarrbfs", 10528, "\xe2\xa4\xa0"),
+	ENTITY_DEF("supsetneqq", 10956, "\xe2\xab\x8c"),
+	ENTITY_DEF("gacute", 501, "\xc7\xb5"),
+	ENTITY_DEF("VeryThinSpace", 8202, "\xe2\x80\x8a"),
+	ENTITY_DEF("tint", 8749, "\xe2\x88\xad"),
+	ENTITY_DEF("ffr", 120099,
"\xf0\x9d\x94\xa3"), + ENTITY_DEF("kgreen", 312, "\xc4\xb8"), + ENTITY_DEF("nis", 8956, "\xe2\x8b\xbc"), + ENTITY_DEF("NotRightTriangleBar", 10704, "\xe2\xa7\x90\xcc\xb8"), + ENTITY_DEF("Eogon", 280, "\xc4\x98"), + ENTITY_DEF("lbrke", 10635, "\xe2\xa6\x8b"), + ENTITY_DEF("phi", 966, "\xcf\x86"), + ENTITY_DEF("notnivc", 8957, "\xe2\x8b\xbd"), + ENTITY_DEF("utilde", 361, "\xc5\xa9"), + ENTITY_DEF("Fopf", 120125, "\xf0\x9d\x94\xbd"), + ENTITY_DEF("Vcy", 1042, "\xd0\x92"), + ENTITY_DEF("erDot", 8787, "\xe2\x89\x93"), + ENTITY_DEF("nsubE", 10949, "\xe2\xab\x85\xcc\xb8"), + ENTITY_DEF_HEUR("egrave", 232, "\xc3\xa8"), + ENTITY_DEF("Lcedil", 315, "\xc4\xbb"), + ENTITY_DEF("lharul", 10602, "\xe2\xa5\xaa"), + ENTITY_DEF_HEUR("middot", 183, "\xc2\xb7"), + ENTITY_DEF("ggg", 8921, "\xe2\x8b\x99"), + ENTITY_DEF("NestedLessLess", 8810, "\xe2\x89\xaa"), + ENTITY_DEF("tau", 964, "\xcf\x84"), + ENTITY_DEF("setmn", 8726, "\xe2\x88\x96"), + ENTITY_DEF("frac78", 8542, "\xe2\x85\x9e"), + ENTITY_DEF_HEUR("para", 182, "\xc2\xb6"), + ENTITY_DEF("Rcedil", 342, "\xc5\x96"), + ENTITY_DEF("propto", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("sqsubset", 8847, "\xe2\x8a\x8f"), + ENTITY_DEF("ensp", 8194, "\xe2\x80\x82"), + ENTITY_DEF("boxvH", 9578, "\xe2\x95\xaa"), + ENTITY_DEF("NotGreaterTilde", 8821, "\xe2\x89\xb5"), + ENTITY_DEF("ffllig", 64260, "\xef\xac\x84"), + ENTITY_DEF("kcedil", 311, "\xc4\xb7"), + ENTITY_DEF("omega", 969, "\xcf\x89"), + ENTITY_DEF("sime", 8771, "\xe2\x89\x83"), + ENTITY_DEF("LeftTriangleEqual", 8884, "\xe2\x8a\xb4"), + ENTITY_DEF("bsemi", 8271, "\xe2\x81\x8f"), + ENTITY_DEF("rdquor", 8221, "\xe2\x80\x9d"), + ENTITY_DEF("Utilde", 360, "\xc5\xa8"), + ENTITY_DEF("bsol", 92, "\x5c"), + ENTITY_DEF("risingdotseq", 8787, "\xe2\x89\x93"), + ENTITY_DEF("ultri", 9720, "\xe2\x97\xb8"), + ENTITY_DEF("rhov", 1009, "\xcf\xb1"), + ENTITY_DEF("TildeEqual", 8771, "\xe2\x89\x83"), + ENTITY_DEF("jukcy", 1108, "\xd1\x94"), + ENTITY_DEF("perp", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("capbrcup", 
10825, "\xe2\xa9\x89"), + ENTITY_DEF("ltrie", 8884, "\xe2\x8a\xb4"), + ENTITY_DEF("LessTilde", 8818, "\xe2\x89\xb2"), + ENTITY_DEF("popf", 120161, "\xf0\x9d\x95\xa1"), + ENTITY_DEF("dbkarow", 10511, "\xe2\xa4\x8f"), + ENTITY_DEF("roang", 10221, "\xe2\x9f\xad"), + ENTITY_DEF_HEUR("brvbar", 166, "\xc2\xa6"), + ENTITY_DEF("CenterDot", 183, "\xc2\xb7"), + ENTITY_DEF("notindot", 8949, "\xe2\x8b\xb5\xcc\xb8"), + ENTITY_DEF("supmult", 10946, "\xe2\xab\x82"), + ENTITY_DEF("multimap", 8888, "\xe2\x8a\xb8"), + ENTITY_DEF_HEUR("frac34", 190, "\xc2\xbe"), + ENTITY_DEF("mapsto", 8614, "\xe2\x86\xa6"), + ENTITY_DEF("flat", 9837, "\xe2\x99\xad"), + ENTITY_DEF("updownarrow", 8597, "\xe2\x86\x95"), + ENTITY_DEF("gne", 10888, "\xe2\xaa\x88"), + ENTITY_DEF("nrarrc", 10547, "\xe2\xa4\xb3\xcc\xb8"), + ENTITY_DEF("suphsol", 10185, "\xe2\x9f\x89"), + ENTITY_DEF("nGtv", 8811, "\xe2\x89\xab\xcc\xb8"), + ENTITY_DEF("hopf", 120153, "\xf0\x9d\x95\x99"), + ENTITY_DEF("pointint", 10773, "\xe2\xa8\x95"), + ENTITY_DEF("glj", 10916, "\xe2\xaa\xa4"), + ENTITY_DEF("LeftDoubleBracket", 10214, "\xe2\x9f\xa6"), + ENTITY_DEF("NotSupersetEqual", 8841, "\xe2\x8a\x89"), + ENTITY_DEF("dot", 729, "\xcb\x99"), + ENTITY_DEF("tbrk", 9140, "\xe2\x8e\xb4"), + ENTITY_DEF("LeftUpDownVector", 10577, "\xe2\xa5\x91"), + ENTITY_DEF_HEUR("uml", 168, "\xc2\xa8"), + ENTITY_DEF("bbrk", 9141, "\xe2\x8e\xb5"), + ENTITY_DEF("nearrow", 8599, "\xe2\x86\x97"), + ENTITY_DEF("backsimeq", 8909, "\xe2\x8b\x8d"), + ENTITY_DEF("dblac", 733, "\xcb\x9d"), + ENTITY_DEF("circleddash", 8861, "\xe2\x8a\x9d"), + ENTITY_DEF("ldsh", 8626, "\xe2\x86\xb2"), + ENTITY_DEF("sce", 10928, "\xe2\xaa\xb0"), + ENTITY_DEF("angst", 197, "\xc3\x85"), + ENTITY_DEF_HEUR("yen", 165, "\xc2\xa5"), + ENTITY_DEF("nsupE", 10950, "\xe2\xab\x86\xcc\xb8"), + ENTITY_DEF("Uscr", 119984, "\xf0\x9d\x92\xb0"), + ENTITY_DEF("subplus", 10943, "\xe2\xaa\xbf"), + ENTITY_DEF("nleqq", 8806, "\xe2\x89\xa6\xcc\xb8"), + ENTITY_DEF("nprcue", 8928, "\xe2\x8b\xa0"), + 
ENTITY_DEF("Ocirc", 212, "\xc3\x94"), + ENTITY_DEF("disin", 8946, "\xe2\x8b\xb2"), + ENTITY_DEF("EqualTilde", 8770, "\xe2\x89\x82"), + ENTITY_DEF("YUcy", 1070, "\xd0\xae"), + ENTITY_DEF("Kscr", 119974, "\xf0\x9d\x92\xa6"), + ENTITY_DEF("lg", 8822, "\xe2\x89\xb6"), + ENTITY_DEF("nLeftrightarrow", 8654, "\xe2\x87\x8e"), + ENTITY_DEF("eplus", 10865, "\xe2\xa9\xb1"), + ENTITY_DEF("les", 10877, "\xe2\xa9\xbd"), + ENTITY_DEF("sfr", 120112, "\xf0\x9d\x94\xb0"), + ENTITY_DEF("HumpDownHump", 8782, "\xe2\x89\x8e"), + ENTITY_DEF("Fouriertrf", 8497, "\xe2\x84\xb1"), + ENTITY_DEF("Updownarrow", 8661, "\xe2\x87\x95"), + ENTITY_DEF("nrarr", 8603, "\xe2\x86\x9b"), + ENTITY_DEF("radic", 8730, "\xe2\x88\x9a"), + ENTITY_DEF("gnap", 10890, "\xe2\xaa\x8a"), + ENTITY_DEF("zeta", 950, "\xce\xb6"), + ENTITY_DEF("Qscr", 119980, "\xf0\x9d\x92\xac"), + ENTITY_DEF("NotRightTriangleEqual", 8941, "\xe2\x8b\xad"), + ENTITY_DEF("nshortmid", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("SHCHcy", 1065, "\xd0\xa9"), + ENTITY_DEF("piv", 982, "\xcf\x96"), + ENTITY_DEF("angmsdaa", 10664, "\xe2\xa6\xa8"), + ENTITY_DEF("curlywedge", 8911, "\xe2\x8b\x8f"), + ENTITY_DEF("sqcaps", 8851, "\xe2\x8a\x93\xef\xb8\x80"), + ENTITY_DEF("sum", 8721, "\xe2\x88\x91"), + ENTITY_DEF("rarrtl", 8611, "\xe2\x86\xa3"), + ENTITY_DEF("gescc", 10921, "\xe2\xaa\xa9"), + ENTITY_DEF("sup", 8835, "\xe2\x8a\x83"), + ENTITY_DEF("smid", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("cularr", 8630, "\xe2\x86\xb6"), + ENTITY_DEF("olcross", 10683, "\xe2\xa6\xbb"), + ENTITY_DEF_HEUR("GT", 62, "\x3e"), + ENTITY_DEF("scap", 10936, "\xe2\xaa\xb8"), + ENTITY_DEF("capcup", 10823, "\xe2\xa9\x87"), + ENTITY_DEF("NotSquareSubsetEqual", 8930, "\xe2\x8b\xa2"), + ENTITY_DEF("uhblk", 9600, "\xe2\x96\x80"), + ENTITY_DEF("latail", 10521, "\xe2\xa4\x99"), + ENTITY_DEF("smtes", 10924, "\xe2\xaa\xac\xef\xb8\x80"), + ENTITY_DEF("RoundImplies", 10608, "\xe2\xa5\xb0"), + ENTITY_DEF("wreath", 8768, "\xe2\x89\x80"), + ENTITY_DEF("curlyvee", 8910, "\xe2\x8b\x8e"), + 
ENTITY_DEF("uscr", 120010, "\xf0\x9d\x93\x8a"), + ENTITY_DEF("nleftrightarrow", 8622, "\xe2\x86\xae"), + ENTITY_DEF("ucy", 1091, "\xd1\x83"), + ENTITY_DEF("nvge", 8805, "\xe2\x89\xa5\xe2\x83\x92"), + ENTITY_DEF("bnot", 8976, "\xe2\x8c\x90"), + ENTITY_DEF("alefsym", 8501, "\xe2\x84\xb5"), + ENTITY_DEF("star", 9734, "\xe2\x98\x86"), + ENTITY_DEF("boxHd", 9572, "\xe2\x95\xa4"), + ENTITY_DEF("vsubnE", 10955, "\xe2\xab\x8b\xef\xb8\x80"), + ENTITY_DEF("Popf", 8473, "\xe2\x84\x99"), + ENTITY_DEF("simgE", 10912, "\xe2\xaa\xa0"), + ENTITY_DEF("upsilon", 965, "\xcf\x85"), + ENTITY_DEF("NoBreak", 8288, "\xe2\x81\xa0"), + ENTITY_DEF("realine", 8475, "\xe2\x84\x9b"), + ENTITY_DEF("frac38", 8540, "\xe2\x85\x9c"), + ENTITY_DEF("YAcy", 1071, "\xd0\xaf"), + ENTITY_DEF("bnequiv", 8801, "\xe2\x89\xa1\xe2\x83\xa5"), + ENTITY_DEF("cudarrr", 10549, "\xe2\xa4\xb5"), + ENTITY_DEF("lsime", 10893, "\xe2\xaa\x8d"), + ENTITY_DEF("lowbar", 95, "\x5f"), + ENTITY_DEF("utdot", 8944, "\xe2\x8b\xb0"), + ENTITY_DEF("ReverseElement", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("nshortparallel", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("DJcy", 1026, "\xd0\x82"), + ENTITY_DEF("nsube", 8840, "\xe2\x8a\x88"), + ENTITY_DEF("VDash", 8875, "\xe2\x8a\xab"), + ENTITY_DEF("Ncaron", 327, "\xc5\x87"), + ENTITY_DEF("LeftUpVector", 8639, "\xe2\x86\xbf"), + ENTITY_DEF("Kcy", 1050, "\xd0\x9a"), + ENTITY_DEF("NotLeftTriangleEqual", 8940, "\xe2\x8b\xac"), + ENTITY_DEF("nvHarr", 10500, "\xe2\xa4\x84"), + ENTITY_DEF("lotimes", 10804, "\xe2\xa8\xb4"), + ENTITY_DEF("RightFloor", 8971, "\xe2\x8c\x8b"), + ENTITY_DEF("succ", 8827, "\xe2\x89\xbb"), + ENTITY_DEF("Ucy", 1059, "\xd0\xa3"), + ENTITY_DEF("darr", 8595, "\xe2\x86\x93"), + ENTITY_DEF("lbarr", 10508, "\xe2\xa4\x8c"), + ENTITY_DEF("xfr", 120117, "\xf0\x9d\x94\xb5"), + ENTITY_DEF("zopf", 120171, "\xf0\x9d\x95\xab"), + ENTITY_DEF("Phi", 934, "\xce\xa6"), + ENTITY_DEF("ord", 10845, "\xe2\xa9\x9d"), + ENTITY_DEF("iinfin", 10716, "\xe2\xa7\x9c"), + ENTITY_DEF("Xfr", 120091, 
"\xf0\x9d\x94\x9b"), + ENTITY_DEF("qint", 10764, "\xe2\xa8\x8c"), + ENTITY_DEF("Upsilon", 933, "\xce\xa5"), + ENTITY_DEF("NotSubset", 8834, "\xe2\x8a\x82\xe2\x83\x92"), + ENTITY_DEF("gfr", 120100, "\xf0\x9d\x94\xa4"), + ENTITY_DEF("notnivb", 8958, "\xe2\x8b\xbe"), + ENTITY_DEF("Afr", 120068, "\xf0\x9d\x94\x84"), + ENTITY_DEF_HEUR("ge", 8805, "\xe2\x89\xa5"), + ENTITY_DEF_HEUR("iexcl", 161, "\xc2\xa1"), + ENTITY_DEF("dfr", 120097, "\xf0\x9d\x94\xa1"), + ENTITY_DEF("rsaquo", 8250, "\xe2\x80\xba"), + ENTITY_DEF("xcap", 8898, "\xe2\x8b\x82"), + ENTITY_DEF("Jopf", 120129, "\xf0\x9d\x95\x81"), + ENTITY_DEF("Hstrok", 294, "\xc4\xa6"), + ENTITY_DEF("ldca", 10550, "\xe2\xa4\xb6"), + ENTITY_DEF("lmoust", 9136, "\xe2\x8e\xb0"), + ENTITY_DEF("wcirc", 373, "\xc5\xb5"), + ENTITY_DEF("DownRightVector", 8641, "\xe2\x87\x81"), + ENTITY_DEF("LessFullEqual", 8806, "\xe2\x89\xa6"), + ENTITY_DEF("dotsquare", 8865, "\xe2\x8a\xa1"), + ENTITY_DEF("zhcy", 1078, "\xd0\xb6"), + ENTITY_DEF("mDDot", 8762, "\xe2\x88\xba"), + ENTITY_DEF("Prime", 8243, "\xe2\x80\xb3"), + ENTITY_DEF("prec", 8826, "\xe2\x89\xba"), + ENTITY_DEF("swnwar", 10538, "\xe2\xa4\xaa"), + ENTITY_DEF_HEUR("COPY", 169, "\xc2\xa9"), + ENTITY_DEF("cong", 8773, "\xe2\x89\x85"), + ENTITY_DEF("sacute", 347, "\xc5\x9b"), + ENTITY_DEF("Nopf", 8469, "\xe2\x84\x95"), + ENTITY_DEF("it", 8290, "\xe2\x81\xa2"), + ENTITY_DEF("SOFTcy", 1068, "\xd0\xac"), + ENTITY_DEF("uuarr", 8648, "\xe2\x87\x88"), + ENTITY_DEF("iota", 953, "\xce\xb9"), + ENTITY_DEF("notinE", 8953, "\xe2\x8b\xb9\xcc\xb8"), + ENTITY_DEF("jfr", 120103, "\xf0\x9d\x94\xa7"), + ENTITY_DEF_HEUR("QUOT", 34, "\x22"), + ENTITY_DEF("vsupnE", 10956, "\xe2\xab\x8c\xef\xb8\x80"), + ENTITY_DEF_HEUR("igrave", 236, "\xc3\xac"), + ENTITY_DEF("bsim", 8765, "\xe2\x88\xbd"), + ENTITY_DEF("npreceq", 10927, "\xe2\xaa\xaf\xcc\xb8"), + ENTITY_DEF("zcaron", 382, "\xc5\xbe"), + ENTITY_DEF("DD", 8517, "\xe2\x85\x85"), + ENTITY_DEF("gamma", 947, "\xce\xb3"), + ENTITY_DEF("homtht", 8763, 
"\xe2\x88\xbb"), + ENTITY_DEF("NonBreakingSpace", 160, "\xc2\xa0"), + ENTITY_DEF("Proportion", 8759, "\xe2\x88\xb7"), + ENTITY_DEF("nedot", 8784, "\xe2\x89\x90\xcc\xb8"), + ENTITY_DEF("nabla", 8711, "\xe2\x88\x87"), + ENTITY_DEF("ac", 8766, "\xe2\x88\xbe"), + ENTITY_DEF("nsupe", 8841, "\xe2\x8a\x89"), + ENTITY_DEF("ell", 8467, "\xe2\x84\x93"), + ENTITY_DEF("boxvR", 9566, "\xe2\x95\x9e"), + ENTITY_DEF("LowerRightArrow", 8600, "\xe2\x86\x98"), + ENTITY_DEF("boxHu", 9575, "\xe2\x95\xa7"), + ENTITY_DEF("lE", 8806, "\xe2\x89\xa6"), + ENTITY_DEF("dzigrarr", 10239, "\xe2\x9f\xbf"), + ENTITY_DEF("rfloor", 8971, "\xe2\x8c\x8b"), + ENTITY_DEF("gneq", 10888, "\xe2\xaa\x88"), + ENTITY_DEF("rightleftharpoons", 8652, "\xe2\x87\x8c"), + ENTITY_DEF("gtquest", 10876, "\xe2\xa9\xbc"), + ENTITY_DEF("searhk", 10533, "\xe2\xa4\xa5"), + ENTITY_DEF("gesdoto", 10882, "\xe2\xaa\x82"), + ENTITY_DEF("cross", 10007, "\xe2\x9c\x97"), + ENTITY_DEF("rdquo", 8221, "\xe2\x80\x9d"), + ENTITY_DEF("sqsupset", 8848, "\xe2\x8a\x90"), + ENTITY_DEF("divonx", 8903, "\xe2\x8b\x87"), + ENTITY_DEF("lat", 10923, "\xe2\xaa\xab"), + ENTITY_DEF("rmoustache", 9137, "\xe2\x8e\xb1"), + ENTITY_DEF("succapprox", 10936, "\xe2\xaa\xb8"), + ENTITY_DEF("nhpar", 10994, "\xe2\xab\xb2"), + ENTITY_DEF("sharp", 9839, "\xe2\x99\xaf"), + ENTITY_DEF("lrcorner", 8991, "\xe2\x8c\x9f"), + ENTITY_DEF("Vscr", 119985, "\xf0\x9d\x92\xb1"), + ENTITY_DEF("varsigma", 962, "\xcf\x82"), + ENTITY_DEF("bsolb", 10693, "\xe2\xa7\x85"), + ENTITY_DEF("cupcap", 10822, "\xe2\xa9\x86"), + ENTITY_DEF("leftrightarrow", 8596, "\xe2\x86\x94"), + ENTITY_DEF("LeftTee", 8867, "\xe2\x8a\xa3"), + ENTITY_DEF("Sqrt", 8730, "\xe2\x88\x9a"), + ENTITY_DEF("Odblac", 336, "\xc5\x90"), + ENTITY_DEF("ocir", 8858, "\xe2\x8a\x9a"), + ENTITY_DEF("eqslantless", 10901, "\xe2\xaa\x95"), + ENTITY_DEF("supedot", 10948, "\xe2\xab\x84"), + ENTITY_DEF("intercal", 8890, "\xe2\x8a\xba"), + ENTITY_DEF("Gbreve", 286, "\xc4\x9e"), + ENTITY_DEF("xrArr", 10233, "\xe2\x9f\xb9"), + 
ENTITY_DEF("NotTildeEqual", 8772, "\xe2\x89\x84"), + ENTITY_DEF("Bfr", 120069, "\xf0\x9d\x94\x85"), + ENTITY_DEF_HEUR("Iuml", 207, "\xc3\x8f"), + ENTITY_DEF("leg", 8922, "\xe2\x8b\x9a"), + ENTITY_DEF("boxhU", 9576, "\xe2\x95\xa8"), + ENTITY_DEF("Gopf", 120126, "\xf0\x9d\x94\xbe"), + ENTITY_DEF("af", 8289, "\xe2\x81\xa1"), + ENTITY_DEF("xwedge", 8896, "\xe2\x8b\x80"), + ENTITY_DEF("precapprox", 10935, "\xe2\xaa\xb7"), + ENTITY_DEF("lcedil", 316, "\xc4\xbc"), + ENTITY_DEF("between", 8812, "\xe2\x89\xac"), + ENTITY_DEF_HEUR("Oslash", 216, "\xc3\x98"), + ENTITY_DEF("breve", 728, "\xcb\x98"), + ENTITY_DEF("caps", 8745, "\xe2\x88\xa9\xef\xb8\x80"), + ENTITY_DEF("vangrt", 10652, "\xe2\xa6\x9c"), + ENTITY_DEF("lagran", 8466, "\xe2\x84\x92"), + ENTITY_DEF("kopf", 120156, "\xf0\x9d\x95\x9c"), + ENTITY_DEF("ReverseUpEquilibrium", 10607, "\xe2\xa5\xaf"), + ENTITY_DEF("nlsim", 8820, "\xe2\x89\xb4"), + ENTITY_DEF("Cap", 8914, "\xe2\x8b\x92"), + ENTITY_DEF("angmsdac", 10666, "\xe2\xa6\xaa"), + ENTITY_DEF("iocy", 1105, "\xd1\x91"), + ENTITY_DEF("seswar", 10537, "\xe2\xa4\xa9"), + ENTITY_DEF("dzcy", 1119, "\xd1\x9f"), + ENTITY_DEF("nsubset", 8834, "\xe2\x8a\x82\xe2\x83\x92"), + ENTITY_DEF("cup", 8746, "\xe2\x88\xaa"), + ENTITY_DEF("npar", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("late", 10925, "\xe2\xaa\xad"), + ENTITY_DEF("plussim", 10790, "\xe2\xa8\xa6"), + ENTITY_DEF("Darr", 8609, "\xe2\x86\xa1"), + ENTITY_DEF("nexist", 8708, "\xe2\x88\x84"), + ENTITY_DEF_HEUR("cent", 162, "\xc2\xa2"), + ENTITY_DEF("khcy", 1093, "\xd1\x85"), + ENTITY_DEF("smallsetminus", 8726, "\xe2\x88\x96"), + ENTITY_DEF("ycirc", 375, "\xc5\xb7"), + ENTITY_DEF("lharu", 8636, "\xe2\x86\xbc"), + ENTITY_DEF("upuparrows", 8648, "\xe2\x87\x88"), + ENTITY_DEF("sigmaf", 962, "\xcf\x82"), + ENTITY_DEF("nltri", 8938, "\xe2\x8b\xaa"), + ENTITY_DEF("mstpos", 8766, "\xe2\x88\xbe"), + ENTITY_DEF("Zopf", 8484, "\xe2\x84\xa4"), + ENTITY_DEF("dwangle", 10662, "\xe2\xa6\xa6"), + ENTITY_DEF("bowtie", 8904, "\xe2\x8b\x88"), + 
ENTITY_DEF("Dfr", 120071, "\xf0\x9d\x94\x87"), + ENTITY_DEF_HEUR("iacute", 237, "\xc3\xad"), + ENTITY_DEF("njcy", 1114, "\xd1\x9a"), + ENTITY_DEF("cfr", 120096, "\xf0\x9d\x94\xa0"), + ENTITY_DEF("TripleDot", 8411, "\xe2\x83\x9b"), + ENTITY_DEF("Or", 10836, "\xe2\xa9\x94"), + ENTITY_DEF("blk34", 9619, "\xe2\x96\x93"), + ENTITY_DEF("equiv", 8801, "\xe2\x89\xa1"), + ENTITY_DEF("fflig", 64256, "\xef\xac\x80"), + ENTITY_DEF("Rang", 10219, "\xe2\x9f\xab"), + ENTITY_DEF("Wopf", 120142, "\xf0\x9d\x95\x8e"), + ENTITY_DEF("boxUl", 9564, "\xe2\x95\x9c"), + ENTITY_DEF_HEUR("frac12", 189, "\xc2\xbd"), + ENTITY_DEF("clubs", 9827, "\xe2\x99\xa3"), + ENTITY_DEF("amalg", 10815, "\xe2\xa8\xbf"), + ENTITY_DEF("Lang", 10218, "\xe2\x9f\xaa"), + ENTITY_DEF("asymp", 8776, "\xe2\x89\x88"), + ENTITY_DEF("models", 8871, "\xe2\x8a\xa7"), + ENTITY_DEF("emptyset", 8709, "\xe2\x88\x85"), + ENTITY_DEF("Tscr", 119983, "\xf0\x9d\x92\xaf"), + ENTITY_DEF("nleftarrow", 8602, "\xe2\x86\x9a"), + ENTITY_DEF("Omacr", 332, "\xc5\x8c"), + ENTITY_DEF("gtrarr", 10616, "\xe2\xa5\xb8"), + ENTITY_DEF("excl", 33, "\x21"), + ENTITY_DEF("rarrw", 8605, "\xe2\x86\x9d"), + ENTITY_DEF("abreve", 259, "\xc4\x83"), + ENTITY_DEF("CircleTimes", 8855, "\xe2\x8a\x97"), + ENTITY_DEF("aopf", 120146, "\xf0\x9d\x95\x92"), + ENTITY_DEF("eqvparsl", 10725, "\xe2\xa7\xa5"), + ENTITY_DEF("boxv", 9474, "\xe2\x94\x82"), + ENTITY_DEF("SuchThat", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("varphi", 981, "\xcf\x95"), + ENTITY_DEF("Ropf", 8477, "\xe2\x84\x9d"), + ENTITY_DEF("rscr", 120007, "\xf0\x9d\x93\x87"), + ENTITY_DEF("Rrightarrow", 8667, "\xe2\x87\x9b"), + ENTITY_DEF("equest", 8799, "\xe2\x89\x9f"), + ENTITY_DEF_HEUR("ntilde", 241, "\xc3\xb1"), + ENTITY_DEF("Escr", 8496, "\xe2\x84\xb0"), + ENTITY_DEF("Lopf", 120131, "\xf0\x9d\x95\x83"), + ENTITY_DEF("GreaterGreater", 10914, "\xe2\xaa\xa2"), + ENTITY_DEF("pluscir", 10786, "\xe2\xa8\xa2"), + ENTITY_DEF("nsupset", 8835, "\xe2\x8a\x83\xe2\x83\x92"), + ENTITY_DEF("uArr", 8657, "\xe2\x87\x91"), + 
ENTITY_DEF("nwarhk", 10531, "\xe2\xa4\xa3"), + ENTITY_DEF("Ycirc", 374, "\xc5\xb6"), + ENTITY_DEF("tdot", 8411, "\xe2\x83\x9b"), + ENTITY_DEF("circledS", 9416, "\xe2\x93\x88"), + ENTITY_DEF("lhard", 8637, "\xe2\x86\xbd"), + ENTITY_DEF("iukcy", 1110, "\xd1\x96"), + ENTITY_DEF("PrecedesSlantEqual", 8828, "\xe2\x89\xbc"), + ENTITY_DEF("Sfr", 120086, "\xf0\x9d\x94\x96"), + ENTITY_DEF("egs", 10902, "\xe2\xaa\x96"), + ENTITY_DEF("oelig", 339, "\xc5\x93"), + ENTITY_DEF("bigtriangledown", 9661, "\xe2\x96\xbd"), + ENTITY_DEF("EmptyVerySmallSquare", 9643, "\xe2\x96\xab"), + ENTITY_DEF("Backslash", 8726, "\xe2\x88\x96"), + ENTITY_DEF("nscr", 120003, "\xf0\x9d\x93\x83"), + ENTITY_DEF("uogon", 371, "\xc5\xb3"), + ENTITY_DEF("circeq", 8791, "\xe2\x89\x97"), + ENTITY_DEF("check", 10003, "\xe2\x9c\x93"), + ENTITY_DEF("Sup", 8913, "\xe2\x8b\x91"), + ENTITY_DEF("Rcaron", 344, "\xc5\x98"), + ENTITY_DEF("lneqq", 8808, "\xe2\x89\xa8"), + ENTITY_DEF("lrhar", 8651, "\xe2\x87\x8b"), + ENTITY_DEF("ulcorn", 8988, "\xe2\x8c\x9c"), + ENTITY_DEF("timesd", 10800, "\xe2\xa8\xb0"), + ENTITY_DEF("Sum", 8721, "\xe2\x88\x91"), + ENTITY_DEF("varpropto", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("Lcaron", 317, "\xc4\xbd"), + ENTITY_DEF("lbrkslu", 10637, "\xe2\xa6\x8d"), + ENTITY_DEF_HEUR("AElig", 198, "\xc3\x86"), + ENTITY_DEF("varr", 8597, "\xe2\x86\x95"), + ENTITY_DEF("nvinfin", 10718, "\xe2\xa7\x9e"), + ENTITY_DEF("leq", 8804, "\xe2\x89\xa4"), + ENTITY_DEF("biguplus", 10756, "\xe2\xa8\x84"), + ENTITY_DEF("rpar", 41, "\x29"), + ENTITY_DEF("eng", 331, "\xc5\x8b"), + ENTITY_DEF("NegativeThinSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("lesssim", 8818, "\xe2\x89\xb2"), + ENTITY_DEF("lBarr", 10510, "\xe2\xa4\x8e"), + ENTITY_DEF("LeftUpTeeVector", 10592, "\xe2\xa5\xa0"), + ENTITY_DEF("gnE", 8809, "\xe2\x89\xa9"), + ENTITY_DEF("efr", 120098, "\xf0\x9d\x94\xa2"), + ENTITY_DEF("barvee", 8893, "\xe2\x8a\xbd"), + ENTITY_DEF("ee", 8519, "\xe2\x85\x87"), + ENTITY_DEF("Uogon", 370, "\xc5\xb2"), + ENTITY_DEF("gnapprox", 
10890, "\xe2\xaa\x8a"), + ENTITY_DEF("olcir", 10686, "\xe2\xa6\xbe"), + ENTITY_DEF("boxUL", 9565, "\xe2\x95\x9d"), + ENTITY_DEF("Gg", 8921, "\xe2\x8b\x99"), + ENTITY_DEF("CloseCurlyQuote", 8217, "\xe2\x80\x99"), + ENTITY_DEF("leftharpoondown", 8637, "\xe2\x86\xbd"), + ENTITY_DEF("vfr", 120115, "\xf0\x9d\x94\xb3"), + ENTITY_DEF("gvertneqq", 8809, "\xe2\x89\xa9\xef\xb8\x80"), + ENTITY_DEF_HEUR("ouml", 246, "\xc3\xb6"), + ENTITY_DEF("raemptyv", 10675, "\xe2\xa6\xb3"), + ENTITY_DEF("Zcaron", 381, "\xc5\xbd"), + ENTITY_DEF("scE", 10932, "\xe2\xaa\xb4"), + ENTITY_DEF("boxvh", 9532, "\xe2\x94\xbc"), + ENTITY_DEF("ominus", 8854, "\xe2\x8a\x96"), + ENTITY_DEF("oopf", 120160, "\xf0\x9d\x95\xa0"), + ENTITY_DEF("nsucceq", 10928, "\xe2\xaa\xb0\xcc\xb8"), + ENTITY_DEF("RBarr", 10512, "\xe2\xa4\x90"), + ENTITY_DEF("iprod", 10812, "\xe2\xa8\xbc"), + ENTITY_DEF("lvnE", 8808, "\xe2\x89\xa8\xef\xb8\x80"), + ENTITY_DEF("andand", 10837, "\xe2\xa9\x95"), + ENTITY_DEF("upharpoonright", 8638, "\xe2\x86\xbe"), + ENTITY_DEF("ncongdot", 10861, "\xe2\xa9\xad\xcc\xb8"), + ENTITY_DEF("drcrop", 8972, "\xe2\x8c\x8c"), + ENTITY_DEF("nsimeq", 8772, "\xe2\x89\x84"), + ENTITY_DEF("subsub", 10965, "\xe2\xab\x95"), + ENTITY_DEF("hardcy", 1098, "\xd1\x8a"), + ENTITY_DEF("leqslant", 10877, "\xe2\xa9\xbd"), + ENTITY_DEF("uharl", 8639, "\xe2\x86\xbf"), + ENTITY_DEF("expectation", 8496, "\xe2\x84\xb0"), + ENTITY_DEF("mdash", 8212, "\xe2\x80\x94"), + ENTITY_DEF("VerticalTilde", 8768, "\xe2\x89\x80"), + ENTITY_DEF("rdldhar", 10601, "\xe2\xa5\xa9"), + ENTITY_DEF("leftharpoonup", 8636, "\xe2\x86\xbc"), + ENTITY_DEF("mu", 956, "\xce\xbc"), + ENTITY_DEF("curarrm", 10556, "\xe2\xa4\xbc"), + ENTITY_DEF("Cdot", 266, "\xc4\x8a"), + ENTITY_DEF("NotTildeTilde", 8777, "\xe2\x89\x89"), + ENTITY_DEF("boxul", 9496, "\xe2\x94\x98"), + ENTITY_DEF("planckh", 8462, "\xe2\x84\x8e"), + ENTITY_DEF("CapitalDifferentialD", 8517, "\xe2\x85\x85"), + ENTITY_DEF("boxDL", 9559, "\xe2\x95\x97"), + ENTITY_DEF("cupbrcap", 10824, 
"\xe2\xa9\x88"), + ENTITY_DEF("boxdL", 9557, "\xe2\x95\x95"), + ENTITY_DEF("supe", 8839, "\xe2\x8a\x87"), + ENTITY_DEF("nvlt", 60, "\x3c\xe2\x83\x92"), + ENTITY_DEF("par", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("InvisibleComma", 8291, "\xe2\x81\xa3"), + ENTITY_DEF("ring", 730, "\xcb\x9a"), + ENTITY_DEF("nvap", 8781, "\xe2\x89\x8d\xe2\x83\x92"), + ENTITY_DEF("veeeq", 8794, "\xe2\x89\x9a"), + ENTITY_DEF("Hfr", 8460, "\xe2\x84\x8c"), + ENTITY_DEF("dstrok", 273, "\xc4\x91"), + ENTITY_DEF("gesles", 10900, "\xe2\xaa\x94"), + ENTITY_DEF("dash", 8208, "\xe2\x80\x90"), + ENTITY_DEF("SHcy", 1064, "\xd0\xa8"), + ENTITY_DEF("congdot", 10861, "\xe2\xa9\xad"), + ENTITY_DEF("imagline", 8464, "\xe2\x84\x90"), + ENTITY_DEF("ncy", 1085, "\xd0\xbd"), + ENTITY_DEF("bigstar", 9733, "\xe2\x98\x85"), + ENTITY_DEF_HEUR("REG", 174, "\xc2\xae"), + ENTITY_DEF("triangleq", 8796, "\xe2\x89\x9c"), + ENTITY_DEF("rsqb", 93, "\x5d"), + ENTITY_DEF("ddarr", 8650, "\xe2\x87\x8a"), + ENTITY_DEF("csub", 10959, "\xe2\xab\x8f"), + ENTITY_DEF("quest", 63, "\x3f"), + ENTITY_DEF("Star", 8902, "\xe2\x8b\x86"), + ENTITY_DEF_HEUR("LT", 60, "\x3c"), + ENTITY_DEF("ncong", 8775, "\xe2\x89\x87"), + ENTITY_DEF("prnE", 10933, "\xe2\xaa\xb5"), + ENTITY_DEF("bigtriangleup", 9651, "\xe2\x96\xb3"), + ENTITY_DEF("Tilde", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("ltrif", 9666, "\xe2\x97\x82"), + ENTITY_DEF("ldrdhar", 10599, "\xe2\xa5\xa7"), + ENTITY_DEF("lcaron", 318, "\xc4\xbe"), + ENTITY_DEF("equivDD", 10872, "\xe2\xa9\xb8"), + ENTITY_DEF("lHar", 10594, "\xe2\xa5\xa2"), + ENTITY_DEF("vBar", 10984, "\xe2\xab\xa8"), + ENTITY_DEF("Mopf", 120132, "\xf0\x9d\x95\x84"), + ENTITY_DEF("LeftArrow", 8592, "\xe2\x86\x90"), + ENTITY_DEF("Rho", 929, "\xce\xa1"), + ENTITY_DEF("Ccirc", 264, "\xc4\x88"), + ENTITY_DEF("ifr", 120102, "\xf0\x9d\x94\xa6"), + ENTITY_DEF("cacute", 263, "\xc4\x87"), + ENTITY_DEF("centerdot", 183, "\xc2\xb7"), + ENTITY_DEF("dollar", 36, "\x24"), + ENTITY_DEF("lang", 10216, "\xe2\x9f\xa8"), + 
ENTITY_DEF("curvearrowright", 8631, "\xe2\x86\xb7"), + ENTITY_DEF("half", 189, "\xc2\xbd"), + ENTITY_DEF("Ecy", 1069, "\xd0\xad"), + ENTITY_DEF("rcub", 125, "\x7d"), + ENTITY_DEF("rcy", 1088, "\xd1\x80"), + ENTITY_DEF("isins", 8948, "\xe2\x8b\xb4"), + ENTITY_DEF("bsolhsub", 10184, "\xe2\x9f\x88"), + ENTITY_DEF("boxuL", 9563, "\xe2\x95\x9b"), + ENTITY_DEF("shchcy", 1097, "\xd1\x89"), + ENTITY_DEF("cwconint", 8754, "\xe2\x88\xb2"), + ENTITY_DEF("euro", 8364, "\xe2\x82\xac"), + ENTITY_DEF("lesseqqgtr", 10891, "\xe2\xaa\x8b"), + ENTITY_DEF("sim", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("rarrc", 10547, "\xe2\xa4\xb3"), + ENTITY_DEF("boxdl", 9488, "\xe2\x94\x90"), + ENTITY_DEF("Epsilon", 917, "\xce\x95"), + ENTITY_DEF("iiiint", 10764, "\xe2\xa8\x8c"), + ENTITY_DEF("Rightarrow", 8658, "\xe2\x87\x92"), + ENTITY_DEF("conint", 8750, "\xe2\x88\xae"), + ENTITY_DEF("boxDl", 9558, "\xe2\x95\x96"), + ENTITY_DEF("kappav", 1008, "\xcf\xb0"), + ENTITY_DEF("profsurf", 8979, "\xe2\x8c\x93"), + ENTITY_DEF_HEUR("auml", 228, "\xc3\xa4"), + ENTITY_DEF("heartsuit", 9829, "\xe2\x99\xa5"), + ENTITY_DEF_HEUR("eacute", 233, "\xc3\xa9"), + ENTITY_DEF_HEUR("gt", 62, "\x3e"), + ENTITY_DEF("Gcedil", 290, "\xc4\xa2"), + ENTITY_DEF("easter", 10862, "\xe2\xa9\xae"), + ENTITY_DEF("Tcy", 1058, "\xd0\xa2"), + ENTITY_DEF("swarrow", 8601, "\xe2\x86\x99"), + ENTITY_DEF("lopf", 120157, "\xf0\x9d\x95\x9d"), + ENTITY_DEF("Agrave", 192, "\xc3\x80"), + ENTITY_DEF("Aring", 197, "\xc3\x85"), + ENTITY_DEF("fpartint", 10765, "\xe2\xa8\x8d"), + ENTITY_DEF("xoplus", 10753, "\xe2\xa8\x81"), + ENTITY_DEF("LeftDownTeeVector", 10593, "\xe2\xa5\xa1"), + ENTITY_DEF("int", 8747, "\xe2\x88\xab"), + ENTITY_DEF("Zeta", 918, "\xce\x96"), + ENTITY_DEF("loz", 9674, "\xe2\x97\x8a"), + ENTITY_DEF("ncup", 10818, "\xe2\xa9\x82"), + ENTITY_DEF("napE", 10864, "\xe2\xa9\xb0\xcc\xb8"), + ENTITY_DEF("csup", 10960, "\xe2\xab\x90"), + ENTITY_DEF("Ncedil", 325, "\xc5\x85"), + ENTITY_DEF("cuwed", 8911, "\xe2\x8b\x8f"), + ENTITY_DEF("Dot", 168, 
"\xc2\xa8"), + ENTITY_DEF("SquareIntersection", 8851, "\xe2\x8a\x93"), + ENTITY_DEF("map", 8614, "\xe2\x86\xa6"), + ENTITY_DEF_HEUR("aelig", 230, "\xc3\xa6"), + ENTITY_DEF("RightArrow", 8594, "\xe2\x86\x92"), + ENTITY_DEF("rightharpoondown", 8641, "\xe2\x87\x81"), + ENTITY_DEF("bNot", 10989, "\xe2\xab\xad"), + ENTITY_DEF("nsccue", 8929, "\xe2\x8b\xa1"), + ENTITY_DEF("zigrarr", 8669, "\xe2\x87\x9d"), + ENTITY_DEF("Sacute", 346, "\xc5\x9a"), + ENTITY_DEF("orv", 10843, "\xe2\xa9\x9b"), + ENTITY_DEF("RightVectorBar", 10579, "\xe2\xa5\x93"), + ENTITY_DEF("nrarrw", 8605, "\xe2\x86\x9d\xcc\xb8"), + ENTITY_DEF("nbump", 8782, "\xe2\x89\x8e\xcc\xb8"), + ENTITY_DEF_HEUR("iquest", 191, "\xc2\xbf"), + ENTITY_DEF("wr", 8768, "\xe2\x89\x80"), + ENTITY_DEF("UpArrow", 8593, "\xe2\x86\x91"), + ENTITY_DEF("notinva", 8713, "\xe2\x88\x89"), + ENTITY_DEF("ddagger", 8225, "\xe2\x80\xa1"), + ENTITY_DEF("nLeftarrow", 8653, "\xe2\x87\x8d"), + ENTITY_DEF("rbbrk", 10099, "\xe2\x9d\xb3"), + ENTITY_DEF("RightTriangle", 8883, "\xe2\x8a\xb3"), + ENTITY_DEF("leqq", 8806, "\xe2\x89\xa6"), + ENTITY_DEF("Vert", 8214, "\xe2\x80\x96"), + ENTITY_DEF("gesl", 8923, "\xe2\x8b\x9b\xef\xb8\x80"), + ENTITY_DEF("LeftTeeVector", 10586, "\xe2\xa5\x9a"), + ENTITY_DEF("Union", 8899, "\xe2\x8b\x83"), + ENTITY_DEF("sc", 8827, "\xe2\x89\xbb"), + ENTITY_DEF("ofr", 120108, "\xf0\x9d\x94\xac"), + ENTITY_DEF("quatint", 10774, "\xe2\xa8\x96"), + ENTITY_DEF("apacir", 10863, "\xe2\xa9\xaf"), + ENTITY_DEF("profalar", 9006, "\xe2\x8c\xae"), + ENTITY_DEF("subsetneq", 8842, "\xe2\x8a\x8a"), + ENTITY_DEF("Vvdash", 8874, "\xe2\x8a\xaa"), + ENTITY_DEF("ohbar", 10677, "\xe2\xa6\xb5"), + ENTITY_DEF("Gt", 8811, "\xe2\x89\xab"), + ENTITY_DEF("exist", 8707, "\xe2\x88\x83"), + ENTITY_DEF("gtrapprox", 10886, "\xe2\xaa\x86"), + ENTITY_DEF_HEUR("euml", 235, "\xc3\xab"), + ENTITY_DEF("Equilibrium", 8652, "\xe2\x87\x8c"), + ENTITY_DEF_HEUR("aacute", 225, "\xc3\xa1"), + ENTITY_DEF("omid", 10678, "\xe2\xa6\xb6"), + ENTITY_DEF("loarr", 8701, 
"\xe2\x87\xbd"), + ENTITY_DEF("SucceedsSlantEqual", 8829, "\xe2\x89\xbd"), + ENTITY_DEF("angsph", 8738, "\xe2\x88\xa2"), + ENTITY_DEF("nsmid", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("lsquor", 8218, "\xe2\x80\x9a"), + ENTITY_DEF("cemptyv", 10674, "\xe2\xa6\xb2"), + ENTITY_DEF("rAarr", 8667, "\xe2\x87\x9b"), + ENTITY_DEF("searr", 8600, "\xe2\x86\x98"), + ENTITY_DEF("complexes", 8450, "\xe2\x84\x82"), + ENTITY_DEF("UnderParenthesis", 9181, "\xe2\x8f\x9d"), + ENTITY_DEF("nparsl", 11005, "\xe2\xab\xbd\xe2\x83\xa5"), + ENTITY_DEF("Lacute", 313, "\xc4\xb9"), + ENTITY_DEF_HEUR("deg", 176, "\xc2\xb0"), + ENTITY_DEF("Racute", 340, "\xc5\x94"), + ENTITY_DEF("Verbar", 8214, "\xe2\x80\x96"), + ENTITY_DEF("sqcups", 8852, "\xe2\x8a\x94\xef\xb8\x80"), + ENTITY_DEF("Hopf", 8461, "\xe2\x84\x8d"), + ENTITY_DEF("naturals", 8469, "\xe2\x84\x95"), + ENTITY_DEF("Cedilla", 184, "\xc2\xb8"), + ENTITY_DEF("exponentiale", 8519, "\xe2\x85\x87"), + ENTITY_DEF("vnsup", 8835, "\xe2\x8a\x83\xe2\x83\x92"), + ENTITY_DEF("leftrightarrows", 8646, "\xe2\x87\x86"), + ENTITY_DEF("Laplacetrf", 8466, "\xe2\x84\x92"), + ENTITY_DEF("vartriangleright", 8883, "\xe2\x8a\xb3"), + ENTITY_DEF("rtri", 9657, "\xe2\x96\xb9"), + ENTITY_DEF("gE", 8807, "\xe2\x89\xa7"), + ENTITY_DEF("SmallCircle", 8728, "\xe2\x88\x98"), + ENTITY_DEF("diamondsuit", 9830, "\xe2\x99\xa6"), + ENTITY_DEF_HEUR("Otilde", 213, "\xc3\x95"), + ENTITY_DEF("lneq", 10887, "\xe2\xaa\x87"), + ENTITY_DEF("lesdoto", 10881, "\xe2\xaa\x81"), + ENTITY_DEF("ltquest", 10875, "\xe2\xa9\xbb"), + ENTITY_DEF("thinsp", 8201, "\xe2\x80\x89"), + ENTITY_DEF("barwed", 8965, "\xe2\x8c\x85"), + ENTITY_DEF("elsdot", 10903, "\xe2\xaa\x97"), + ENTITY_DEF("circ", 710, "\xcb\x86"), + ENTITY_DEF("ni", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("mlcp", 10971, "\xe2\xab\x9b"), + ENTITY_DEF("Vdash", 8873, "\xe2\x8a\xa9"), + ENTITY_DEF("ShortRightArrow", 8594, "\xe2\x86\x92"), + ENTITY_DEF("upharpoonleft", 8639, "\xe2\x86\xbf"), + ENTITY_DEF("UnderBracket", 9141, "\xe2\x8e\xb5"), + 
ENTITY_DEF("rAtail", 10524, "\xe2\xa4\x9c"), + ENTITY_DEF("iopf", 120154, "\xf0\x9d\x95\x9a"), + ENTITY_DEF("longleftarrow", 10229, "\xe2\x9f\xb5"), + ENTITY_DEF("Zacute", 377, "\xc5\xb9"), + ENTITY_DEF("duhar", 10607, "\xe2\xa5\xaf"), + ENTITY_DEF("Mfr", 120080, "\xf0\x9d\x94\x90"), + ENTITY_DEF("prnap", 10937, "\xe2\xaa\xb9"), + ENTITY_DEF("eqcirc", 8790, "\xe2\x89\x96"), + ENTITY_DEF("rarrlp", 8620, "\xe2\x86\xac"), + ENTITY_DEF("le", 8804, "\xe2\x89\xa4"), + ENTITY_DEF("Oscr", 119978, "\xf0\x9d\x92\xaa"), + ENTITY_DEF("langd", 10641, "\xe2\xa6\x91"), + ENTITY_DEF("Ucirc", 219, "\xc3\x9b"), + ENTITY_DEF("precnapprox", 10937, "\xe2\xaa\xb9"), + ENTITY_DEF("succcurlyeq", 8829, "\xe2\x89\xbd"), + ENTITY_DEF("Tau", 932, "\xce\xa4"), + ENTITY_DEF("larr", 8592, "\xe2\x86\x90"), + ENTITY_DEF("neArr", 8663, "\xe2\x87\x97"), + ENTITY_DEF("subsim", 10951, "\xe2\xab\x87"), + ENTITY_DEF("DScy", 1029, "\xd0\x85"), + ENTITY_DEF("preccurlyeq", 8828, "\xe2\x89\xbc"), + ENTITY_DEF("NotLessLess", 8810, "\xe2\x89\xaa\xcc\xb8"), + ENTITY_DEF("succnapprox", 10938, "\xe2\xaa\xba"), + ENTITY_DEF("prcue", 8828, "\xe2\x89\xbc"), + ENTITY_DEF("Downarrow", 8659, "\xe2\x87\x93"), + ENTITY_DEF("angmsdah", 10671, "\xe2\xa6\xaf"), + ENTITY_DEF("Emacr", 274, "\xc4\x92"), + ENTITY_DEF("lsh", 8624, "\xe2\x86\xb0"), + ENTITY_DEF("simne", 8774, "\xe2\x89\x86"), + ENTITY_DEF("Bumpeq", 8782, "\xe2\x89\x8e"), + ENTITY_DEF("RightUpTeeVector", 10588, "\xe2\xa5\x9c"), + ENTITY_DEF("Sigma", 931, "\xce\xa3"), + ENTITY_DEF("nvltrie", 8884, "\xe2\x8a\xb4\xe2\x83\x92"), + ENTITY_DEF("lfr", 120105, "\xf0\x9d\x94\xa9"), + ENTITY_DEF("emsp13", 8196, "\xe2\x80\x84"), + ENTITY_DEF("parsl", 11005, "\xe2\xab\xbd"), + ENTITY_DEF_HEUR("ucirc", 251, "\xc3\xbb"), + ENTITY_DEF("gsiml", 10896, "\xe2\xaa\x90"), + ENTITY_DEF("xsqcup", 10758, "\xe2\xa8\x86"), + ENTITY_DEF("Omicron", 927, "\xce\x9f"), + ENTITY_DEF("gsime", 10894, "\xe2\xaa\x8e"), + ENTITY_DEF("circlearrowleft", 8634, "\xe2\x86\xba"), + ENTITY_DEF("sqsupe", 
8850, "\xe2\x8a\x92"), + ENTITY_DEF("supE", 10950, "\xe2\xab\x86"), + ENTITY_DEF("dlcrop", 8973, "\xe2\x8c\x8d"), + ENTITY_DEF("RightDownTeeVector", 10589, "\xe2\xa5\x9d"), + ENTITY_DEF("Colone", 10868, "\xe2\xa9\xb4"), + ENTITY_DEF("awconint", 8755, "\xe2\x88\xb3"), + ENTITY_DEF("smte", 10924, "\xe2\xaa\xac"), + ENTITY_DEF("lEg", 10891, "\xe2\xaa\x8b"), + ENTITY_DEF("circledast", 8859, "\xe2\x8a\x9b"), + ENTITY_DEF("ecolon", 8789, "\xe2\x89\x95"), + ENTITY_DEF("rect", 9645, "\xe2\x96\xad"), + ENTITY_DEF("Equal", 10869, "\xe2\xa9\xb5"), + ENTITY_DEF("nwnear", 10535, "\xe2\xa4\xa7"), + ENTITY_DEF("capdot", 10816, "\xe2\xa9\x80"), + ENTITY_DEF("straightphi", 981, "\xcf\x95"), + ENTITY_DEF("forkv", 10969, "\xe2\xab\x99"), + ENTITY_DEF("ZHcy", 1046, "\xd0\x96"), + ENTITY_DEF("Element", 8712, "\xe2\x88\x88"), + ENTITY_DEF("rthree", 8908, "\xe2\x8b\x8c"), + ENTITY_DEF("vzigzag", 10650, "\xe2\xa6\x9a"), + ENTITY_DEF("hybull", 8259, "\xe2\x81\x83"), + ENTITY_DEF("intprod", 10812, "\xe2\xa8\xbc"), + ENTITY_DEF("HumpEqual", 8783, "\xe2\x89\x8f"), + ENTITY_DEF("bigsqcup", 10758, "\xe2\xa8\x86"), + ENTITY_DEF("mp", 8723, "\xe2\x88\x93"), + ENTITY_DEF("lescc", 10920, "\xe2\xaa\xa8"), + ENTITY_DEF("NotPrecedes", 8832, "\xe2\x8a\x80"), + ENTITY_DEF("wedge", 8743, "\xe2\x88\xa7"), + ENTITY_DEF("Supset", 8913, "\xe2\x8b\x91"), + ENTITY_DEF("pm", 177, "\xc2\xb1"), + ENTITY_DEF("kfr", 120104, "\xf0\x9d\x94\xa8"), + ENTITY_DEF("ufisht", 10622, "\xe2\xa5\xbe"), + ENTITY_DEF("ecaron", 283, "\xc4\x9b"), + ENTITY_DEF("chcy", 1095, "\xd1\x87"), + ENTITY_DEF("Esim", 10867, "\xe2\xa9\xb3"), + ENTITY_DEF("fltns", 9649, "\xe2\x96\xb1"), + ENTITY_DEF("nsce", 10928, "\xe2\xaa\xb0\xcc\xb8"), + ENTITY_DEF("hookrightarrow", 8618, "\xe2\x86\xaa"), + ENTITY_DEF("semi", 59, "\x3b"), + ENTITY_DEF("ges", 10878, "\xe2\xa9\xbe"), + ENTITY_DEF("approxeq", 8778, "\xe2\x89\x8a"), + ENTITY_DEF("rarrsim", 10612, "\xe2\xa5\xb4"), + ENTITY_DEF("boxhD", 9573, "\xe2\x95\xa5"), + ENTITY_DEF("varpi", 982, 
"\xcf\x96"), + ENTITY_DEF("larrb", 8676, "\xe2\x87\xa4"), + ENTITY_DEF("copf", 120148, "\xf0\x9d\x95\x94"), + ENTITY_DEF("Dopf", 120123, "\xf0\x9d\x94\xbb"), + ENTITY_DEF("LeftVector", 8636, "\xe2\x86\xbc"), + ENTITY_DEF("iff", 8660, "\xe2\x87\x94"), + ENTITY_DEF("lnap", 10889, "\xe2\xaa\x89"), + ENTITY_DEF("NotGreaterFullEqual", 8807, "\xe2\x89\xa7\xcc\xb8"), + ENTITY_DEF("varrho", 1009, "\xcf\xb1"), + ENTITY_DEF("NotSucceeds", 8833, "\xe2\x8a\x81"), + ENTITY_DEF("ltrPar", 10646, "\xe2\xa6\x96"), + ENTITY_DEF("nlE", 8806, "\xe2\x89\xa6\xcc\xb8"), + ENTITY_DEF("Zfr", 8488, "\xe2\x84\xa8"), + ENTITY_DEF("LeftArrowBar", 8676, "\xe2\x87\xa4"), + ENTITY_DEF("boxplus", 8862, "\xe2\x8a\x9e"), + ENTITY_DEF("sqsube", 8849, "\xe2\x8a\x91"), + ENTITY_DEF("Re", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("Wfr", 120090, "\xf0\x9d\x94\x9a"), + ENTITY_DEF("epsi", 949, "\xce\xb5"), + ENTITY_DEF("oacute", 243, "\xc3\xb3"), + ENTITY_DEF("bdquo", 8222, "\xe2\x80\x9e"), + ENTITY_DEF("wscr", 120012, "\xf0\x9d\x93\x8c"), + ENTITY_DEF("bullet", 8226, "\xe2\x80\xa2"), + ENTITY_DEF("frown", 8994, "\xe2\x8c\xa2"), + ENTITY_DEF("siml", 10909, "\xe2\xaa\x9d"), + ENTITY_DEF("Rarr", 8608, "\xe2\x86\xa0"), + ENTITY_DEF("Scaron", 352, "\xc5\xa0"), + ENTITY_DEF("gtreqqless", 10892, "\xe2\xaa\x8c"), + ENTITY_DEF("Larr", 8606, "\xe2\x86\x9e"), + ENTITY_DEF("notniva", 8716, "\xe2\x88\x8c"), + ENTITY_DEF("gg", 8811, "\xe2\x89\xab"), + ENTITY_DEF("phmmat", 8499, "\xe2\x84\xb3"), + ENTITY_DEF("boxVL", 9571, "\xe2\x95\xa3"), + ENTITY_DEF("sigmav", 962, "\xcf\x82"), + ENTITY_DEF("order", 8500, "\xe2\x84\xb4"), + ENTITY_DEF("subsup", 10963, "\xe2\xab\x93"), + ENTITY_DEF("afr", 120094, "\xf0\x9d\x94\x9e"), + ENTITY_DEF("lbrace", 123, "\x7b"), + ENTITY_DEF("urcorn", 8989, "\xe2\x8c\x9d"), + ENTITY_DEF("Im", 8465, "\xe2\x84\x91"), + ENTITY_DEF("CounterClockwiseContourIntegral", 8755, "\xe2\x88\xb3"), + ENTITY_DEF("lne", 10887, "\xe2\xaa\x87"), + ENTITY_DEF("chi", 967, "\xcf\x87"), + ENTITY_DEF("cudarrl", 10552, 
"\xe2\xa4\xb8"), + ENTITY_DEF("ang", 8736, "\xe2\x88\xa0"), + ENTITY_DEF("isindot", 8949, "\xe2\x8b\xb5"), + ENTITY_DEF("Lfr", 120079, "\xf0\x9d\x94\x8f"), + ENTITY_DEF("Rsh", 8625, "\xe2\x86\xb1"), + ENTITY_DEF("Ocy", 1054, "\xd0\x9e"), + ENTITY_DEF("nvrArr", 10499, "\xe2\xa4\x83"), + ENTITY_DEF("otimes", 8855, "\xe2\x8a\x97"), + ENTITY_DEF("eqslantgtr", 10902, "\xe2\xaa\x96"), + ENTITY_DEF("Rfr", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("blacktriangleleft", 9666, "\xe2\x97\x82"), + ENTITY_DEF("Lsh", 8624, "\xe2\x86\xb0"), + ENTITY_DEF("boxvr", 9500, "\xe2\x94\x9c"), + ENTITY_DEF("scedil", 351, "\xc5\x9f"), + ENTITY_DEF_HEUR("iuml", 239, "\xc3\xaf"), + ENTITY_DEF("NJcy", 1034, "\xd0\x8a"), + ENTITY_DEF("Dagger", 8225, "\xe2\x80\xa1"), + ENTITY_DEF("rarrap", 10613, "\xe2\xa5\xb5"), + ENTITY_DEF("udblac", 369, "\xc5\xb1"), + ENTITY_DEF("Sopf", 120138, "\xf0\x9d\x95\x8a"), + ENTITY_DEF("scnsim", 8937, "\xe2\x8b\xa9"), + ENTITY_DEF("hbar", 8463, "\xe2\x84\x8f"), + ENTITY_DEF("frac15", 8533, "\xe2\x85\x95"), + ENTITY_DEF_HEUR("sup3", 179, "\xc2\xb3"), + ENTITY_DEF("NegativeThickSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("npr", 8832, "\xe2\x8a\x80"), + ENTITY_DEF("doteq", 8784, "\xe2\x89\x90"), + ENTITY_DEF("subrarr", 10617, "\xe2\xa5\xb9"), + ENTITY_DEF("SquareSubset", 8847, "\xe2\x8a\x8f"), + ENTITY_DEF("vprop", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("OpenCurlyQuote", 8216, "\xe2\x80\x98"), + ENTITY_DEF("supseteq", 8839, "\xe2\x8a\x87"), + ENTITY_DEF("nRightarrow", 8655, "\xe2\x87\x8f"), + ENTITY_DEF("Longleftarrow", 10232, "\xe2\x9f\xb8"), + ENTITY_DEF("lsquo", 8216, "\xe2\x80\x98"), + ENTITY_DEF("hstrok", 295, "\xc4\xa7"), + ENTITY_DEF("NotTilde", 8769, "\xe2\x89\x81"), + ENTITY_DEF("ogt", 10689, "\xe2\xa7\x81"), + ENTITY_DEF("block", 9608, "\xe2\x96\x88"), + ENTITY_DEF("minusd", 8760, "\xe2\x88\xb8"), + ENTITY_DEF("esdot", 8784, "\xe2\x89\x90"), + ENTITY_DEF("nsim", 8769, "\xe2\x89\x81"), + ENTITY_DEF("scsim", 8831, "\xe2\x89\xbf"), + ENTITY_DEF("boxVl", 9570, 
"\xe2\x95\xa2"), + ENTITY_DEF("ltimes", 8905, "\xe2\x8b\x89"), + ENTITY_DEF("thkap", 8776, "\xe2\x89\x88"), + ENTITY_DEF("vnsub", 8834, "\xe2\x8a\x82\xe2\x83\x92"), + ENTITY_DEF("thetasym", 977, "\xcf\x91"), + ENTITY_DEF("eopf", 120150, "\xf0\x9d\x95\x96"), + ENTITY_DEF("image", 8465, "\xe2\x84\x91"), + ENTITY_DEF("doteqdot", 8785, "\xe2\x89\x91"), + ENTITY_DEF("Udblac", 368, "\xc5\xb0"), + ENTITY_DEF("gnsim", 8935, "\xe2\x8b\xa7"), + ENTITY_DEF("yicy", 1111, "\xd1\x97"), + ENTITY_DEF("vopf", 120167, "\xf0\x9d\x95\xa7"), + ENTITY_DEF("DDotrahd", 10513, "\xe2\xa4\x91"), + ENTITY_DEF("Iota", 921, "\xce\x99"), + ENTITY_DEF("GJcy", 1027, "\xd0\x83"), + ENTITY_DEF("rightthreetimes", 8908, "\xe2\x8b\x8c"), + ENTITY_DEF("nrtri", 8939, "\xe2\x8b\xab"), + ENTITY_DEF("TildeFullEqual", 8773, "\xe2\x89\x85"), + ENTITY_DEF("Dcaron", 270, "\xc4\x8e"), + ENTITY_DEF("ccaron", 269, "\xc4\x8d"), + ENTITY_DEF("lacute", 314, "\xc4\xba"), + ENTITY_DEF("VerticalBar", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("Igrave", 204, "\xc3\x8c"), + ENTITY_DEF("boxH", 9552, "\xe2\x95\x90"), + ENTITY_DEF("Pfr", 120083, "\xf0\x9d\x94\x93"), + ENTITY_DEF("equals", 61, "\x3d"), + ENTITY_DEF("rbrack", 93, "\x5d"), + ENTITY_DEF("OverParenthesis", 9180, "\xe2\x8f\x9c"), + ENTITY_DEF("in", 8712, "\xe2\x88\x88"), + ENTITY_DEF("llcorner", 8990, "\xe2\x8c\x9e"), + ENTITY_DEF("mcomma", 10793, "\xe2\xa8\xa9"), + ENTITY_DEF("NotGreater", 8815, "\xe2\x89\xaf"), + ENTITY_DEF("midcir", 10992, "\xe2\xab\xb0"), + ENTITY_DEF("Edot", 278, "\xc4\x96"), + ENTITY_DEF("oplus", 8853, "\xe2\x8a\x95"), + ENTITY_DEF("geqq", 8807, "\xe2\x89\xa7"), + ENTITY_DEF("curvearrowleft", 8630, "\xe2\x86\xb6"), + ENTITY_DEF("Poincareplane", 8460, "\xe2\x84\x8c"), + ENTITY_DEF("yscr", 120014, "\xf0\x9d\x93\x8e"), + ENTITY_DEF("ccaps", 10829, "\xe2\xa9\x8d"), + ENTITY_DEF("rpargt", 10644, "\xe2\xa6\x94"), + ENTITY_DEF("topfork", 10970, "\xe2\xab\x9a"), + ENTITY_DEF("Gamma", 915, "\xce\x93"), + ENTITY_DEF("umacr", 363, "\xc5\xab"), + 
ENTITY_DEF("frac13", 8531, "\xe2\x85\x93"), + ENTITY_DEF("cirfnint", 10768, "\xe2\xa8\x90"), + ENTITY_DEF("xlArr", 10232, "\xe2\x9f\xb8"), + ENTITY_DEF("digamma", 989, "\xcf\x9d"), + ENTITY_DEF("Hat", 94, "\x5e"), + ENTITY_DEF("lates", 10925, "\xe2\xaa\xad\xef\xb8\x80"), + ENTITY_DEF("lgE", 10897, "\xe2\xaa\x91"), + ENTITY_DEF("commat", 64, "\x40"), + ENTITY_DEF("NotPrecedesSlantEqual", 8928, "\xe2\x8b\xa0"), + ENTITY_DEF("phone", 9742, "\xe2\x98\x8e"), + ENTITY_DEF("Ecirc", 202, "\xc3\x8a"), + ENTITY_DEF_HEUR("lt", 60, "\x3c"), + ENTITY_DEF("intcal", 8890, "\xe2\x8a\xba"), + ENTITY_DEF("xdtri", 9661, "\xe2\x96\xbd"), + ENTITY_DEF("Abreve", 258, "\xc4\x82"), + ENTITY_DEF("gopf", 120152, "\xf0\x9d\x95\x98"), + ENTITY_DEF("Xopf", 120143, "\xf0\x9d\x95\x8f"), + ENTITY_DEF("Iacute", 205, "\xc3\x8d"), + ENTITY_DEF("Aopf", 120120, "\xf0\x9d\x94\xb8"), + ENTITY_DEF("gbreve", 287, "\xc4\x9f"), + ENTITY_DEF("nleq", 8816, "\xe2\x89\xb0"), + ENTITY_DEF("xopf", 120169, "\xf0\x9d\x95\xa9"), + ENTITY_DEF("SquareSupersetEqual", 8850, "\xe2\x8a\x92"), + ENTITY_DEF("NotLessTilde", 8820, "\xe2\x89\xb4"), + ENTITY_DEF("SubsetEqual", 8838, "\xe2\x8a\x86"), + ENTITY_DEF("Sc", 10940, "\xe2\xaa\xbc"), + ENTITY_DEF("sdote", 10854, "\xe2\xa9\xa6"), + ENTITY_DEF("loplus", 10797, "\xe2\xa8\xad"), + ENTITY_DEF("zfr", 120119, "\xf0\x9d\x94\xb7"), + ENTITY_DEF("subseteqq", 10949, "\xe2\xab\x85"), + ENTITY_DEF("Vdashl", 10982, "\xe2\xab\xa6"), + ENTITY_DEF("integers", 8484, "\xe2\x84\xa4"), + ENTITY_DEF("Umacr", 362, "\xc5\xaa"), + ENTITY_DEF("dopf", 120149, "\xf0\x9d\x95\x95"), + ENTITY_DEF("RightDownVectorBar", 10581, "\xe2\xa5\x95"), + ENTITY_DEF("angmsdaf", 10669, "\xe2\xa6\xad"), + ENTITY_DEF("Jfr", 120077, "\xf0\x9d\x94\x8d"), + ENTITY_DEF("bernou", 8492, "\xe2\x84\xac"), + ENTITY_DEF("lceil", 8968, "\xe2\x8c\x88"), + ENTITY_DEF("nvsim", 8764, "\xe2\x88\xbc\xe2\x83\x92"), + ENTITY_DEF("NotSucceedsSlantEqual", 8929, "\xe2\x8b\xa1"), + ENTITY_DEF("hearts", 9829, "\xe2\x99\xa5"), + 
ENTITY_DEF("vee", 8744, "\xe2\x88\xa8"), + ENTITY_DEF("LJcy", 1033, "\xd0\x89"), + ENTITY_DEF("nlt", 8814, "\xe2\x89\xae"), + ENTITY_DEF("because", 8757, "\xe2\x88\xb5"), + ENTITY_DEF("hairsp", 8202, "\xe2\x80\x8a"), + ENTITY_DEF("comma", 44, "\x2c"), + ENTITY_DEF("iecy", 1077, "\xd0\xb5"), + ENTITY_DEF("npre", 10927, "\xe2\xaa\xaf\xcc\xb8"), + ENTITY_DEF("NotSquareSubset", 8847, "\xe2\x8a\x8f\xcc\xb8"), + ENTITY_DEF("mscr", 120002, "\xf0\x9d\x93\x82"), + ENTITY_DEF("jopf", 120155, "\xf0\x9d\x95\x9b"), + ENTITY_DEF("bumpE", 10926, "\xe2\xaa\xae"), + ENTITY_DEF("thicksim", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("Nfr", 120081, "\xf0\x9d\x94\x91"), + ENTITY_DEF("yucy", 1102, "\xd1\x8e"), + ENTITY_DEF("notinvc", 8950, "\xe2\x8b\xb6"), + ENTITY_DEF("lstrok", 322, "\xc5\x82"), + ENTITY_DEF("robrk", 10215, "\xe2\x9f\xa7"), + ENTITY_DEF("LeftTriangleBar", 10703, "\xe2\xa7\x8f"), + ENTITY_DEF("hksearow", 10533, "\xe2\xa4\xa5"), + ENTITY_DEF("bigcap", 8898, "\xe2\x8b\x82"), + ENTITY_DEF("udhar", 10606, "\xe2\xa5\xae"), + ENTITY_DEF("Yscr", 119988, "\xf0\x9d\x92\xb4"), + ENTITY_DEF("smeparsl", 10724, "\xe2\xa7\xa4"), + ENTITY_DEF("NotLess", 8814, "\xe2\x89\xae"), + ENTITY_DEF("dcaron", 271, "\xc4\x8f"), + ENTITY_DEF("ange", 10660, "\xe2\xa6\xa4"), + ENTITY_DEF("dHar", 10597, "\xe2\xa5\xa5"), + ENTITY_DEF("UpperRightArrow", 8599, "\xe2\x86\x97"), + ENTITY_DEF("trpezium", 9186, "\xe2\x8f\xa2"), + ENTITY_DEF("boxminus", 8863, "\xe2\x8a\x9f"), + ENTITY_DEF("notni", 8716, "\xe2\x88\x8c"), + ENTITY_DEF("dtrif", 9662, "\xe2\x96\xbe"), + ENTITY_DEF("nhArr", 8654, "\xe2\x87\x8e"), + ENTITY_DEF("larrpl", 10553, "\xe2\xa4\xb9"), + ENTITY_DEF("simeq", 8771, "\xe2\x89\x83"), + ENTITY_DEF("geqslant", 10878, "\xe2\xa9\xbe"), + ENTITY_DEF("RightUpVectorBar", 10580, "\xe2\xa5\x94"), + ENTITY_DEF("nsc", 8833, "\xe2\x8a\x81"), + ENTITY_DEF("div", 247, "\xc3\xb7"), + ENTITY_DEF("orslope", 10839, "\xe2\xa9\x97"), + ENTITY_DEF("lparlt", 10643, "\xe2\xa6\x93"), + ENTITY_DEF("trie", 8796, 
"\xe2\x89\x9c"), + ENTITY_DEF("cirmid", 10991, "\xe2\xab\xaf"), + ENTITY_DEF("wp", 8472, "\xe2\x84\x98"), + ENTITY_DEF("dagger", 8224, "\xe2\x80\xa0"), + ENTITY_DEF("utri", 9653, "\xe2\x96\xb5"), + ENTITY_DEF("supnE", 10956, "\xe2\xab\x8c"), + ENTITY_DEF("eg", 10906, "\xe2\xaa\x9a"), + ENTITY_DEF("LeftDownVector", 8643, "\xe2\x87\x83"), + ENTITY_DEF("NotLessEqual", 8816, "\xe2\x89\xb0"), + ENTITY_DEF("Bopf", 120121, "\xf0\x9d\x94\xb9"), + ENTITY_DEF("LongLeftRightArrow", 10231, "\xe2\x9f\xb7"), + ENTITY_DEF("Gfr", 120074, "\xf0\x9d\x94\x8a"), + ENTITY_DEF("sqsubseteq", 8849, "\xe2\x8a\x91"), + ENTITY_DEF_HEUR("ograve", 242, "\xc3\xb2"), + ENTITY_DEF("larrhk", 8617, "\xe2\x86\xa9"), + ENTITY_DEF("sigma", 963, "\xcf\x83"), + ENTITY_DEF("NotSquareSupersetEqual", 8931, "\xe2\x8b\xa3"), + ENTITY_DEF("gvnE", 8809, "\xe2\x89\xa9\xef\xb8\x80"), + ENTITY_DEF("timesbar", 10801, "\xe2\xa8\xb1"), + ENTITY_DEF("Iukcy", 1030, "\xd0\x86"), + ENTITY_DEF("bscr", 119991, "\xf0\x9d\x92\xb7"), + ENTITY_DEF("Exists", 8707, "\xe2\x88\x83"), + ENTITY_DEF("tscr", 120009, "\xf0\x9d\x93\x89"), + ENTITY_DEF("tcy", 1090, "\xd1\x82"), + ENTITY_DEF("nwarr", 8598, "\xe2\x86\x96"), + ENTITY_DEF("hoarr", 8703, "\xe2\x87\xbf"), + ENTITY_DEF("lnapprox", 10889, "\xe2\xaa\x89"), + ENTITY_DEF("nu", 957, "\xce\xbd"), + ENTITY_DEF("bcy", 1073, "\xd0\xb1"), + ENTITY_DEF("ndash", 8211, "\xe2\x80\x93"), + ENTITY_DEF("smt", 10922, "\xe2\xaa\xaa"), + ENTITY_DEF("scaron", 353, "\xc5\xa1"), + ENTITY_DEF("IOcy", 1025, "\xd0\x81"), + ENTITY_DEF("Ifr", 8465, "\xe2\x84\x91"), + ENTITY_DEF("cularrp", 10557, "\xe2\xa4\xbd"), + ENTITY_DEF("lvertneqq", 8808, "\xe2\x89\xa8\xef\xb8\x80"), + ENTITY_DEF("nlarr", 8602, "\xe2\x86\x9a"), + ENTITY_DEF("colon", 58, "\x3a"), + ENTITY_DEF("ddotseq", 10871, "\xe2\xa9\xb7"), + ENTITY_DEF("zacute", 378, "\xc5\xba"), + ENTITY_DEF("DoubleVerticalBar", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("larrfs", 10525, "\xe2\xa4\x9d"), + ENTITY_DEF("NotExists", 8708, "\xe2\x88\x84"), + 
ENTITY_DEF("geq", 8805, "\xe2\x89\xa5"), + ENTITY_DEF("Ffr", 120073, "\xf0\x9d\x94\x89"), + ENTITY_DEF_HEUR("divide", 247, "\xc3\xb7"), + ENTITY_DEF("blank", 9251, "\xe2\x90\xa3"), + ENTITY_DEF("IEcy", 1045, "\xd0\x95"), + ENTITY_DEF_HEUR("ordm", 186, "\xc2\xba"), + ENTITY_DEF("fopf", 120151, "\xf0\x9d\x95\x97"), + ENTITY_DEF("ecir", 8790, "\xe2\x89\x96"), + ENTITY_DEF("complement", 8705, "\xe2\x88\x81"), + ENTITY_DEF("top", 8868, "\xe2\x8a\xa4"), + ENTITY_DEF("DoubleContourIntegral", 8751, "\xe2\x88\xaf"), + ENTITY_DEF("nisd", 8954, "\xe2\x8b\xba"), + ENTITY_DEF("bcong", 8780, "\xe2\x89\x8c"), + ENTITY_DEF("plusdu", 10789, "\xe2\xa8\xa5"), + ENTITY_DEF("TildeTilde", 8776, "\xe2\x89\x88"), + ENTITY_DEF("lnE", 8808, "\xe2\x89\xa8"), + ENTITY_DEF("DoubleLongRightArrow", 10233, "\xe2\x9f\xb9"), + ENTITY_DEF("nsubseteqq", 10949, "\xe2\xab\x85\xcc\xb8"), + ENTITY_DEF("DownTeeArrow", 8615, "\xe2\x86\xa7"), + ENTITY_DEF("Cscr", 119966, "\xf0\x9d\x92\x9e"), + ENTITY_DEF("NegativeVeryThinSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("emsp", 8195, "\xe2\x80\x83"), + ENTITY_DEF("vartriangleleft", 8882, "\xe2\x8a\xb2"), + ENTITY_DEF("ropar", 10630, "\xe2\xa6\x86"), + ENTITY_DEF("checkmark", 10003, "\xe2\x9c\x93"), + ENTITY_DEF("Ycy", 1067, "\xd0\xab"), + ENTITY_DEF("supset", 8835, "\xe2\x8a\x83"), + ENTITY_DEF("gneqq", 8809, "\xe2\x89\xa9"), + ENTITY_DEF("Lstrok", 321, "\xc5\x81"), + ENTITY_DEF_HEUR("AMP", 38, "\x26"), + ENTITY_DEF("acE", 8766, "\xe2\x88\xbe\xcc\xb3"), + ENTITY_DEF("sqsupseteq", 8850, "\xe2\x8a\x92"), + ENTITY_DEF("nle", 8816, "\xe2\x89\xb0"), + ENTITY_DEF("nesear", 10536, "\xe2\xa4\xa8"), + ENTITY_DEF("LeftDownVectorBar", 10585, "\xe2\xa5\x99"), + ENTITY_DEF("Integral", 8747, "\xe2\x88\xab"), + ENTITY_DEF("Beta", 914, "\xce\x92"), + ENTITY_DEF("nvdash", 8876, "\xe2\x8a\xac"), + ENTITY_DEF("nges", 10878, "\xe2\xa9\xbe\xcc\xb8"), + ENTITY_DEF("demptyv", 10673, "\xe2\xa6\xb1"), + ENTITY_DEF("eta", 951, "\xce\xb7"), + ENTITY_DEF("GreaterSlantEqual", 10878, 
"\xe2\xa9\xbe"), + ENTITY_DEF_HEUR("ccedil", 231, "\xc3\xa7"), + ENTITY_DEF("pfr", 120109, "\xf0\x9d\x94\xad"), + ENTITY_DEF("bbrktbrk", 9142, "\xe2\x8e\xb6"), + ENTITY_DEF("mcy", 1084, "\xd0\xbc"), + ENTITY_DEF("Not", 10988, "\xe2\xab\xac"), + ENTITY_DEF("qscr", 120006, "\xf0\x9d\x93\x86"), + ENTITY_DEF("zwj", 8205, "\xe2\x80\x8d"), + ENTITY_DEF("ntrianglerighteq", 8941, "\xe2\x8b\xad"), + ENTITY_DEF("permil", 8240, "\xe2\x80\xb0"), + ENTITY_DEF("squarf", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("apos", 39, "\x27"), + ENTITY_DEF("lrm", 8206, "\xe2\x80\x8e"), + ENTITY_DEF("male", 9794, "\xe2\x99\x82"), + ENTITY_DEF_HEUR("agrave", 224, "\xc3\xa0"), + ENTITY_DEF("Lt", 8810, "\xe2\x89\xaa"), + ENTITY_DEF("capand", 10820, "\xe2\xa9\x84"), + ENTITY_DEF_HEUR("aring", 229, "\xc3\xa5"), + ENTITY_DEF("Jukcy", 1028, "\xd0\x84"), + ENTITY_DEF("bumpe", 8783, "\xe2\x89\x8f"), + ENTITY_DEF("dd", 8518, "\xe2\x85\x86"), + ENTITY_DEF("tscy", 1094, "\xd1\x86"), + ENTITY_DEF("oS", 9416, "\xe2\x93\x88"), + ENTITY_DEF("succeq", 10928, "\xe2\xaa\xb0"), + ENTITY_DEF("xharr", 10231, "\xe2\x9f\xb7"), + ENTITY_DEF("pluse", 10866, "\xe2\xa9\xb2"), + ENTITY_DEF("rfisht", 10621, "\xe2\xa5\xbd"), + ENTITY_DEF("HorizontalLine", 9472, "\xe2\x94\x80"), + ENTITY_DEF("DiacriticalAcute", 180, "\xc2\xb4"), + ENTITY_DEF("hfr", 120101, "\xf0\x9d\x94\xa5"), + ENTITY_DEF("preceq", 10927, "\xe2\xaa\xaf"), + ENTITY_DEF("rationals", 8474, "\xe2\x84\x9a"), + ENTITY_DEF_HEUR("Auml", 196, "\xc3\x84"), + ENTITY_DEF("LeftRightArrow", 8596, "\xe2\x86\x94"), + ENTITY_DEF("blacktriangleright", 9656, "\xe2\x96\xb8"), + ENTITY_DEF("dharr", 8642, "\xe2\x87\x82"), + ENTITY_DEF("isin", 8712, "\xe2\x88\x88"), + ENTITY_DEF("ldrushar", 10571, "\xe2\xa5\x8b"), + ENTITY_DEF("squ", 9633, "\xe2\x96\xa1"), + ENTITY_DEF("rbrksld", 10638, "\xe2\xa6\x8e"), + ENTITY_DEF("bigwedge", 8896, "\xe2\x8b\x80"), + ENTITY_DEF("swArr", 8665, "\xe2\x87\x99"), + ENTITY_DEF("IJlig", 306, "\xc4\xb2"), + ENTITY_DEF("harr", 8596, "\xe2\x86\x94"), + 
ENTITY_DEF("range", 10661, "\xe2\xa6\xa5"), + ENTITY_DEF("urtri", 9721, "\xe2\x97\xb9"), + ENTITY_DEF("NotVerticalBar", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("ic", 8291, "\xe2\x81\xa3"), + ENTITY_DEF("solbar", 9023, "\xe2\x8c\xbf"), + ENTITY_DEF("approx", 8776, "\xe2\x89\x88"), + ENTITY_DEF("SquareSuperset", 8848, "\xe2\x8a\x90"), + ENTITY_DEF("numsp", 8199, "\xe2\x80\x87"), + ENTITY_DEF("nLt", 8810, "\xe2\x89\xaa\xe2\x83\x92"), + ENTITY_DEF("tilde", 732, "\xcb\x9c"), + ENTITY_DEF("rlarr", 8644, "\xe2\x87\x84"), + ENTITY_DEF("langle", 10216, "\xe2\x9f\xa8"), + ENTITY_DEF("nleqslant", 10877, "\xe2\xa9\xbd\xcc\xb8"), + ENTITY_DEF("Nacute", 323, "\xc5\x83"), + ENTITY_DEF("NotLeftTriangle", 8938, "\xe2\x8b\xaa"), + ENTITY_DEF("sopf", 120164, "\xf0\x9d\x95\xa4"), + ENTITY_DEF("xmap", 10236, "\xe2\x9f\xbc"), + ENTITY_DEF("supne", 8843, "\xe2\x8a\x8b"), + ENTITY_DEF("Int", 8748, "\xe2\x88\xac"), + ENTITY_DEF("nsupseteqq", 10950, "\xe2\xab\x86\xcc\xb8"), + ENTITY_DEF("circlearrowright", 8635, "\xe2\x86\xbb"), + ENTITY_DEF("NotCongruent", 8802, "\xe2\x89\xa2"), + ENTITY_DEF("Scedil", 350, "\xc5\x9e"), + ENTITY_DEF_HEUR("raquo", 187, "\xc2\xbb"), + ENTITY_DEF("ycy", 1099, "\xd1\x8b"), + ENTITY_DEF("notinvb", 8951, "\xe2\x8b\xb7"), + ENTITY_DEF("andv", 10842, "\xe2\xa9\x9a"), + ENTITY_DEF("nap", 8777, "\xe2\x89\x89"), + ENTITY_DEF("shcy", 1096, "\xd1\x88"), + ENTITY_DEF("ssetmn", 8726, "\xe2\x88\x96"), + ENTITY_DEF("downarrow", 8595, "\xe2\x86\x93"), + ENTITY_DEF("gesdotol", 10884, "\xe2\xaa\x84"), + ENTITY_DEF("Congruent", 8801, "\xe2\x89\xa1"), + ENTITY_DEF_HEUR("pound", 163, "\xc2\xa3"), + ENTITY_DEF("ZeroWidthSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("rdca", 10551, "\xe2\xa4\xb7"), + ENTITY_DEF("rmoust", 9137, "\xe2\x8e\xb1"), + ENTITY_DEF("zcy", 1079, "\xd0\xb7"), + ENTITY_DEF("Square", 9633, "\xe2\x96\xa1"), + ENTITY_DEF("subE", 10949, "\xe2\xab\x85"), + ENTITY_DEF("infintie", 10717, "\xe2\xa7\x9d"), + ENTITY_DEF("Cayleys", 8493, "\xe2\x84\xad"), + ENTITY_DEF("lsaquo", 
8249, "\xe2\x80\xb9"), + ENTITY_DEF("realpart", 8476, "\xe2\x84\x9c"), + ENTITY_DEF("nprec", 8832, "\xe2\x8a\x80"), + ENTITY_DEF("RightTriangleBar", 10704, "\xe2\xa7\x90"), + ENTITY_DEF("Kopf", 120130, "\xf0\x9d\x95\x82"), + ENTITY_DEF("Ubreve", 364, "\xc5\xac"), + ENTITY_DEF("Uopf", 120140, "\xf0\x9d\x95\x8c"), + ENTITY_DEF("trianglelefteq", 8884, "\xe2\x8a\xb4"), + ENTITY_DEF("rotimes", 10805, "\xe2\xa8\xb5"), + ENTITY_DEF("qfr", 120110, "\xf0\x9d\x94\xae"), + ENTITY_DEF("gtcc", 10919, "\xe2\xaa\xa7"), + ENTITY_DEF("fnof", 402, "\xc6\x92"), + ENTITY_DEF("tritime", 10811, "\xe2\xa8\xbb"), + ENTITY_DEF("andslope", 10840, "\xe2\xa9\x98"), + ENTITY_DEF("harrw", 8621, "\xe2\x86\xad"), + ENTITY_DEF("NotSquareSuperset", 8848, "\xe2\x8a\x90\xcc\xb8"), + ENTITY_DEF("Amacr", 256, "\xc4\x80"), + ENTITY_DEF("OpenCurlyDoubleQuote", 8220, "\xe2\x80\x9c"), + ENTITY_DEF_HEUR("thorn", 254, "\xc3\xbe"), + ENTITY_DEF_HEUR("ordf", 170, "\xc2\xaa"), + ENTITY_DEF("natur", 9838, "\xe2\x99\xae"), + ENTITY_DEF("xi", 958, "\xce\xbe"), + ENTITY_DEF("infin", 8734, "\xe2\x88\x9e"), + ENTITY_DEF("nspar", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("Jcy", 1049, "\xd0\x99"), + ENTITY_DEF("DownLeftTeeVector", 10590, "\xe2\xa5\x9e"), + ENTITY_DEF("rbarr", 10509, "\xe2\xa4\x8d"), + ENTITY_DEF("Xi", 926, "\xce\x9e"), + ENTITY_DEF("bull", 8226, "\xe2\x80\xa2"), + ENTITY_DEF("cuesc", 8927, "\xe2\x8b\x9f"), + ENTITY_DEF("backcong", 8780, "\xe2\x89\x8c"), + ENTITY_DEF("frac35", 8535, "\xe2\x85\x97"), + ENTITY_DEF("hscr", 119997, "\xf0\x9d\x92\xbd"), + ENTITY_DEF("LessEqualGreater", 8922, "\xe2\x8b\x9a"), + ENTITY_DEF("Implies", 8658, "\xe2\x87\x92"), + ENTITY_DEF("ETH", 208, "\xc3\x90"), + ENTITY_DEF_HEUR("Yacute", 221, "\xc3\x9d"), + ENTITY_DEF_HEUR("shy", 173, "\xc2\xad"), + ENTITY_DEF("Rarrtl", 10518, "\xe2\xa4\x96"), + ENTITY_DEF_HEUR("sup1", 185, "\xc2\xb9"), + ENTITY_DEF("reals", 8477, "\xe2\x84\x9d"), + ENTITY_DEF("blacklozenge", 10731, "\xe2\xa7\xab"), + ENTITY_DEF("ncedil", 326, "\xc5\x86"), + 
ENTITY_DEF("Lambda", 923, "\xce\x9b"), + ENTITY_DEF("uopf", 120166, "\xf0\x9d\x95\xa6"), + ENTITY_DEF("bigodot", 10752, "\xe2\xa8\x80"), + ENTITY_DEF("ubreve", 365, "\xc5\xad"), + ENTITY_DEF("drbkarow", 10512, "\xe2\xa4\x90"), + ENTITY_DEF("els", 10901, "\xe2\xaa\x95"), + ENTITY_DEF("shortparallel", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("Pcy", 1055, "\xd0\x9f"), + ENTITY_DEF("dsol", 10742, "\xe2\xa7\xb6"), + ENTITY_DEF("supsim", 10952, "\xe2\xab\x88"), + ENTITY_DEF("Longrightarrow", 10233, "\xe2\x9f\xb9"), + ENTITY_DEF("ThickSpace", 8287, "\xe2\x81\x9f\xe2\x80\x8a"), + ENTITY_DEF("Itilde", 296, "\xc4\xa8"), + ENTITY_DEF("nparallel", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("And", 10835, "\xe2\xa9\x93"), + ENTITY_DEF("boxhd", 9516, "\xe2\x94\xac"), + ENTITY_DEF("Dashv", 10980, "\xe2\xab\xa4"), + ENTITY_DEF("NotSuperset", 8835, "\xe2\x8a\x83\xe2\x83\x92"), + ENTITY_DEF("Eta", 919, "\xce\x97"), + ENTITY_DEF("Qopf", 8474, "\xe2\x84\x9a"), + ENTITY_DEF("period", 46, "\x2e"), + ENTITY_DEF("angmsd", 8737, "\xe2\x88\xa1"), + ENTITY_DEF("fllig", 64258, "\xef\xac\x82"), + ENTITY_DEF("cuvee", 8910, "\xe2\x8b\x8e"), + ENTITY_DEF("wedbar", 10847, "\xe2\xa9\x9f"), + ENTITY_DEF("Fscr", 8497, "\xe2\x84\xb1"), + ENTITY_DEF("veebar", 8891, "\xe2\x8a\xbb"), + ENTITY_DEF("Longleftrightarrow", 10234, "\xe2\x9f\xba"), + ENTITY_DEF_HEUR("reg", 174, "\xc2\xae"), + ENTITY_DEF("NegativeMediumSpace", 8203, "\xe2\x80\x8b"), + ENTITY_DEF("Upsi", 978, "\xcf\x92"), + ENTITY_DEF("Mellintrf", 8499, "\xe2\x84\xb3"), + ENTITY_DEF("boxHU", 9577, "\xe2\x95\xa9"), + ENTITY_DEF("frac56", 8538, "\xe2\x85\x9a"), + ENTITY_DEF("utrif", 9652, "\xe2\x96\xb4"), + ENTITY_DEF("LeftTriangle", 8882, "\xe2\x8a\xb2"), + ENTITY_DEF("nsime", 8772, "\xe2\x89\x84"), + ENTITY_DEF("rcedil", 343, "\xc5\x97"), + ENTITY_DEF("aogon", 261, "\xc4\x85"), + ENTITY_DEF("uHar", 10595, "\xe2\xa5\xa3"), + ENTITY_DEF("ForAll", 8704, "\xe2\x88\x80"), + ENTITY_DEF("prE", 10931, "\xe2\xaa\xb3"), + ENTITY_DEF("boxV", 9553, "\xe2\x95\x91"), + 
ENTITY_DEF("softcy", 1100, "\xd1\x8c"), + ENTITY_DEF("hercon", 8889, "\xe2\x8a\xb9"), + ENTITY_DEF("lmoustache", 9136, "\xe2\x8e\xb0"), + ENTITY_DEF("Product", 8719, "\xe2\x88\x8f"), + ENTITY_DEF("lsimg", 10895, "\xe2\xaa\x8f"), + ENTITY_DEF("verbar", 124, "\x7c"), + ENTITY_DEF("ofcir", 10687, "\xe2\xa6\xbf"), + ENTITY_DEF("curlyeqprec", 8926, "\xe2\x8b\x9e"), + ENTITY_DEF("ldquo", 8220, "\xe2\x80\x9c"), + ENTITY_DEF("bot", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("Psi", 936, "\xce\xa8"), + ENTITY_DEF("OElig", 338, "\xc5\x92"), + ENTITY_DEF("DownRightVectorBar", 10583, "\xe2\xa5\x97"), + ENTITY_DEF("minusb", 8863, "\xe2\x8a\x9f"), + ENTITY_DEF("Iscr", 8464, "\xe2\x84\x90"), + ENTITY_DEF("Tcedil", 354, "\xc5\xa2"), + ENTITY_DEF("ffilig", 64259, "\xef\xac\x83"), + ENTITY_DEF("Gcy", 1043, "\xd0\x93"), + ENTITY_DEF("oline", 8254, "\xe2\x80\xbe"), + ENTITY_DEF("bottom", 8869, "\xe2\x8a\xa5"), + ENTITY_DEF("nVDash", 8879, "\xe2\x8a\xaf"), + ENTITY_DEF("lessdot", 8918, "\xe2\x8b\x96"), + ENTITY_DEF("cups", 8746, "\xe2\x88\xaa\xef\xb8\x80"), + ENTITY_DEF("gla", 10917, "\xe2\xaa\xa5"), + ENTITY_DEF("hellip", 8230, "\xe2\x80\xa6"), + ENTITY_DEF("hookleftarrow", 8617, "\xe2\x86\xa9"), + ENTITY_DEF("Cup", 8915, "\xe2\x8b\x93"), + ENTITY_DEF("upsi", 965, "\xcf\x85"), + ENTITY_DEF("DownArrowBar", 10515, "\xe2\xa4\x93"), + ENTITY_DEF("lowast", 8727, "\xe2\x88\x97"), + ENTITY_DEF("profline", 8978, "\xe2\x8c\x92"), + ENTITY_DEF("ngsim", 8821, "\xe2\x89\xb5"), + ENTITY_DEF("boxhu", 9524, "\xe2\x94\xb4"), + ENTITY_DEF("operp", 10681, "\xe2\xa6\xb9"), + ENTITY_DEF("cap", 8745, "\xe2\x88\xa9"), + ENTITY_DEF("Hcirc", 292, "\xc4\xa4"), + ENTITY_DEF("Ncy", 1053, "\xd0\x9d"), + ENTITY_DEF("zeetrf", 8488, "\xe2\x84\xa8"), + ENTITY_DEF("cuepr", 8926, "\xe2\x8b\x9e"), + ENTITY_DEF("supsetneq", 8843, "\xe2\x8a\x8b"), + ENTITY_DEF("lfloor", 8970, "\xe2\x8c\x8a"), + ENTITY_DEF("ngtr", 8815, "\xe2\x89\xaf"), + ENTITY_DEF("ccups", 10828, "\xe2\xa9\x8c"), + ENTITY_DEF("pscr", 120005, 
"\xf0\x9d\x93\x85"), + ENTITY_DEF("Cfr", 8493, "\xe2\x84\xad"), + ENTITY_DEF("dtri", 9663, "\xe2\x96\xbf"), + ENTITY_DEF("icirc", 238, "\xc3\xae"), + ENTITY_DEF("leftarrow", 8592, "\xe2\x86\x90"), + ENTITY_DEF("vdash", 8866, "\xe2\x8a\xa2"), + ENTITY_DEF("leftrightharpoons", 8651, "\xe2\x87\x8b"), + ENTITY_DEF("rightrightarrows", 8649, "\xe2\x87\x89"), + ENTITY_DEF("strns", 175, "\xc2\xaf"), + ENTITY_DEF("intlarhk", 10775, "\xe2\xa8\x97"), + ENTITY_DEF("downharpoonright", 8642, "\xe2\x87\x82"), + ENTITY_DEF_HEUR("yacute", 253, "\xc3\xbd"), + ENTITY_DEF("boxUr", 9561, "\xe2\x95\x99"), + ENTITY_DEF("triangleleft", 9667, "\xe2\x97\x83"), + ENTITY_DEF("DiacriticalDot", 729, "\xcb\x99"), + ENTITY_DEF("thetav", 977, "\xcf\x91"), + ENTITY_DEF("OverBracket", 9140, "\xe2\x8e\xb4"), + ENTITY_DEF("PrecedesTilde", 8830, "\xe2\x89\xbe"), + ENTITY_DEF("rtrie", 8885, "\xe2\x8a\xb5"), + ENTITY_DEF("Scirc", 348, "\xc5\x9c"), + ENTITY_DEF("vsupne", 8843, "\xe2\x8a\x8b\xef\xb8\x80"), + ENTITY_DEF("OverBrace", 9182, "\xe2\x8f\x9e"), + ENTITY_DEF("Yfr", 120092, "\xf0\x9d\x94\x9c"), + ENTITY_DEF("scnE", 10934, "\xe2\xaa\xb6"), + ENTITY_DEF("simlE", 10911, "\xe2\xaa\x9f"), + ENTITY_DEF("Proportional", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("edot", 279, "\xc4\x97"), + ENTITY_DEF("loang", 10220, "\xe2\x9f\xac"), + ENTITY_DEF("gesdot", 10880, "\xe2\xaa\x80"), + ENTITY_DEF("DownBreve", 785, "\xcc\x91"), + ENTITY_DEF("pcy", 1087, "\xd0\xbf"), + ENTITY_DEF("Succeeds", 8827, "\xe2\x89\xbb"), + ENTITY_DEF("mfr", 120106, "\xf0\x9d\x94\xaa"), + ENTITY_DEF("Leftarrow", 8656, "\xe2\x87\x90"), + ENTITY_DEF("boxDr", 9555, "\xe2\x95\x93"), + ENTITY_DEF("Nscr", 119977, "\xf0\x9d\x92\xa9"), + ENTITY_DEF("diam", 8900, "\xe2\x8b\x84"), + ENTITY_DEF("CHcy", 1063, "\xd0\xa7"), + ENTITY_DEF("boxdr", 9484, "\xe2\x94\x8c"), + ENTITY_DEF("rlm", 8207, "\xe2\x80\x8f"), + ENTITY_DEF("Coproduct", 8720, "\xe2\x88\x90"), + ENTITY_DEF("RightTeeArrow", 8614, "\xe2\x86\xa6"), + ENTITY_DEF("tridot", 9708, "\xe2\x97\xac"), + 
ENTITY_DEF("ldquor", 8222, "\xe2\x80\x9e"), + ENTITY_DEF("sol", 47, "\x2f"), + ENTITY_DEF_HEUR("ecirc", 234, "\xc3\xaa"), + ENTITY_DEF("DoubleLeftArrow", 8656, "\xe2\x87\x90"), + ENTITY_DEF("Gscr", 119970, "\xf0\x9d\x92\xa2"), + ENTITY_DEF("ap", 8776, "\xe2\x89\x88"), + ENTITY_DEF("rbrke", 10636, "\xe2\xa6\x8c"), + ENTITY_DEF("LeftFloor", 8970, "\xe2\x8c\x8a"), + ENTITY_DEF("blk12", 9618, "\xe2\x96\x92"), + ENTITY_DEF("Conint", 8751, "\xe2\x88\xaf"), + ENTITY_DEF("triangledown", 9663, "\xe2\x96\xbf"), + ENTITY_DEF("Icy", 1048, "\xd0\x98"), + ENTITY_DEF("backprime", 8245, "\xe2\x80\xb5"), + ENTITY_DEF("longleftrightarrow", 10231, "\xe2\x9f\xb7"), + ENTITY_DEF("ntriangleleft", 8938, "\xe2\x8b\xaa"), + ENTITY_DEF_HEUR("copy", 169, "\xc2\xa9"), + ENTITY_DEF("mapstodown", 8615, "\xe2\x86\xa7"), + ENTITY_DEF("seArr", 8664, "\xe2\x87\x98"), + ENTITY_DEF("ENG", 330, "\xc5\x8a"), + ENTITY_DEF("DoubleRightArrow", 8658, "\xe2\x87\x92"), + ENTITY_DEF("tfr", 120113, "\xf0\x9d\x94\xb1"), + ENTITY_DEF("rharul", 10604, "\xe2\xa5\xac"), + ENTITY_DEF("bfr", 120095, "\xf0\x9d\x94\x9f"), + ENTITY_DEF("origof", 8886, "\xe2\x8a\xb6"), + ENTITY_DEF("Therefore", 8756, "\xe2\x88\xb4"), + ENTITY_DEF("glE", 10898, "\xe2\xaa\x92"), + ENTITY_DEF("leftarrowtail", 8610, "\xe2\x86\xa2"), + ENTITY_DEF("NotEqual", 8800, "\xe2\x89\xa0"), + ENTITY_DEF("LeftCeiling", 8968, "\xe2\x8c\x88"), + ENTITY_DEF("lArr", 8656, "\xe2\x87\x90"), + ENTITY_DEF("subseteq", 8838, "\xe2\x8a\x86"), + ENTITY_DEF("larrbfs", 10527, "\xe2\xa4\x9f"), + ENTITY_DEF("Gammad", 988, "\xcf\x9c"), + ENTITY_DEF("rtriltri", 10702, "\xe2\xa7\x8e"), + ENTITY_DEF("Fcy", 1060, "\xd0\xa4"), + ENTITY_DEF("Vopf", 120141, "\xf0\x9d\x95\x8d"), + ENTITY_DEF("lrarr", 8646, "\xe2\x87\x86"), + ENTITY_DEF("delta", 948, "\xce\xb4"), + ENTITY_DEF("xodot", 10752, "\xe2\xa8\x80"), + ENTITY_DEF("larrtl", 8610, "\xe2\x86\xa2"), + ENTITY_DEF("gsim", 8819, "\xe2\x89\xb3"), + ENTITY_DEF("ratail", 10522, "\xe2\xa4\x9a"), + ENTITY_DEF("vsubne", 8842, 
"\xe2\x8a\x8a\xef\xb8\x80"), + ENTITY_DEF("boxur", 9492, "\xe2\x94\x94"), + ENTITY_DEF("succsim", 8831, "\xe2\x89\xbf"), + ENTITY_DEF("triplus", 10809, "\xe2\xa8\xb9"), + ENTITY_DEF("nless", 8814, "\xe2\x89\xae"), + ENTITY_DEF("uharr", 8638, "\xe2\x86\xbe"), + ENTITY_DEF("lambda", 955, "\xce\xbb"), + ENTITY_DEF_HEUR("uuml", 252, "\xc3\xbc"), + ENTITY_DEF("horbar", 8213, "\xe2\x80\x95"), + ENTITY_DEF("ccirc", 265, "\xc4\x89"), + ENTITY_DEF("sqcup", 8852, "\xe2\x8a\x94"), + ENTITY_DEF("Pscr", 119979, "\xf0\x9d\x92\xab"), + ENTITY_DEF("supsup", 10966, "\xe2\xab\x96"), + ENTITY_DEF("Cacute", 262, "\xc4\x86"), + ENTITY_DEF("upsih", 978, "\xcf\x92"), + ENTITY_DEF("precsim", 8830, "\xe2\x89\xbe"), + ENTITY_DEF("longrightarrow", 10230, "\xe2\x9f\xb6"), + ENTITY_DEF("circledR", 174, "\xc2\xae"), + ENTITY_DEF("UpTeeArrow", 8613, "\xe2\x86\xa5"), + ENTITY_DEF("bepsi", 1014, "\xcf\xb6"), + ENTITY_DEF("oast", 8859, "\xe2\x8a\x9b"), + ENTITY_DEF("yfr", 120118, "\xf0\x9d\x94\xb6"), + ENTITY_DEF("rdsh", 8627, "\xe2\x86\xb3"), + ENTITY_DEF("Ograve", 210, "\xc3\x92"), + ENTITY_DEF("LeftVectorBar", 10578, "\xe2\xa5\x92"), + ENTITY_DEF("NotNestedLessLess", 10913, "\xe2\xaa\xa1\xcc\xb8"), + ENTITY_DEF("Jscr", 119973, "\xf0\x9d\x92\xa5"), + ENTITY_DEF("psi", 968, "\xcf\x88"), + ENTITY_DEF("orarr", 8635, "\xe2\x86\xbb"), + ENTITY_DEF("Subset", 8912, "\xe2\x8b\x90"), + ENTITY_DEF("curarr", 8631, "\xe2\x86\xb7"), + ENTITY_DEF("CirclePlus", 8853, "\xe2\x8a\x95"), + ENTITY_DEF("gtrless", 8823, "\xe2\x89\xb7"), + ENTITY_DEF("nvle", 8804, "\xe2\x89\xa4\xe2\x83\x92"), + ENTITY_DEF("prop", 8733, "\xe2\x88\x9d"), + ENTITY_DEF("gEl", 10892, "\xe2\xaa\x8c"), + ENTITY_DEF("gtlPar", 10645, "\xe2\xa6\x95"), + ENTITY_DEF("frasl", 8260, "\xe2\x81\x84"), + ENTITY_DEF("nearr", 8599, "\xe2\x86\x97"), + ENTITY_DEF("NotSubsetEqual", 8840, "\xe2\x8a\x88"), + ENTITY_DEF("planck", 8463, "\xe2\x84\x8f"), + ENTITY_DEF_HEUR("Uuml", 220, "\xc3\x9c"), + ENTITY_DEF("spadesuit", 9824, "\xe2\x99\xa0"), + 
ENTITY_DEF_HEUR("sect", 167, "\xc2\xa7"), + ENTITY_DEF("cdot", 267, "\xc4\x8b"), + ENTITY_DEF("boxVh", 9579, "\xe2\x95\xab"), + ENTITY_DEF("zscr", 120015, "\xf0\x9d\x93\x8f"), + ENTITY_DEF("nsqsube", 8930, "\xe2\x8b\xa2"), + ENTITY_DEF("grave", 96, "\x60"), + ENTITY_DEF("angrtvb", 8894, "\xe2\x8a\xbe"), + ENTITY_DEF("MediumSpace", 8287, "\xe2\x81\x9f"), + ENTITY_DEF("Ntilde", 209, "\xc3\x91"), + ENTITY_DEF("solb", 10692, "\xe2\xa7\x84"), + ENTITY_DEF("angzarr", 9084, "\xe2\x8d\xbc"), + ENTITY_DEF("nopf", 120159, "\xf0\x9d\x95\x9f"), + ENTITY_DEF("rtrif", 9656, "\xe2\x96\xb8"), + ENTITY_DEF("nrightarrow", 8603, "\xe2\x86\x9b"), + ENTITY_DEF("Kappa", 922, "\xce\x9a"), + ENTITY_DEF("simrarr", 10610, "\xe2\xa5\xb2"), + ENTITY_DEF("imacr", 299, "\xc4\xab"), + ENTITY_DEF("vrtri", 8883, "\xe2\x8a\xb3"), + ENTITY_DEF("part", 8706, "\xe2\x88\x82"), + ENTITY_DEF("esim", 8770, "\xe2\x89\x82"), + ENTITY_DEF_HEUR("atilde", 227, "\xc3\xa3"), + ENTITY_DEF("DownRightTeeVector", 10591, "\xe2\xa5\x9f"), + ENTITY_DEF("jcirc", 309, "\xc4\xb5"), + ENTITY_DEF("Ecaron", 282, "\xc4\x9a"), + ENTITY_DEF("VerticalSeparator", 10072, "\xe2\x9d\x98"), + ENTITY_DEF("rHar", 10596, "\xe2\xa5\xa4"), + ENTITY_DEF("rcaron", 345, "\xc5\x99"), + ENTITY_DEF("subnE", 10955, "\xe2\xab\x8b"), + ENTITY_DEF("ii", 8520, "\xe2\x85\x88"), + ENTITY_DEF("Cconint", 8752, "\xe2\x88\xb0"), + ENTITY_DEF("Mcy", 1052, "\xd0\x9c"), + ENTITY_DEF("eqcolon", 8789, "\xe2\x89\x95"), + ENTITY_DEF("cupor", 10821, "\xe2\xa9\x85"), + ENTITY_DEF("DoubleUpArrow", 8657, "\xe2\x87\x91"), + ENTITY_DEF("boxbox", 10697, "\xe2\xa7\x89"), + ENTITY_DEF("setminus", 8726, "\xe2\x88\x96"), + ENTITY_DEF("Lleftarrow", 8666, "\xe2\x87\x9a"), + ENTITY_DEF("nang", 8736, "\xe2\x88\xa0\xe2\x83\x92"), + ENTITY_DEF("TRADE", 8482, "\xe2\x84\xa2"), + ENTITY_DEF("urcorner", 8989, "\xe2\x8c\x9d"), + ENTITY_DEF("lsqb", 91, "\x5b"), + ENTITY_DEF("cupcup", 10826, "\xe2\xa9\x8a"), + ENTITY_DEF("kjcy", 1116, "\xd1\x9c"), + ENTITY_DEF("llhard", 10603, 
"\xe2\xa5\xab"), + ENTITY_DEF("mumap", 8888, "\xe2\x8a\xb8"), + ENTITY_DEF("iiint", 8749, "\xe2\x88\xad"), + ENTITY_DEF("RightTee", 8866, "\xe2\x8a\xa2"), + ENTITY_DEF("Tcaron", 356, "\xc5\xa4"), + ENTITY_DEF("bigcirc", 9711, "\xe2\x97\xaf"), + ENTITY_DEF("trianglerighteq", 8885, "\xe2\x8a\xb5"), + ENTITY_DEF("NotLessGreater", 8824, "\xe2\x89\xb8"), + ENTITY_DEF("hArr", 8660, "\xe2\x87\x94"), + ENTITY_DEF("ocy", 1086, "\xd0\xbe"), + ENTITY_DEF("tosa", 10537, "\xe2\xa4\xa9"), + ENTITY_DEF("twixt", 8812, "\xe2\x89\xac"), + ENTITY_DEF("square", 9633, "\xe2\x96\xa1"), + ENTITY_DEF("Otimes", 10807, "\xe2\xa8\xb7"), + ENTITY_DEF("Kcedil", 310, "\xc4\xb6"), + ENTITY_DEF("beth", 8502, "\xe2\x84\xb6"), + ENTITY_DEF("triminus", 10810, "\xe2\xa8\xba"), + ENTITY_DEF("nlArr", 8653, "\xe2\x87\x8d"), + ENTITY_DEF("Oacute", 211, "\xc3\x93"), + ENTITY_DEF("zwnj", 8204, "\xe2\x80\x8c"), + ENTITY_DEF("ll", 8810, "\xe2\x89\xaa"), + ENTITY_DEF("smashp", 10803, "\xe2\xa8\xb3"), + ENTITY_DEF("ngeqq", 8807, "\xe2\x89\xa7\xcc\xb8"), + ENTITY_DEF("rnmid", 10990, "\xe2\xab\xae"), + ENTITY_DEF("nwArr", 8662, "\xe2\x87\x96"), + ENTITY_DEF("RightUpDownVector", 10575, "\xe2\xa5\x8f"), + ENTITY_DEF("lbbrk", 10098, "\xe2\x9d\xb2"), + ENTITY_DEF("compfn", 8728, "\xe2\x88\x98"), + ENTITY_DEF("eDDot", 10871, "\xe2\xa9\xb7"), + ENTITY_DEF("Jsercy", 1032, "\xd0\x88"), + ENTITY_DEF("HARDcy", 1066, "\xd0\xaa"), + ENTITY_DEF("nexists", 8708, "\xe2\x88\x84"), + ENTITY_DEF("theta", 952, "\xce\xb8"), + ENTITY_DEF("plankv", 8463, "\xe2\x84\x8f"), + ENTITY_DEF_HEUR("sup2", 178, "\xc2\xb2"), + ENTITY_DEF("lessapprox", 10885, "\xe2\xaa\x85"), + ENTITY_DEF("gdot", 289, "\xc4\xa1"), + ENTITY_DEF("angmsdae", 10668, "\xe2\xa6\xac"), + ENTITY_DEF("Superset", 8835, "\xe2\x8a\x83"), + ENTITY_DEF("prap", 10935, "\xe2\xaa\xb7"), + ENTITY_DEF("Zscr", 119989, "\xf0\x9d\x92\xb5"), + ENTITY_DEF("nsucc", 8833, "\xe2\x8a\x81"), + ENTITY_DEF("supseteqq", 10950, "\xe2\xab\x86"), + ENTITY_DEF("UpTee", 8869, "\xe2\x8a\xa5"), + 
ENTITY_DEF("LowerLeftArrow", 8601, "\xe2\x86\x99"), + ENTITY_DEF("ssmile", 8995, "\xe2\x8c\xa3"), + ENTITY_DEF("niv", 8715, "\xe2\x88\x8b"), + ENTITY_DEF("bigvee", 8897, "\xe2\x8b\x81"), + ENTITY_DEF("kscr", 120000, "\xf0\x9d\x93\x80"), + ENTITY_DEF("xutri", 9651, "\xe2\x96\xb3"), + ENTITY_DEF("caret", 8257, "\xe2\x81\x81"), + ENTITY_DEF("caron", 711, "\xcb\x87"), + ENTITY_DEF("Wedge", 8896, "\xe2\x8b\x80"), + ENTITY_DEF("sdotb", 8865, "\xe2\x8a\xa1"), + ENTITY_DEF("bigoplus", 10753, "\xe2\xa8\x81"), + ENTITY_DEF("Breve", 728, "\xcb\x98"), + ENTITY_DEF("ImaginaryI", 8520, "\xe2\x85\x88"), + ENTITY_DEF("longmapsto", 10236, "\xe2\x9f\xbc"), + ENTITY_DEF("boxVH", 9580, "\xe2\x95\xac"), + ENTITY_DEF("lozenge", 9674, "\xe2\x97\x8a"), + ENTITY_DEF("toea", 10536, "\xe2\xa4\xa8"), + ENTITY_DEF("nbumpe", 8783, "\xe2\x89\x8f\xcc\xb8"), + ENTITY_DEF("gcirc", 285, "\xc4\x9d"), + ENTITY_DEF("NotHumpEqual", 8783, "\xe2\x89\x8f\xcc\xb8"), + ENTITY_DEF("pre", 10927, "\xe2\xaa\xaf"), + ENTITY_DEF("ascr", 119990, "\xf0\x9d\x92\xb6"), + ENTITY_DEF("Acirc", 194, "\xc3\x82"), + ENTITY_DEF("questeq", 8799, "\xe2\x89\x9f"), + ENTITY_DEF("ncaron", 328, "\xc5\x88"), + ENTITY_DEF("LeftTeeArrow", 8612, "\xe2\x86\xa4"), + ENTITY_DEF("xcirc", 9711, "\xe2\x97\xaf"), + ENTITY_DEF("swarr", 8601, "\xe2\x86\x99"), + ENTITY_DEF("MinusPlus", 8723, "\xe2\x88\x93"), + ENTITY_DEF("plus", 43, "\x2b"), + ENTITY_DEF("NotDoubleVerticalBar", 8742, "\xe2\x88\xa6"), + ENTITY_DEF("rppolint", 10770, "\xe2\xa8\x92"), + ENTITY_DEF("NotTildeFullEqual", 8775, "\xe2\x89\x87"), + ENTITY_DEF("ltdot", 8918, "\xe2\x8b\x96"), + ENTITY_DEF("NotNestedGreaterGreater", 10914, "\xe2\xaa\xa2\xcc\xb8"), + ENTITY_DEF("Lscr", 8466, "\xe2\x84\x92"), + ENTITY_DEF("pitchfork", 8916, "\xe2\x8b\x94"), + ENTITY_DEF("Eopf", 120124, "\xf0\x9d\x94\xbc"), + ENTITY_DEF("ropf", 120163, "\xf0\x9d\x95\xa3"), + ENTITY_DEF("Delta", 916, "\xce\x94"), + ENTITY_DEF("lozf", 10731, "\xe2\xa7\xab"), + ENTITY_DEF("RightTeeVector", 10587, 
"\xe2\xa5\x9b"), + ENTITY_DEF("UpDownArrow", 8597, "\xe2\x86\x95"), + ENTITY_DEF("bump", 8782, "\xe2\x89\x8e"), + ENTITY_DEF("Rscr", 8475, "\xe2\x84\x9b"), + ENTITY_DEF("slarr", 8592, "\xe2\x86\x90"), + ENTITY_DEF("lcy", 1083, "\xd0\xbb"), + ENTITY_DEF("Vee", 8897, "\xe2\x8b\x81"), + ENTITY_DEF("Iogon", 302, "\xc4\xae"), + ENTITY_DEF("minus", 8722, "\xe2\x88\x92"), + ENTITY_DEF("GreaterFullEqual", 8807, "\xe2\x89\xa7"), + ENTITY_DEF("xhArr", 10234, "\xe2\x9f\xba"), + ENTITY_DEF("shortmid", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("DoubleDownArrow", 8659, "\xe2\x87\x93"), + ENTITY_DEF("Wscr", 119986, "\xf0\x9d\x92\xb2"), + ENTITY_DEF("rang", 10217, "\xe2\x9f\xa9"), + ENTITY_DEF("lcub", 123, "\x7b"), + ENTITY_DEF("mnplus", 8723, "\xe2\x88\x93"), + ENTITY_DEF("ulcrop", 8975, "\xe2\x8c\x8f"), + ENTITY_DEF("wfr", 120116, "\xf0\x9d\x94\xb4"), + ENTITY_DEF("DifferentialD", 8518, "\xe2\x85\x86"), + ENTITY_DEF("ThinSpace", 8201, "\xe2\x80\x89"), + ENTITY_DEF("NotGreaterGreater", 8811, "\xe2\x89\xab\xcc\xb8"), + ENTITY_DEF("Topf", 120139, "\xf0\x9d\x95\x8b"), + ENTITY_DEF("sbquo", 8218, "\xe2\x80\x9a"), + ENTITY_DEF("sdot", 8901, "\xe2\x8b\x85"), + ENTITY_DEF("DoubleLeftTee", 10980, "\xe2\xab\xa4"), + ENTITY_DEF("vBarv", 10985, "\xe2\xab\xa9"), + ENTITY_DEF("subne", 8842, "\xe2\x8a\x8a"), + ENTITY_DEF("gtrdot", 8919, "\xe2\x8b\x97"), + ENTITY_DEF("opar", 10679, "\xe2\xa6\xb7"), + ENTITY_DEF("apid", 8779, "\xe2\x89\x8b"), + ENTITY_DEF("Cross", 10799, "\xe2\xa8\xaf"), + ENTITY_DEF("lhblk", 9604, "\xe2\x96\x84"), + ENTITY_DEF("capcap", 10827, "\xe2\xa9\x8b"), + ENTITY_DEF("midast", 42, "\x2a"), + ENTITY_DEF("lscr", 120001, "\xf0\x9d\x93\x81"), + ENTITY_DEF("nGt", 8811, "\xe2\x89\xab\xe2\x83\x92"), + ENTITY_DEF_HEUR("Euml", 203, "\xc3\x8b"), + ENTITY_DEF("blacktriangledown", 9662, "\xe2\x96\xbe"), + ENTITY_DEF("Rcy", 1056, "\xd0\xa0"), + ENTITY_DEF("dfisht", 10623, "\xe2\xa5\xbf"), + ENTITY_DEF("dashv", 8867, "\xe2\x8a\xa3"), + ENTITY_DEF("ast", 42, "\x2a"), + 
ENTITY_DEF("ContourIntegral", 8750, "\xe2\x88\xae"), + ENTITY_DEF("Ofr", 120082, "\xf0\x9d\x94\x92"), + ENTITY_DEF("Lcy", 1051, "\xd0\x9b"), + ENTITY_DEF("nltrie", 8940, "\xe2\x8b\xac"), + ENTITY_DEF("ShortUpArrow", 8593, "\xe2\x86\x91"), + ENTITY_DEF("acy", 1072, "\xd0\xb0"), + ENTITY_DEF("rightarrow", 8594, "\xe2\x86\x92"), + ENTITY_DEF("UnderBar", 95, "\x5f"), + ENTITY_DEF("LongLeftArrow", 10229, "\xe2\x9f\xb5"), + ENTITY_DEF("andd", 10844, "\xe2\xa9\x9c"), + ENTITY_DEF("xlarr", 10229, "\xe2\x9f\xb5"), + ENTITY_DEF("percnt", 37, "\x25"), + ENTITY_DEF("rharu", 8640, "\xe2\x87\x80"), + ENTITY_DEF("plusdo", 8724, "\xe2\x88\x94"), + ENTITY_DEF("TScy", 1062, "\xd0\xa6"), + ENTITY_DEF("kcy", 1082, "\xd0\xba"), + ENTITY_DEF("boxVR", 9568, "\xe2\x95\xa0"), + ENTITY_DEF("looparrowleft", 8619, "\xe2\x86\xab"), + ENTITY_DEF("scirc", 349, "\xc5\x9d"), + ENTITY_DEF("drcorn", 8991, "\xe2\x8c\x9f"), + ENTITY_DEF("iiota", 8489, "\xe2\x84\xa9"), + ENTITY_DEF("Zcy", 1047, "\xd0\x97"), + ENTITY_DEF("frac58", 8541, "\xe2\x85\x9d"), + ENTITY_DEF("alpha", 945, "\xce\xb1"), + ENTITY_DEF("daleth", 8504, "\xe2\x84\xb8"), + ENTITY_DEF("gtreqless", 8923, "\xe2\x8b\x9b"), + ENTITY_DEF("tstrok", 359, "\xc5\xa7"), + ENTITY_DEF("plusb", 8862, "\xe2\x8a\x9e"), + ENTITY_DEF("odsold", 10684, "\xe2\xa6\xbc"), + ENTITY_DEF("varsupsetneqq", 10956, "\xe2\xab\x8c\xef\xb8\x80"), + ENTITY_DEF_HEUR("otilde", 245, "\xc3\xb5"), + ENTITY_DEF("gtcir", 10874, "\xe2\xa9\xba"), + ENTITY_DEF("lltri", 9722, "\xe2\x97\xba"), + ENTITY_DEF("rx", 8478, "\xe2\x84\x9e"), + ENTITY_DEF("ljcy", 1113, "\xd1\x99"), + ENTITY_DEF("parsim", 10995, "\xe2\xab\xb3"), + ENTITY_DEF("NotElement", 8713, "\xe2\x88\x89"), + ENTITY_DEF_HEUR("plusmn", 177, "\xc2\xb1"), + ENTITY_DEF("varsubsetneq", 8842, "\xe2\x8a\x8a\xef\xb8\x80"), + ENTITY_DEF("subset", 8834, "\xe2\x8a\x82"), + ENTITY_DEF("awint", 10769, "\xe2\xa8\x91"), + ENTITY_DEF("laemptyv", 10676, "\xe2\xa6\xb4"), + ENTITY_DEF("phiv", 981, "\xcf\x95"), + ENTITY_DEF("sfrown", 8994, 
"\xe2\x8c\xa2"), + ENTITY_DEF("DoubleUpDownArrow", 8661, "\xe2\x87\x95"), + ENTITY_DEF("lpar", 40, "\x28"), + ENTITY_DEF("frac45", 8536, "\xe2\x85\x98"), + ENTITY_DEF("rBarr", 10511, "\xe2\xa4\x8f"), + ENTITY_DEF("npolint", 10772, "\xe2\xa8\x94"), + ENTITY_DEF("emacr", 275, "\xc4\x93"), + ENTITY_DEF("maltese", 10016, "\xe2\x9c\xa0"), + ENTITY_DEF("PlusMinus", 177, "\xc2\xb1"), + ENTITY_DEF("ReverseEquilibrium", 8651, "\xe2\x87\x8b"), + ENTITY_DEF("oscr", 8500, "\xe2\x84\xb4"), + ENTITY_DEF("blacksquare", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("TSHcy", 1035, "\xd0\x8b"), + ENTITY_DEF("gap", 10886, "\xe2\xaa\x86"), + ENTITY_DEF("xnis", 8955, "\xe2\x8b\xbb"), + ENTITY_DEF("Ll", 8920, "\xe2\x8b\x98"), + ENTITY_DEF("PrecedesEqual", 10927, "\xe2\xaa\xaf"), + ENTITY_DEF("incare", 8453, "\xe2\x84\x85"), + ENTITY_DEF("nharr", 8622, "\xe2\x86\xae"), + ENTITY_DEF("varnothing", 8709, "\xe2\x88\x85"), + ENTITY_DEF("ShortDownArrow", 8595, "\xe2\x86\x93"), + ENTITY_DEF_HEUR("nbsp", 160, " "), + ENTITY_DEF("asympeq", 8781, "\xe2\x89\x8d"), + ENTITY_DEF("rbrkslu", 10640, "\xe2\xa6\x90"), + ENTITY_DEF("rho", 961, "\xcf\x81"), + ENTITY_DEF("Mscr", 8499, "\xe2\x84\xb3"), + ENTITY_DEF_HEUR("eth", 240, "\xc3\xb0"), + ENTITY_DEF("suplarr", 10619, "\xe2\xa5\xbb"), + ENTITY_DEF("Tab", 9, "\x09"), + ENTITY_DEF("omicron", 959, "\xce\xbf"), + ENTITY_DEF("blacktriangle", 9652, "\xe2\x96\xb4"), + ENTITY_DEF("nldr", 8229, "\xe2\x80\xa5"), + ENTITY_DEF("downharpoonleft", 8643, "\xe2\x87\x83"), + ENTITY_DEF("circledcirc", 8858, "\xe2\x8a\x9a"), + ENTITY_DEF("leftleftarrows", 8647, "\xe2\x87\x87"), + ENTITY_DEF("NotHumpDownHump", 8782, "\xe2\x89\x8e\xcc\xb8"), + ENTITY_DEF("nvgt", 62, "\x3e\xe2\x83\x92"), + ENTITY_DEF("rhard", 8641, "\xe2\x87\x81"), + ENTITY_DEF("nGg", 8921, "\xe2\x8b\x99\xcc\xb8"), + ENTITY_DEF("lurdshar", 10570, "\xe2\xa5\x8a"), + ENTITY_DEF("cirE", 10691, "\xe2\xa7\x83"), + ENTITY_DEF("isinE", 8953, "\xe2\x8b\xb9"), + ENTITY_DEF("eparsl", 10723, "\xe2\xa7\xa3"), + 
ENTITY_DEF("RightAngleBracket", 10217, "\xe2\x9f\xa9"), + ENTITY_DEF("hcirc", 293, "\xc4\xa5"), + ENTITY_DEF("bumpeq", 8783, "\xe2\x89\x8f"), + ENTITY_DEF("cire", 8791, "\xe2\x89\x97"), + ENTITY_DEF("dotplus", 8724, "\xe2\x88\x94"), + ENTITY_DEF("itilde", 297, "\xc4\xa9"), + ENTITY_DEF("uwangle", 10663, "\xe2\xa6\xa7"), + ENTITY_DEF("rlhar", 8652, "\xe2\x87\x8c"), + ENTITY_DEF("rbrace", 125, "\x7d"), + ENTITY_DEF("mid", 8739, "\xe2\x88\xa3"), + ENTITY_DEF("el", 10905, "\xe2\xaa\x99"), + ENTITY_DEF("KJcy", 1036, "\xd0\x8c"), + ENTITY_DEF("odiv", 10808, "\xe2\xa8\xb8"), + ENTITY_DEF("amacr", 257, "\xc4\x81"), + ENTITY_DEF("qprime", 8279, "\xe2\x81\x97"), + ENTITY_DEF("tcedil", 355, "\xc5\xa3"), + ENTITY_DEF("UpArrowDownArrow", 8645, "\xe2\x87\x85"), + ENTITY_DEF("spades", 9824, "\xe2\x99\xa0"), + ENTITY_DEF("napos", 329, "\xc5\x89"), + ENTITY_DEF("straightepsilon", 1013, "\xcf\xb5"), + ENTITY_DEF("CupCap", 8781, "\xe2\x89\x8d"), + ENTITY_DEF("Oopf", 120134, "\xf0\x9d\x95\x86"), + ENTITY_DEF("sub", 8834, "\xe2\x8a\x82"), + ENTITY_DEF("ohm", 937, "\xce\xa9"), + ENTITY_DEF("UnderBrace", 9183, "\xe2\x8f\x9f"), + ENTITY_DEF("looparrowright", 8620, "\xe2\x86\xac"), + ENTITY_DEF("xotime", 10754, "\xe2\xa8\x82"), + ENTITY_DEF("ntgl", 8825, "\xe2\x89\xb9"), + ENTITY_DEF("minusdu", 10794, "\xe2\xa8\xaa"), + ENTITY_DEF("rarrb", 8677, "\xe2\x87\xa5"), + ENTITY_DEF("nvlArr", 10498, "\xe2\xa4\x82"), + ENTITY_DEF("triangle", 9653, "\xe2\x96\xb5"), + ENTITY_DEF("nacute", 324, "\xc5\x84"), + ENTITY_DEF("boxHD", 9574, "\xe2\x95\xa6"), + ENTITY_DEF("ratio", 8758, "\xe2\x88\xb6"), + ENTITY_DEF("larrsim", 10611, "\xe2\xa5\xb3"), + ENTITY_DEF("LessLess", 10913, "\xe2\xaa\xa1"), + ENTITY_DEF("yacy", 1103, "\xd1\x8f"), + ENTITY_DEF("ctdot", 8943, "\xe2\x8b\xaf"), + ENTITY_DEF("and", 8743, "\xe2\x88\xa7"), + ENTITY_DEF("lrtri", 8895, "\xe2\x8a\xbf"), + ENTITY_DEF("eDot", 8785, "\xe2\x89\x91"), + ENTITY_DEF("sqsub", 8847, "\xe2\x8a\x8f"), + ENTITY_DEF("real", 8476, "\xe2\x84\x9c"), + 
ENTITY_DEF("Dcy", 1044, "\xd0\x94"), + ENTITY_DEF("vartheta", 977, "\xcf\x91"), + ENTITY_DEF("nsub", 8836, "\xe2\x8a\x84"), + ENTITY_DEF("DownTee", 8868, "\xe2\x8a\xa4"), + ENTITY_DEF_HEUR("acute", 180, "\xc2\xb4"), + ENTITY_DEF("GreaterLess", 8823, "\xe2\x89\xb7"), + ENTITY_DEF("supplus", 10944, "\xe2\xab\x80"), + ENTITY_DEF("Vbar", 10987, "\xe2\xab\xab"), + ENTITY_DEF("divideontimes", 8903, "\xe2\x8b\x87"), + ENTITY_DEF("lsim", 8818, "\xe2\x89\xb2"), + ENTITY_DEF("nearhk", 10532, "\xe2\xa4\xa4"), + ENTITY_DEF("nLtv", 8810, "\xe2\x89\xaa\xcc\xb8"), + ENTITY_DEF("RuleDelayed", 10740, "\xe2\xa7\xb4"), + ENTITY_DEF("smile", 8995, "\xe2\x8c\xa3"), + ENTITY_DEF("coprod", 8720, "\xe2\x88\x90"), + ENTITY_DEF("imof", 8887, "\xe2\x8a\xb7"), + ENTITY_DEF("ecy", 1101, "\xd1\x8d"), + ENTITY_DEF("RightCeiling", 8969, "\xe2\x8c\x89"), + ENTITY_DEF("dlcorn", 8990, "\xe2\x8c\x9e"), + ENTITY_DEF("Nu", 925, "\xce\x9d"), + ENTITY_DEF("frac18", 8539, "\xe2\x85\x9b"), + ENTITY_DEF("diamond", 8900, "\xe2\x8b\x84"), + ENTITY_DEF("Icirc", 206, "\xc3\x8e"), + ENTITY_DEF("ngeq", 8817, "\xe2\x89\xb1"), + ENTITY_DEF("epsilon", 949, "\xce\xb5"), + ENTITY_DEF("fork", 8916, "\xe2\x8b\x94"), + ENTITY_DEF("xrarr", 10230, "\xe2\x9f\xb6"), + ENTITY_DEF("racute", 341, "\xc5\x95"), + ENTITY_DEF("ntlg", 8824, "\xe2\x89\xb8"), + ENTITY_DEF("xvee", 8897, "\xe2\x8b\x81"), + ENTITY_DEF("LeftArrowRightArrow", 8646, "\xe2\x87\x86"), + ENTITY_DEF("DownLeftRightVector", 10576, "\xe2\xa5\x90"), + ENTITY_DEF("Eacute", 201, "\xc3\x89"), + ENTITY_DEF("gimel", 8503, "\xe2\x84\xb7"), + ENTITY_DEF("rtimes", 8906, "\xe2\x8b\x8a"), + ENTITY_DEF("forall", 8704, "\xe2\x88\x80"), + ENTITY_DEF("DiacriticalDoubleAcute", 733, "\xcb\x9d"), + ENTITY_DEF("dArr", 8659, "\xe2\x87\x93"), + ENTITY_DEF("fallingdotseq", 8786, "\xe2\x89\x92"), + ENTITY_DEF("Aogon", 260, "\xc4\x84"), + ENTITY_DEF("PartialD", 8706, "\xe2\x88\x82"), + ENTITY_DEF("mapstoup", 8613, "\xe2\x86\xa5"), + ENTITY_DEF("die", 168, "\xc2\xa8"), + ENTITY_DEF("ngt", 
8815, "\xe2\x89\xaf"), + ENTITY_DEF("vcy", 1074, "\xd0\xb2"), + ENTITY_DEF("fjlig", (unsigned) -1, "\x66\x6a"), + ENTITY_DEF("submult", 10945, "\xe2\xab\x81"), + ENTITY_DEF("ubrcy", 1118, "\xd1\x9e"), + ENTITY_DEF("ovbar", 9021, "\xe2\x8c\xbd"), + ENTITY_DEF("bsime", 8909, "\xe2\x8b\x8d"), + ENTITY_DEF("precnsim", 8936, "\xe2\x8b\xa8"), + ENTITY_DEF("DiacriticalTilde", 732, "\xcb\x9c"), + ENTITY_DEF("cwint", 8753, "\xe2\x88\xb1"), + ENTITY_DEF("Scy", 1057, "\xd0\xa1"), + ENTITY_DEF("NotGreaterEqual", 8817, "\xe2\x89\xb1"), + ENTITY_DEF("boxUR", 9562, "\xe2\x95\x9a"), + ENTITY_DEF("LessSlantEqual", 10877, "\xe2\xa9\xbd"), + ENTITY_DEF("Barwed", 8966, "\xe2\x8c\x86"), + ENTITY_DEF("supdot", 10942, "\xe2\xaa\xbe"), + ENTITY_DEF("gel", 8923, "\xe2\x8b\x9b"), + ENTITY_DEF("iscr", 119998, "\xf0\x9d\x92\xbe"), + ENTITY_DEF("doublebarwedge", 8966, "\xe2\x8c\x86"), + ENTITY_DEF("Idot", 304, "\xc4\xb0"), + ENTITY_DEF("DoubleDot", 168, "\xc2\xa8"), + ENTITY_DEF("rsquo", 8217, "\xe2\x80\x99"), + ENTITY_DEF("subsetneqq", 10955, "\xe2\xab\x8b"), + ENTITY_DEF("UpEquilibrium", 10606, "\xe2\xa5\xae"), + ENTITY_DEF("copysr", 8471, "\xe2\x84\x97"), + ENTITY_DEF("RightDoubleBracket", 10215, "\xe2\x9f\xa7"), + ENTITY_DEF("LeftRightVector", 10574, "\xe2\xa5\x8e"), + ENTITY_DEF("DownLeftVectorBar", 10582, "\xe2\xa5\x96"), + ENTITY_DEF("suphsub", 10967, "\xe2\xab\x97"), + ENTITY_DEF_HEUR("cedil", 184, "\xc2\xb8"), + ENTITY_DEF("prurel", 8880, "\xe2\x8a\xb0"), + ENTITY_DEF("imagpart", 8465, "\xe2\x84\x91"), + ENTITY_DEF("Hscr", 8459, "\xe2\x84\x8b"), + ENTITY_DEF("jmath", 567, "\xc8\xb7"), + ENTITY_DEF("nrtrie", 8941, "\xe2\x8b\xad"), + ENTITY_DEF("nsup", 8837, "\xe2\x8a\x85"), + ENTITY_DEF("Ubrcy", 1038, "\xd0\x8e"), + ENTITY_DEF("succnsim", 8937, "\xe2\x8b\xa9"), + ENTITY_DEF("nesim", 8770, "\xe2\x89\x82\xcc\xb8"), + ENTITY_DEF("varepsilon", 1013, "\xcf\xb5"), + ENTITY_DEF("DoubleRightTee", 8872, "\xe2\x8a\xa8"), + ENTITY_DEF_HEUR("not", 172, "\xc2\xac"), + ENTITY_DEF("lesdot", 10879, 
"\xe2\xa9\xbf"), + ENTITY_DEF("backepsilon", 1014, "\xcf\xb6"), + ENTITY_DEF("srarr", 8594, "\xe2\x86\x92"), + ENTITY_DEF("varsubsetneqq", 10955, "\xe2\xab\x8b\xef\xb8\x80"), + ENTITY_DEF("sqcap", 8851, "\xe2\x8a\x93"), + ENTITY_DEF("rightleftarrows", 8644, "\xe2\x87\x84"), + ENTITY_DEF("diams", 9830, "\xe2\x99\xa6"), + ENTITY_DEF("boxdR", 9554, "\xe2\x95\x92"), + ENTITY_DEF("ngeqslant", 10878, "\xe2\xa9\xbe\xcc\xb8"), + ENTITY_DEF("boxDR", 9556, "\xe2\x95\x94"), + ENTITY_DEF("sext", 10038, "\xe2\x9c\xb6"), + ENTITY_DEF("backsim", 8765, "\xe2\x88\xbd"), + ENTITY_DEF("nfr", 120107, "\xf0\x9d\x94\xab"), + ENTITY_DEF("CloseCurlyDoubleQuote", 8221, "\xe2\x80\x9d"), + ENTITY_DEF("npart", 8706, "\xe2\x88\x82\xcc\xb8"), + ENTITY_DEF("dharl", 8643, "\xe2\x87\x83"), + ENTITY_DEF("NewLine", 10, "\x0a"), + ENTITY_DEF("bigotimes", 10754, "\xe2\xa8\x82"), + ENTITY_DEF("lAtail", 10523, "\xe2\xa4\x9b"), + ENTITY_DEF_HEUR("frac14", 188, "\xc2\xbc"), + ENTITY_DEF("or", 8744, "\xe2\x88\xa8"), + ENTITY_DEF("subedot", 10947, "\xe2\xab\x83"), + ENTITY_DEF("nmid", 8740, "\xe2\x88\xa4"), + ENTITY_DEF("DownArrowUpArrow", 8693, "\xe2\x87\xb5"), + ENTITY_DEF("icy", 1080, "\xd0\xb8"), + ENTITY_DEF("num", 35, "\x23"), + ENTITY_DEF("Gdot", 288, "\xc4\xa0"), + ENTITY_DEF("urcrop", 8974, "\xe2\x8c\x8e"), + ENTITY_DEF("epsiv", 1013, "\xcf\xb5"), + ENTITY_DEF("topcir", 10993, "\xe2\xab\xb1"), + ENTITY_DEF("ne", 8800, "\xe2\x89\xa0"), + ENTITY_DEF("osol", 8856, "\xe2\x8a\x98"), + ENTITY_DEF_HEUR("amp", 38, "\x26"), + ENTITY_DEF("ncap", 10819, "\xe2\xa9\x83"), + ENTITY_DEF("Sscr", 119982, "\xf0\x9d\x92\xae"), + ENTITY_DEF("sung", 9834, "\xe2\x99\xaa"), + ENTITY_DEF("ltri", 9667, "\xe2\x97\x83"), + ENTITY_DEF("frac25", 8534, "\xe2\x85\x96"), + ENTITY_DEF("DZcy", 1039, "\xd0\x8f"), + ENTITY_DEF("RightUpVector", 8638, "\xe2\x86\xbe"), + ENTITY_DEF("rsquor", 8217, "\xe2\x80\x99"), + ENTITY_DEF("uplus", 8846, "\xe2\x8a\x8e"), + ENTITY_DEF("triangleright", 9657, "\xe2\x96\xb9"), + ENTITY_DEF("lAarr", 
8666, "\xe2\x87\x9a"), + ENTITY_DEF("HilbertSpace", 8459, "\xe2\x84\x8b"), + ENTITY_DEF("there4", 8756, "\xe2\x88\xb4"), + ENTITY_DEF("vscr", 120011, "\xf0\x9d\x93\x8b"), + ENTITY_DEF("cirscir", 10690, "\xe2\xa7\x82"), + ENTITY_DEF("roarr", 8702, "\xe2\x87\xbe"), + ENTITY_DEF("hslash", 8463, "\xe2\x84\x8f"), + ENTITY_DEF("supdsub", 10968, "\xe2\xab\x98"), + ENTITY_DEF("simg", 10910, "\xe2\xaa\x9e"), + ENTITY_DEF("trade", 8482, "\xe2\x84\xa2"), + ENTITY_DEF("searrow", 8600, "\xe2\x86\x98"), + ENTITY_DEF("DownLeftVector", 8637, "\xe2\x86\xbd"), + ENTITY_DEF("FilledSmallSquare", 9724, "\xe2\x97\xbc"), + ENTITY_DEF("prod", 8719, "\xe2\x88\x8f"), + ENTITY_DEF("oror", 10838, "\xe2\xa9\x96"), + ENTITY_DEF("udarr", 8645, "\xe2\x87\x85"), + ENTITY_DEF("jsercy", 1112, "\xd1\x98"), + ENTITY_DEF("tprime", 8244, "\xe2\x80\xb4"), + ENTITY_DEF("bprime", 8245, "\xe2\x80\xb5"), + ENTITY_DEF("malt", 10016, "\xe2\x9c\xa0"), + ENTITY_DEF("bigcup", 8899, "\xe2\x8b\x83"), + ENTITY_DEF("oint", 8750, "\xe2\x88\xae"), + ENTITY_DEF("female", 9792, "\xe2\x99\x80"), + ENTITY_DEF("omacr", 333, "\xc5\x8d"), + ENTITY_DEF("SquareSubsetEqual", 8849, "\xe2\x8a\x91"), + ENTITY_DEF("SucceedsEqual", 10928, "\xe2\xaa\xb0"), + ENTITY_DEF("plusacir", 10787, "\xe2\xa8\xa3"), + ENTITY_DEF("Gcirc", 284, "\xc4\x9c"), + ENTITY_DEF("lesdotor", 10883, "\xe2\xaa\x83"), + ENTITY_DEF("escr", 8495, "\xe2\x84\xaf"), + ENTITY_DEF_HEUR("THORN", 222, "\xc3\x9e"), + ENTITY_DEF("UpArrowBar", 10514, "\xe2\xa4\x92"), + ENTITY_DEF("nvrtrie", 8885, "\xe2\x8a\xb5\xe2\x83\x92"), + ENTITY_DEF("varkappa", 1008, "\xcf\xb0"), + ENTITY_DEF("NotReverseElement", 8716, "\xe2\x88\x8c"), + ENTITY_DEF("zdot", 380, "\xc5\xbc"), + ENTITY_DEF("ExponentialE", 8519, "\xe2\x85\x87"), + ENTITY_DEF("lesseqgtr", 8922, "\xe2\x8b\x9a"), + ENTITY_DEF("cscr", 119992, "\xf0\x9d\x92\xb8"), + ENTITY_DEF("Dscr", 119967, "\xf0\x9d\x92\x9f"), + ENTITY_DEF("lthree", 8907, "\xe2\x8b\x8b"), + ENTITY_DEF("Ccedil", 199, "\xc3\x87"), + ENTITY_DEF("nge", 8817, 
"\xe2\x89\xb1"), + ENTITY_DEF("UpperLeftArrow", 8598, "\xe2\x86\x96"), + ENTITY_DEF("vDash", 8872, "\xe2\x8a\xa8"), + ENTITY_DEF("efDot", 8786, "\xe2\x89\x92"), + ENTITY_DEF("telrec", 8981, "\xe2\x8c\x95"), + ENTITY_DEF("vellip", 8942, "\xe2\x8b\xae"), + ENTITY_DEF("nrArr", 8655, "\xe2\x87\x8f"), + ENTITY_DEF_HEUR("ugrave", 249, "\xc3\xb9"), + ENTITY_DEF("uring", 367, "\xc5\xaf"), + ENTITY_DEF("Bernoullis", 8492, "\xe2\x84\xac"), + ENTITY_DEF("nles", 10877, "\xe2\xa9\xbd\xcc\xb8"), + ENTITY_DEF_HEUR("macr", 175, "\xc2\xaf"), + ENTITY_DEF("boxuR", 9560, "\xe2\x95\x98"), + ENTITY_DEF("clubsuit", 9827, "\xe2\x99\xa3"), + ENTITY_DEF("rightarrowtail", 8611, "\xe2\x86\xa3"), + ENTITY_DEF("epar", 8917, "\xe2\x8b\x95"), + ENTITY_DEF("ltcc", 10918, "\xe2\xaa\xa6"), + ENTITY_DEF("twoheadleftarrow", 8606, "\xe2\x86\x9e"), + ENTITY_DEF("aleph", 8501, "\xe2\x84\xb5"), + ENTITY_DEF("Colon", 8759, "\xe2\x88\xb7"), + ENTITY_DEF("vltri", 8882, "\xe2\x8a\xb2"), + ENTITY_DEF("quaternions", 8461, "\xe2\x84\x8d"), + ENTITY_DEF("rfr", 120111, "\xf0\x9d\x94\xaf"), + ENTITY_DEF_HEUR("Ouml", 214, "\xc3\x96"), + ENTITY_DEF("rsh", 8625, "\xe2\x86\xb1"), + ENTITY_DEF("emptyv", 8709, "\xe2\x88\x85"), + ENTITY_DEF("sqsup", 8848, "\xe2\x8a\x90"), + ENTITY_DEF("marker", 9646, "\xe2\x96\xae"), + ENTITY_DEF("Efr", 120072, "\xf0\x9d\x94\x88"), + ENTITY_DEF("DotEqual", 8784, "\xe2\x89\x90"), + ENTITY_DEF("eqsim", 8770, "\xe2\x89\x82"), + ENTITY_DEF("NotSucceedsEqual", 10928, "\xe2\xaa\xb0\xcc\xb8"), + ENTITY_DEF("primes", 8473, "\xe2\x84\x99"), + ENTITY_DEF_HEUR("times", 215, "\xc3\x97"), + ENTITY_DEF("rangd", 10642, "\xe2\xa6\x92"), + ENTITY_DEF("rightharpoonup", 8640, "\xe2\x87\x80"), + ENTITY_DEF("lrhard", 10605, "\xe2\xa5\xad"), + ENTITY_DEF("ape", 8778, "\xe2\x89\x8a"), + ENTITY_DEF("varsupsetneq", 8843, "\xe2\x8a\x8b\xef\xb8\x80"), + ENTITY_DEF("larrlp", 8619, "\xe2\x86\xab"), + ENTITY_DEF("NotPrecedesEqual", 10927, "\xe2\xaa\xaf\xcc\xb8"), + ENTITY_DEF("ulcorner", 8988, "\xe2\x8c\x9c"), + 
ENTITY_DEF("acd", 8767, "\xe2\x88\xbf"), + ENTITY_DEF("Hacek", 711, "\xcb\x87"), + ENTITY_DEF("xuplus", 10756, "\xe2\xa8\x84"), + ENTITY_DEF("therefore", 8756, "\xe2\x88\xb4"), + ENTITY_DEF("YIcy", 1031, "\xd0\x87"), + ENTITY_DEF("Tfr", 120087, "\xf0\x9d\x94\x97"), + ENTITY_DEF("Jcirc", 308, "\xc4\xb4"), + ENTITY_DEF("LessGreater", 8822, "\xe2\x89\xb6"), + ENTITY_DEF("Uring", 366, "\xc5\xae"), + ENTITY_DEF("Ugrave", 217, "\xc3\x99"), + ENTITY_DEF("rarr", 8594, "\xe2\x86\x92"), + ENTITY_DEF("wopf", 120168, "\xf0\x9d\x95\xa8"), + ENTITY_DEF("imath", 305, "\xc4\xb1"), + ENTITY_DEF("Yopf", 120144, "\xf0\x9d\x95\x90"), + ENTITY_DEF("colone", 8788, "\xe2\x89\x94"), + ENTITY_DEF("csube", 10961, "\xe2\xab\x91"), + ENTITY_DEF("odash", 8861, "\xe2\x8a\x9d"), + ENTITY_DEF("olarr", 8634, "\xe2\x86\xba"), + ENTITY_DEF("angrt", 8735, "\xe2\x88\x9f"), + ENTITY_DEF("NotLeftTriangleBar", 10703, "\xe2\xa7\x8f\xcc\xb8"), + ENTITY_DEF("GreaterEqual", 8805, "\xe2\x89\xa5"), + ENTITY_DEF("scnap", 10938, "\xe2\xaa\xba"), + ENTITY_DEF("pi", 960, "\xcf\x80"), + ENTITY_DEF("lesg", 8922, "\xe2\x8b\x9a\xef\xb8\x80"), + ENTITY_DEF("orderof", 8500, "\xe2\x84\xb4"), + ENTITY_DEF_HEUR("uacute", 250, "\xc3\xba"), + ENTITY_DEF("Barv", 10983, "\xe2\xab\xa7"), + ENTITY_DEF("Theta", 920, "\xce\x98"), + ENTITY_DEF("leftrightsquigarrow", 8621, "\xe2\x86\xad"), + ENTITY_DEF("Atilde", 195, "\xc3\x83"), + ENTITY_DEF("cupdot", 8845, "\xe2\x8a\x8d"), + ENTITY_DEF("ntriangleright", 8939, "\xe2\x8b\xab"), + ENTITY_DEF("measuredangle", 8737, "\xe2\x88\xa1"), + ENTITY_DEF("jscr", 119999, "\xf0\x9d\x92\xbf"), + ENTITY_DEF("inodot", 305, "\xc4\xb1"), + ENTITY_DEF("mopf", 120158, "\xf0\x9d\x95\x9e"), + ENTITY_DEF("hkswarow", 10534, "\xe2\xa4\xa6"), + ENTITY_DEF("lopar", 10629, "\xe2\xa6\x85"), + ENTITY_DEF("thksim", 8764, "\xe2\x88\xbc"), + ENTITY_DEF("bkarow", 10509, "\xe2\xa4\x8d"), + ENTITY_DEF("rarrfs", 10526, "\xe2\xa4\x9e"), + ENTITY_DEF("ntrianglelefteq", 8940, "\xe2\x8b\xac"), + ENTITY_DEF("Bscr", 8492, 
"\xe2\x84\xac"), + ENTITY_DEF("topf", 120165, "\xf0\x9d\x95\xa5"), + ENTITY_DEF("Uacute", 218, "\xc3\x9a"), + ENTITY_DEF("lap", 10885, "\xe2\xaa\x85"), + ENTITY_DEF("djcy", 1106, "\xd1\x92"), + ENTITY_DEF("bopf", 120147, "\xf0\x9d\x95\x93"), + ENTITY_DEF("empty", 8709, "\xe2\x88\x85"), + ENTITY_DEF("LeftAngleBracket", 10216, "\xe2\x9f\xa8"), + ENTITY_DEF("Imacr", 298, "\xc4\xaa"), + ENTITY_DEF("ltcir", 10873, "\xe2\xa9\xb9"), + ENTITY_DEF("trisb", 10701, "\xe2\xa7\x8d"), + ENTITY_DEF("gjcy", 1107, "\xd1\x93"), + ENTITY_DEF("pr", 8826, "\xe2\x89\xba"), + ENTITY_DEF("Mu", 924, "\xce\x9c"), + ENTITY_DEF("ogon", 731, "\xcb\x9b"), + ENTITY_DEF("pertenk", 8241, "\xe2\x80\xb1"), + ENTITY_DEF("plustwo", 10791, "\xe2\xa8\xa7"), + ENTITY_DEF("Vfr", 120089, "\xf0\x9d\x94\x99"), + ENTITY_DEF("ApplyFunction", 8289, "\xe2\x81\xa1"), + ENTITY_DEF("Sub", 8912, "\xe2\x8b\x90"), + ENTITY_DEF("DoubleLeftRightArrow", 8660, "\xe2\x87\x94"), + ENTITY_DEF("Lmidot", 319, "\xc4\xbf"), + ENTITY_DEF("nwarrow", 8598, "\xe2\x86\x96"), + ENTITY_DEF("angrtvbd", 10653, "\xe2\xa6\x9d"), + ENTITY_DEF("fcy", 1092, "\xd1\x84"), + ENTITY_DEF("ltlarr", 10614, "\xe2\xa5\xb6"), + ENTITY_DEF("CircleMinus", 8854, "\xe2\x8a\x96"), + ENTITY_DEF("angmsdab", 10665, "\xe2\xa6\xa9"), + ENTITY_DEF("wedgeq", 8793, "\xe2\x89\x99"), + ENTITY_DEF("iogon", 303, "\xc4\xaf"), + ENTITY_DEF_HEUR("laquo", 171, "\xc2\xab"), + ENTITY_DEF("NestedGreaterGreater", 8811, "\xe2\x89\xab"), + ENTITY_DEF("UnionPlus", 8846, "\xe2\x8a\x8e"), + ENTITY_DEF("CircleDot", 8857, "\xe2\x8a\x99"), + ENTITY_DEF("coloneq", 8788, "\xe2\x89\x94"), + ENTITY_DEF("csupe", 10962, "\xe2\xab\x92"), + ENTITY_DEF("tcaron", 357, "\xc5\xa5"), + ENTITY_DEF("GreaterTilde", 8819, "\xe2\x89\xb3"), + ENTITY_DEF("Map", 10501, "\xe2\xa4\x85"), + ENTITY_DEF("DoubleLongLeftArrow", 10232, "\xe2\x9f\xb8"), + ENTITY_DEF("Uparrow", 8657, "\xe2\x87\x91"), + ENTITY_DEF("scy", 1089, "\xd1\x81"), + ENTITY_DEF("llarr", 8647, "\xe2\x87\x87"), + ENTITY_DEF("rangle", 10217, 
"\xe2\x9f\xa9"), + ENTITY_DEF("sstarf", 8902, "\xe2\x8b\x86"), + ENTITY_DEF("InvisibleTimes", 8290, "\xe2\x81\xa2"), + ENTITY_DEF("egsdot", 10904, "\xe2\xaa\x98"), + ENTITY_DEF("target", 8982, "\xe2\x8c\x96"), + ENTITY_DEF("lesges", 10899, "\xe2\xaa\x93"), + ENTITY_DEF_HEUR("curren", 164, "\xc2\xa4"), + ENTITY_DEF("yopf", 120170, "\xf0\x9d\x95\xaa"), + ENTITY_DEF("frac23", 8532, "\xe2\x85\x94"), + ENTITY_DEF("NotSucceedsTilde", 8831, "\xe2\x89\xbf\xcc\xb8"), + ENTITY_DEF("napprox", 8777, "\xe2\x89\x89"), + ENTITY_DEF("odblac", 337, "\xc5\x91"), + ENTITY_DEF("gammad", 989, "\xcf\x9d"), + ENTITY_DEF("dscr", 119993, "\xf0\x9d\x92\xb9"), + ENTITY_DEF("SupersetEqual", 8839, "\xe2\x8a\x87"), + ENTITY_DEF("squf", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("Because", 8757, "\xe2\x88\xb5"), + ENTITY_DEF("sccue", 8829, "\xe2\x89\xbd"), + ENTITY_DEF("KHcy", 1061, "\xd0\xa5"), + ENTITY_DEF("Wcirc", 372, "\xc5\xb4"), + ENTITY_DEF("uparrow", 8593, "\xe2\x86\x91"), + ENTITY_DEF("lessgtr", 8822, "\xe2\x89\xb6"), + ENTITY_DEF("thickapprox", 8776, "\xe2\x89\x88"), + ENTITY_DEF("lbrksld", 10639, "\xe2\xa6\x8f"), + ENTITY_DEF_HEUR("oslash", 248, "\xc3\xb8"), + ENTITY_DEF("NotCupCap", 8813, "\xe2\x89\xad"), + ENTITY_DEF("elinters", 9191, "\xe2\x8f\xa7"), + ENTITY_DEF("Assign", 8788, "\xe2\x89\x94"), + ENTITY_DEF("ClockwiseContourIntegral", 8754, "\xe2\x88\xb2"), + ENTITY_DEF("lfisht", 10620, "\xe2\xa5\xbc"), + ENTITY_DEF("DownArrow", 8595, "\xe2\x86\x93"), + ENTITY_DEF("Zdot", 379, "\xc5\xbb"), + ENTITY_DEF("xscr", 120013, "\xf0\x9d\x93\x8d"), + ENTITY_DEF("DiacriticalGrave", 96, "\x60"), + ENTITY_DEF("DoubleLongLeftRightArrow", 10234, "\xe2\x9f\xba"), + ENTITY_DEF("angle", 8736, "\xe2\x88\xa0"), + ENTITY_DEF("race", 8765, "\xe2\x88\xbd\xcc\xb1"), + ENTITY_DEF("Ascr", 119964, "\xf0\x9d\x92\x9c"), + ENTITY_DEF("Xscr", 119987, "\xf0\x9d\x92\xb3"), + ENTITY_DEF_HEUR("acirc", 226, "\xc3\xa2"), + ENTITY_DEF("otimesas", 10806, "\xe2\xa8\xb6"), + ENTITY_DEF("gscr", 8458, "\xe2\x84\x8a"), + 
ENTITY_DEF("gcy", 1075, "\xd0\xb3"), + ENTITY_DEF("angmsdag", 10670, "\xe2\xa6\xae"), + ENTITY_DEF("tshcy", 1115, "\xd1\x9b"), + ENTITY_DEF("Acy", 1040, "\xd0\x90"), + ENTITY_DEF("NotGreaterLess", 8825, "\xe2\x89\xb9"), + ENTITY_DEF("dtdot", 8945, "\xe2\x8b\xb1"), + ENTITY_DEF_HEUR("quot", 34, "\x22"), + ENTITY_DEF_HEUR("micro", 181, "\xc2\xb5"), + ENTITY_DEF("simplus", 10788, "\xe2\xa8\xa4"), + ENTITY_DEF("nsupseteq", 8841, "\xe2\x8a\x89"), + ENTITY_DEF("Ufr", 120088, "\xf0\x9d\x94\x98"), + ENTITY_DEF("Pr", 10939, "\xe2\xaa\xbb"), + ENTITY_DEF("napid", 8779, "\xe2\x89\x8b\xcc\xb8"), + ENTITY_DEF("rceil", 8969, "\xe2\x8c\x89"), + ENTITY_DEF("boxtimes", 8864, "\xe2\x8a\xa0"), + ENTITY_DEF("erarr", 10609, "\xe2\xa5\xb1"), + ENTITY_DEF("downdownarrows", 8650, "\xe2\x87\x8a"), + ENTITY_DEF("Kfr", 120078, "\xf0\x9d\x94\x8e"), + ENTITY_DEF("mho", 8487, "\xe2\x84\xa7"), + ENTITY_DEF("scpolint", 10771, "\xe2\xa8\x93"), + ENTITY_DEF("vArr", 8661, "\xe2\x87\x95"), + ENTITY_DEF("Ccaron", 268, "\xc4\x8c"), + ENTITY_DEF("NotRightTriangle", 8939, "\xe2\x8b\xab"), + ENTITY_DEF("topbot", 9014, "\xe2\x8c\xb6"), + ENTITY_DEF("qopf", 120162, "\xf0\x9d\x95\xa2"), + ENTITY_DEF("eogon", 281, "\xc4\x99"), + ENTITY_DEF("luruhar", 10598, "\xe2\xa5\xa6"), + ENTITY_DEF("gtdot", 8919, "\xe2\x8b\x97"), + ENTITY_DEF("Egrave", 200, "\xc3\x88"), + ENTITY_DEF("roplus", 10798, "\xe2\xa8\xae"), + ENTITY_DEF("Intersection", 8898, "\xe2\x8b\x82"), + ENTITY_DEF("Uarr", 8607, "\xe2\x86\x9f"), + ENTITY_DEF("dcy", 1076, "\xd0\xb4"), + ENTITY_DEF("boxvl", 9508, "\xe2\x94\xa4"), + ENTITY_DEF("RightArrowBar", 8677, "\xe2\x87\xa5"), + ENTITY_DEF_HEUR("yuml", 255, "\xc3\xbf"), + ENTITY_DEF("parallel", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("succneqq", 10934, "\xe2\xaa\xb6"), + ENTITY_DEF("bemptyv", 10672, "\xe2\xa6\xb0"), + ENTITY_DEF("starf", 9733, "\xe2\x98\x85"), + ENTITY_DEF("OverBar", 8254, "\xe2\x80\xbe"), + ENTITY_DEF("Alpha", 913, "\xce\x91"), + ENTITY_DEF("LeftUpVectorBar", 10584, "\xe2\xa5\x98"), + 
ENTITY_DEF("ufr", 120114, "\xf0\x9d\x94\xb2"), + ENTITY_DEF("swarhk", 10534, "\xe2\xa4\xa6"), + ENTITY_DEF("GreaterEqualLess", 8923, "\xe2\x8b\x9b"), + ENTITY_DEF("sscr", 120008, "\xf0\x9d\x93\x88"), + ENTITY_DEF("Pi", 928, "\xce\xa0"), + ENTITY_DEF("boxh", 9472, "\xe2\x94\x80"), + ENTITY_DEF("frac16", 8537, "\xe2\x85\x99"), + ENTITY_DEF("lbrack", 91, "\x5b"), + ENTITY_DEF("vert", 124, "\x7c"), + ENTITY_DEF("precneqq", 10933, "\xe2\xaa\xb5"), + ENTITY_DEF("NotGreaterSlantEqual", 10878, "\xe2\xa9\xbe\xcc\xb8"), + ENTITY_DEF("Omega", 937, "\xce\xa9"), + ENTITY_DEF("uarr", 8593, "\xe2\x86\x91"), + ENTITY_DEF("boxVr", 9567, "\xe2\x95\x9f"), + ENTITY_DEF("ruluhar", 10600, "\xe2\xa5\xa8"), + ENTITY_DEF("ShortLeftArrow", 8592, "\xe2\x86\x90"), + ENTITY_DEF("Qfr", 120084, "\xf0\x9d\x94\x94"), + ENTITY_DEF("olt", 10688, "\xe2\xa7\x80"), + ENTITY_DEF("nequiv", 8802, "\xe2\x89\xa2"), + ENTITY_DEF("fscr", 119995, "\xf0\x9d\x92\xbb"), + ENTITY_DEF("rarrhk", 8618, "\xe2\x86\xaa"), + ENTITY_DEF("nsqsupe", 8931, "\xe2\x8b\xa3"), + ENTITY_DEF("nsubseteq", 8840, "\xe2\x8a\x88"), + ENTITY_DEF("numero", 8470, "\xe2\x84\x96"), + ENTITY_DEF("emsp14", 8197, "\xe2\x80\x85"), + ENTITY_DEF("gl", 8823, "\xe2\x89\xb7"), + ENTITY_DEF("ocirc", 244, "\xc3\xb4"), + ENTITY_DEF("weierp", 8472, "\xe2\x84\x98"), + ENTITY_DEF("boxvL", 9569, "\xe2\x95\xa1"), + ENTITY_DEF("RightArrowLeftArrow", 8644, "\xe2\x87\x84"), + ENTITY_DEF("Precedes", 8826, "\xe2\x89\xba"), + ENTITY_DEF("RightVector", 8640, "\xe2\x87\x80"), + ENTITY_DEF("xcup", 8899, "\xe2\x8b\x83"), + ENTITY_DEF("angmsdad", 10667, "\xe2\xa6\xab"), + ENTITY_DEF("gtrsim", 8819, "\xe2\x89\xb3"), + ENTITY_DEF("natural", 9838, "\xe2\x99\xae"), + ENTITY_DEF("nVdash", 8878, "\xe2\x8a\xae"), + ENTITY_DEF("RightTriangleEqual", 8885, "\xe2\x8a\xb5"), + ENTITY_DEF("dscy", 1109, "\xd1\x95"), + ENTITY_DEF("leftthreetimes", 8907, "\xe2\x8b\x8b"), + ENTITY_DEF("prsim", 8830, "\xe2\x89\xbe"), + ENTITY_DEF("Bcy", 1041, "\xd0\x91"), + ENTITY_DEF("Chi", 935, 
"\xce\xa7"), + ENTITY_DEF("timesb", 8864, "\xe2\x8a\xa0"), + ENTITY_DEF("Del", 8711, "\xe2\x88\x87"), + ENTITY_DEF("lmidot", 320, "\xc5\x80"), + ENTITY_DEF("RightDownVector", 8642, "\xe2\x87\x82"), + ENTITY_DEF("simdot", 10858, "\xe2\xa9\xaa"), + ENTITY_DEF("FilledVerySmallSquare", 9642, "\xe2\x96\xaa"), + ENTITY_DEF("NotLessSlantEqual", 10877, "\xe2\xa9\xbd\xcc\xb8"), + ENTITY_DEF("SucceedsTilde", 8831, "\xe2\x89\xbf"), + ENTITY_DEF("duarr", 8693, "\xe2\x87\xb5"), + ENTITY_DEF("apE", 10864, "\xe2\xa9\xb0"), + ENTITY_DEF("odot", 8857, "\xe2\x8a\x99"), + ENTITY_DEF("mldr", 8230, "\xe2\x80\xa6"), + ENTITY_DEF("Uarrocir", 10569, "\xe2\xa5\x89"), + ENTITY_DEF("nLl", 8920, "\xe2\x8b\x98\xcc\xb8"), + ENTITY_DEF("rarrpl", 10565, "\xe2\xa5\x85"), + ENTITY_DEF("cir", 9675, "\xe2\x97\x8b"), + ENTITY_DEF("blk14", 9617, "\xe2\x96\x91"), + ENTITY_DEF("VerticalLine", 124, "\x7c"), + ENTITY_DEF("jcy", 1081, "\xd0\xb9"), + ENTITY_DEF("filig", 64257, "\xef\xac\x81"), + ENTITY_DEF("LongRightArrow", 10230, "\xe2\x9f\xb6"), + ENTITY_DEF("beta", 946, "\xce\xb2"), + ENTITY_DEF("ccupssm", 10832, "\xe2\xa9\x90"), + ENTITY_DEF("supsub", 10964, "\xe2\xab\x94"), + ENTITY_DEF("spar", 8741, "\xe2\x88\xa5"), + ENTITY_DEF("Tstrok", 358, "\xc5\xa6"), + ENTITY_DEF("isinv", 8712, "\xe2\x88\x88"), + ENTITY_DEF("rightsquigarrow", 8605, "\xe2\x86\x9d"), + ENTITY_DEF("Diamond", 8900, "\xe2\x8b\x84"), + ENTITY_DEF("curlyeqsucc", 8927, "\xe2\x8b\x9f"), + ENTITY_DEF("ijlig", 307, "\xc4\xb3"), + ENTITY_DEF("puncsp", 8200, "\xe2\x80\x88"), + ENTITY_DEF("hamilt", 8459, "\xe2\x84\x8b"), + ENTITY_DEF("mapstoleft", 8612, "\xe2\x86\xa4"), + ENTITY_DEF("Copf", 8450, "\xe2\x84\x82"), + ENTITY_DEF("prnsim", 8936, "\xe2\x8b\xa8"), + ENTITY_DEF("DotDot", 8412, "\xe2\x83\x9c"), + ENTITY_DEF("lobrk", 10214, "\xe2\x9f\xa6"), + ENTITY_DEF("twoheadrightarrow", 8608, "\xe2\x86\xa0"), + ENTITY_DEF("ngE", 8807, "\xe2\x89\xa7\xcc\xb8"), + ENTITY_DEF("cylcty", 9005, "\xe2\x8c\xad"), + ENTITY_DEF("sube", 8838, "\xe2\x8a\x86"), 
+ ENTITY_DEF("NotEqualTilde", 8770, "\xe2\x89\x82\xcc\xb8"), + ENTITY_DEF_HEUR("Yuml", 376, "\xc5\xb8"), + ENTITY_DEF("comp", 8705, "\xe2\x88\x81"), + ENTITY_DEF("dotminus", 8760, "\xe2\x88\xb8"), + ENTITY_DEF("crarr", 8629, "\xe2\x86\xb5"), + ENTITY_DEF("imped", 437, "\xc6\xb5"), + ENTITY_DEF("barwedge", 8965, "\xe2\x8c\x85"), + ENTITY_DEF("harrcir", 10568, "\xe2\xa5\x88")}; + +class html_entities_storage { + ankerl::unordered_dense::map<std::string_view, html_entity_def> entity_by_name; + ankerl::unordered_dense::map<std::string_view, html_entity_def> entity_by_name_heur; + ankerl::unordered_dense::map<unsigned, html_entity_def> entity_by_id; + +public: + html_entities_storage() + { + auto nelts = G_N_ELEMENTS(html_entities_array); + entity_by_name.reserve(nelts); + entity_by_id.reserve(nelts); + + for (const auto &e: html_entities_array) { + entity_by_name[e.name] = e; + entity_by_id[e.code] = e; + + if (e.allow_heuristic) { + entity_by_name_heur[e.name] = e; + } + } + } + + auto by_name(std::string_view name, bool use_heuristic = false) const -> const html_entity_def * + { + const decltype(entity_by_name) *htb; + + if (use_heuristic) { + htb = &entity_by_name_heur; + } + else { + htb = &entity_by_name; + } + auto it = htb->find(name); + + if (it != htb->end()) { + return &(it->second); + } + + return nullptr; + } + + auto by_id(int id) const -> const html_entity_def * + { + auto it = entity_by_id.find(id); + if (it != entity_by_id.end()) { + return &(it->second); + } + + return nullptr; + } +}; + +static const html_entities_storage html_entities_defs; + +std::size_t +decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) +{ + /* + * t - tortoise (destination ptr) + * h - hare (source ptr) + * e - begin of entity + */ + char *t = s, *h = s, *e = s; + const gchar *end; + bool seen_hash = false, seen_hex = false; + enum { + do_undefined, + do_digits_only, + do_mixed, + } seen_digit_only; + enum class parser_state { + normal_content, + ampersand, 
+ skip_multi_spaces, + skip_start_spaces, + } state = parser_state::normal_content; + + end = s + len; + + auto replace_named_entity = [&](const char *entity, std::size_t len) -> bool { + const auto *entity_def = html_entities_defs.by_name({entity, + (std::size_t)(h - entity)}, + false); + + auto replace_entity = [&]() -> void { + auto l = strlen(entity_def->replacement); + memcpy(t, entity_def->replacement, l); + t += l; + }; + + if (entity_def) { + replace_entity(); + return true; + } + else { + /* Try heuristic */ + auto heuristic_lookup_func = [&](std::size_t lookup_len) -> bool { + if (!entity_def && h - e > lookup_len) { + entity_def = html_entities_defs.by_name({entity, lookup_len}, true); + + if (entity_def) { + replace_entity(); + /* Adjust h back */ + h = e + lookup_len; + + return true; + } + + entity_def = nullptr; + } + + return false; + }; + + heuristic_lookup_func(5); + heuristic_lookup_func(4); + heuristic_lookup_func(3); + heuristic_lookup_func(2); + + /* Leave undecoded */ + if (!entity_def && (end - t > h - e)) { + memmove(t, e, h - e); + t += h - e; + } + else if (entity_def) { + return true; + } + } + + return false; + }; + + /* Strtoul works merely for 0 terminated strings, so leave it alone... 
*/ + auto dec_to_int = [](const char *str, std::size_t len) -> std::optional<int> { + int n = 0; + + /* Avoid INT_MIN overflow by moving to negative numbers */ + while (len > 0 && g_ascii_isdigit(*str)) { + n = 10 * n - (*str++ - '0'); + len--; + } + + if (len == 0) { + return -(n); + } + else { + return std::nullopt; + } + }; + auto hex_to_int = [](const char *str, std::size_t len) -> std::optional<int> { + int n = 0; + + /* Avoid INT_MIN overflow by moving to negative numbers */ + while (len > 0 && g_ascii_isxdigit(*str)) { + if (*str <= 0x39) { + n = 16 * n - (*str++ - '0'); + } + else { + n = 16 * n - (((*str++) | ' ') - 'a' + 10); + } + len--; + } + + if (len == 0) { + return -(n); + } + else { + return std::nullopt; + } + }; + auto oct_to_int = [](const char *str, std::size_t len) -> std::optional<int> { + int n = 0; + + /* Avoid INT_MIN overflow by moving to negative numbers */ + while (len > 0 && g_ascii_isdigit(*str)) { + if (*str > '7') { + break; + } + else { + n = 8 * n - (*str++ - '0'); + } + len--; + } + + if (len == 0) { + return -(n); + } + else { + return std::nullopt; + } + }; + + auto replace_numeric_entity = [&](const char *entity) -> bool { + UChar32 uc; + std::optional<int> maybe_num; + + if (*entity == 'x' || *entity == 'X') { + maybe_num = hex_to_int(entity + 1, h - (entity + 1)); + } + else if (*entity == 'o' || *entity == 'O') { + maybe_num = oct_to_int(entity + 1, h - (entity + 1)); + } + else { + maybe_num = dec_to_int(entity, h - entity); + } + + if (!maybe_num) { + /* Skip undecoded */ + if (end - t >= h - e) { + memmove(t, e, h - e); + t += h - e; + } + + return false; + } + else { + uc = maybe_num.value(); + /* Search for a replacement */ + const auto *entity_def = html_entities_defs.by_id(uc); + + if (entity_def) { + auto rep_len = strlen(entity_def->replacement); + + if (end - t >= rep_len) { + memcpy(t, entity_def->replacement, + rep_len); + t += rep_len; + } + + return true; + } + else { + /* Unicode point */ + goffset off = t - 
s; + UBool is_error = 0; + + if (uc > 0) { + U8_APPEND((std::uint8_t *) s, off, len, uc, is_error); + + if (!is_error) { + t = s + off; + } + else if (end - t > 3) { + /* Not printable code point replace with 0xFFFD */ + *t++ = '\357'; + *t++ = '\277'; + *t++ = '\275'; + + return true; + } + } + else if (end - t > 3) { + /* Not printable code point replace with 0xFFFD */ + *t++ = '\357'; + *t++ = '\277'; + *t++ = '\275'; + } + } + + return true; + } + + return false; + }; + + auto replace_entity = [&]() -> bool { + if (e + 1 < end) { + const auto *entity_start = e + 1; + + if (*entity_start != '#') { + return replace_named_entity(entity_start, (h - entity_start)); + } + else if (entity_start + 1 < h) { + return replace_numeric_entity(entity_start + 1); + } + } + + return false; + }; + + if (norm_spaces && g_ascii_isspace(*h)) { + state = parser_state::skip_start_spaces; + } + + while (h - s < len && t <= h) { + switch (state) { + case parser_state::normal_content: + if (*h == '&') { + state = parser_state::ampersand; + seen_hash = false; + seen_hex = false; + seen_digit_only = do_undefined; + e = h; + h++; + continue; + } + else { + if (norm_spaces && g_ascii_isspace(*h)) { + *t++ = ' '; + state = parser_state::skip_multi_spaces; + h++; + } + else { + *t++ = *h++; + } + } + break; + case parser_state::ampersand: + if ((*h == ';' || g_ascii_isspace(*h)) && h > e) { + replace_entity(); + state = parser_state::normal_content; + + if (g_ascii_isspace(*h)) { + /* Avoid increase of h */ + continue; + } + } + else if (*h == '&') { + /* Previous `&` was bogus */ + state = parser_state::ampersand; + + if (end - t > h - e) { + memmove(t, e, h - e); + t += h - e; + } + + e = h; + } + else if (*h == '#') { + seen_hash = true; + + if (h + 1 < end && h[1] == 'x') { + seen_hex = true; + /* Skip one more character */ + h++; + } + } + else if (seen_digit_only != do_mixed && + (g_ascii_isdigit(*h) || (seen_hex && g_ascii_isxdigit(*h)))) { + seen_digit_only = do_digits_only; + } + 
else { + if (seen_digit_only == do_digits_only && seen_hash && h > e) { + /* We have seen some digits, so we can try to decode, eh */ + /* Fuck retarded email clients... */ + replace_entity(); + state = parser_state::normal_content; + continue; + } + + seen_digit_only = do_mixed; + } + + h++; + + break; + case parser_state::skip_multi_spaces: + if (g_ascii_isspace(*h)) { + h++; + } + else { + state = parser_state::normal_content; + } + break; + case parser_state::skip_start_spaces: + if (g_ascii_isspace(*h)) { + h++; + } + else { + state = parser_state::normal_content; + } + break; + } + } + + /* Leftover */ + if (state == parser_state::ampersand && h > e) { + /* Unfinished entity, copy as is */ + if (replace_entity()) { + /* To follow FSM semantics */ + h++; + } + else { + h = e; /* Include the last & */ + } + + /* Leftover after replacement */ + if (h < end && t + (end - h) <= end) { + memmove(t, h, end - h); + t += end - h; + } + } + + if (norm_spaces) { + bool seen_spaces = false; + + while (t > s && g_ascii_isspace(*(t - 1))) { + seen_spaces = true; + t--; + } + + if (seen_spaces) { + *t++ = ' '; + } + } + + return (t - s); +} + +auto decode_html_entitles_inplace(std::string &st) -> void +{ + auto nlen = decode_html_entitles_inplace(st.data(), st.size()); + st.resize(nlen); +} + +TEST_SUITE("html entities") +{ + + TEST_CASE("html entities decode") + { + std::vector<std::pair<std::string, std::string>> cases{ + {"", ""}, + {"abc", "abc"}, + {"abc def", "abc def"}, + {"abc def", "abc def"}, + {"abc\ndef", "abc def"}, + {"abc\n \tdef", "abc def"}, + {" abc def ", "abc def "}, + {"FOO>BAR", "FOO>BAR"}, + {"FOO>BAR", "FOO>BAR"}, + {"FOO> BAR", "FOO> BAR"}, + {"FOO>;;BAR", "FOO>;;BAR"}, + {"I'm ¬it;", "I'm ¬it;"}, + {"I'm ∉", "I'm ∉"}, + {"FOO& BAR", "FOO& BAR"}, + {"FOO&&&>BAR", "FOO&&&>BAR"}, + {"FOO)BAR", "FOO)BAR"}, + {"FOOABAR", "FOOABAR"}, + {"FOOABAR", "FOOABAR"}, + {"FOO&#BAR", "FOO&#BAR"}, + {"FOO&#ZOO", "FOO&#ZOO"}, + {"FOOºR", "FOOºR"}, + {"FOO䆺R", 
"FOO䆺R"}, + {"FOO�ZOO", "FOO\uFFFDZOO"}, + {"FOOZOO", "FOO\u0081ZOO"}, + {"FOO�ZOO", "FOO\uFFFDZOO"}, + {"FOO�ZOO", "FOO\uFFFDZOO"}, + {"ZZ£_id=23", "ZZ£_id=23"}, + {"ZZ&prod_id=23", "ZZ&prod_id=23"}, + {"ZZ>", "ZZ>"}, + {"ZZ&", "ZZ&"}, + {"ZZÆ=", "ZZÆ="}, + }; + + for (const auto &c: cases) { + SUBCASE(("decode entities: " + c.first).c_str()) + { + auto *cpy = new char[c.first.size()]; + memcpy(cpy, c.first.data(), c.first.size()); + auto nlen = decode_html_entitles_inplace(cpy, c.first.size(), true); + CHECK(std::string{cpy, nlen} == c.second); + delete[] cpy; + } + } + } +} + +}// namespace rspamd::html
\ No newline at end of file diff --git a/src/libserver/html/html_entities.hxx b/src/libserver/html/html_entities.hxx new file mode 100644 index 0000000..fc1f7cc --- /dev/null +++ b/src/libserver/html/html_entities.hxx @@ -0,0 +1,31 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_ENTITIES_H +#define RSPAMD_HTML_ENTITIES_H +#pragma once + +#include <utility> +#include <string> + +namespace rspamd::html { + +auto decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces = false) -> std::size_t; +auto decode_html_entitles_inplace(std::string &st) -> void; + +}// namespace rspamd::html + +#endif diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx new file mode 100644 index 0000000..309d761 --- /dev/null +++ b/src/libserver/html/html_tag.hxx @@ -0,0 +1,159 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_TAG_HXX +#define RSPAMD_HTML_TAG_HXX +#pragma once + +#include <utility> +#include <string_view> +#include <variant> +#include <vector> +#include <optional> +#include <cstdint> + +#include "html_tags.h" + +struct rspamd_url; +struct html_image; + +namespace rspamd::html { + +struct html_content; /* Forward declaration */ + +enum class html_component_type : std::uint8_t { + RSPAMD_HTML_COMPONENT_NAME = 0, + RSPAMD_HTML_COMPONENT_HREF, + RSPAMD_HTML_COMPONENT_COLOR, + RSPAMD_HTML_COMPONENT_BGCOLOR, + RSPAMD_HTML_COMPONENT_STYLE, + RSPAMD_HTML_COMPONENT_CLASS, + RSPAMD_HTML_COMPONENT_WIDTH, + RSPAMD_HTML_COMPONENT_HEIGHT, + RSPAMD_HTML_COMPONENT_SIZE, + RSPAMD_HTML_COMPONENT_REL, + RSPAMD_HTML_COMPONENT_ALT, + RSPAMD_HTML_COMPONENT_ID, + RSPAMD_HTML_COMPONENT_HIDDEN, +}; + +/* Public tags flags */ +/* XML tag */ +#define FL_XML (1u << CM_USER_SHIFT) +/* Fully closed tag (e.g. 
<a attrs />) */ +#define FL_CLOSED (1 << (CM_USER_SHIFT + 1)) +#define FL_BROKEN (1 << (CM_USER_SHIFT + 2)) +#define FL_IGNORE (1 << (CM_USER_SHIFT + 3)) +#define FL_BLOCK (1 << (CM_USER_SHIFT + 4)) +#define FL_HREF (1 << (CM_USER_SHIFT + 5)) +#define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) +#define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) + +/** + * Returns component type from a string + * @param st + * @return + */ +auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>; + +using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>; +struct html_tag_component { + html_component_type type; + std::string_view value; + + html_tag_component(html_component_type type, std::string_view value) + : type(type), value(value) + { + } +}; + +/* Pairing closing tag representation */ +struct html_closing_tag { + int start = -1; + int end = -1; + + auto clear() -> void + { + start = end = -1; + } +}; + +struct html_tag { + unsigned int tag_start = 0; + unsigned int content_offset = 0; + std::uint32_t flags = 0; + std::int32_t id = Tag_UNKNOWN; + html_closing_tag closing; + + std::vector<html_tag_component> components; + + html_tag_extra_t extra; + mutable struct html_block *block = nullptr; + std::vector<struct html_tag *> children; + struct html_tag *parent; + + auto find_component(html_component_type what) const -> std::optional<std::string_view> + { + for (const auto &comp: components) { + if (comp.type == what) { + return comp.value; + } + } + + return std::nullopt; + } + + auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view> + { + if (what) { + return find_component(what.value()); + } + + return std::nullopt; + } + + auto clear(void) -> void + { + id = Tag_UNKNOWN; + tag_start = content_offset = 0; + extra = std::monostate{}; + components.clear(); + flags = 0; + block = nullptr; + children.clear(); + closing.clear(); + } + + constexpr auto 
get_content_length() const -> std::size_t + { + if (flags & (FL_IGNORE | CM_HEAD)) { + return 0; + } + if (closing.start > content_offset) { + return closing.start - content_offset; + } + + return 0; + } + + auto get_content(const struct html_content *hc) const -> std::string_view; +}; + +static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY); + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_TAG_HXX diff --git a/src/libserver/html/html_tag_defs.hxx b/src/libserver/html/html_tag_defs.hxx new file mode 100644 index 0000000..647f7c3 --- /dev/null +++ b/src/libserver/html/html_tag_defs.hxx @@ -0,0 +1,194 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RSPAMD_HTML_TAG_DEFS_HXX +#define RSPAMD_HTML_TAG_DEFS_HXX + +#include "config.h" +#include "html_tags.h" +#include "libutil/cxx/util.hxx" + +#include <string> +#include "contrib/ankerl/unordered_dense.h" + +namespace rspamd::html { + +struct html_tag_def { + std::string name; + tag_id_t id; + guint flags; +}; + +#define TAG_DEF(id, name, flags) \ + html_tag_def \ + { \ + (name), (id), (flags) \ + } + +static const auto html_tag_defs_array = rspamd::array_of( + /* W3C defined elements */ + TAG_DEF(Tag_A, "a", FL_HREF), + TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)), + TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)), + TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)), + TAG_DEF(Tag_APPLET, "applet", (CM_IMG | CM_INLINE | CM_PARAM)), + TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)), + TAG_DEF(Tag_B, "b", (CM_INLINE | FL_BLOCK)), + TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)), + TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)), + TAG_DEF(Tag_BIG, "big", (CM_INLINE)), + TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)), + TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)), + TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_BUTTON, "button", (CM_INLINE | FL_BLOCK)), + TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)), + TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)), + TAG_DEF(Tag_CITE, "cite", (CM_INLINE)), + TAG_DEF(Tag_CODE, "code", (CM_INLINE)), + TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)), + TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)), + TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)), + TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK)), + TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)), + TAG_DEF(Tag_DIR, "dir", (CM_BLOCK)), + TAG_DEF(Tag_DIV, "div", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_DL, "dl", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)), + TAG_DEF(Tag_EM, "em", (CM_INLINE)), + TAG_DEF(Tag_FIELDSET, "fieldset", 
(CM_BLOCK)), + TAG_DEF(Tag_FONT, "font", (FL_BLOCK)), + TAG_DEF(Tag_FORM, "form", (CM_BLOCK | FL_HREF)), + TAG_DEF(Tag_FRAME, "frame", (CM_EMPTY | FL_HREF)), + TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML)), + TAG_DEF(Tag_H1, "h1", (CM_BLOCK)), + TAG_DEF(Tag_H2, "h2", (CM_BLOCK)), + TAG_DEF(Tag_H3, "h3", (CM_BLOCK)), + TAG_DEF(Tag_H4, "h4", (CM_BLOCK)), + TAG_DEF(Tag_H5, "h5", (CM_BLOCK)), + TAG_DEF(Tag_H6, "h6", (CM_BLOCK)), + TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)), + TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)), + TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)), + TAG_DEF(Tag_I, "i", (CM_INLINE)), + TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)), + TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)), + TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)), + TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK)), + TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)), + TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)), + TAG_DEF(Tag_LABEL, "label", (CM_INLINE)), + TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)), + TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_LINK, "link", (CM_EMPTY | FL_HREF)), + TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK)), + TAG_DEF(Tag_MAP, "map", (CM_INLINE | FL_HREF)), + TAG_DEF(Tag_MENU, "menu", (CM_BLOCK)), + TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK)), + TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_RAW)), + TAG_DEF(Tag_OBJECT, "object", (CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)), + TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)), + TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)), + TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)), + TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)), + TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK)), + TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)), + TAG_DEF(Tag_Q, "q", (CM_INLINE)), + TAG_DEF(Tag_RB, 
"rb", (CM_INLINE)), + TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)), + TAG_DEF(Tag_RP, "rp", (CM_INLINE)), + TAG_DEF(Tag_RT, "rt", (CM_INLINE)), + TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)), + TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)), + TAG_DEF(Tag_S, "s", (CM_INLINE)), + TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)), + TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_RAW)), + TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)), + TAG_DEF(Tag_SMALL, "small", (CM_INLINE)), + TAG_DEF(Tag_SPAN, "span", (CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)), + TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)), + TAG_DEF(Tag_STYLE, "style", (CM_HEAD | CM_RAW)), + TAG_DEF(Tag_SUB, "sub", (CM_INLINE)), + TAG_DEF(Tag_SUP, "sup", (CM_INLINE)), + TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT | FL_BLOCK)), + TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)), + TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)), + TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)), + TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)), + TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)), + TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT | FL_BLOCK)), + TAG_DEF(Tag_TT, "tt", (CM_INLINE)), + TAG_DEF(Tag_U, "u", (CM_INLINE)), + TAG_DEF(Tag_UL, "ul", (CM_BLOCK | FL_BLOCK)), + TAG_DEF(Tag_VAR, "var", (CM_INLINE)), + TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK)), + TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY))); + +class html_tags_storage { + ankerl::unordered_dense::map<std::string_view, html_tag_def> tag_by_name; + ankerl::unordered_dense::map<tag_id_t, html_tag_def> tag_by_id; + +public: + html_tags_storage() + { + tag_by_name.reserve(html_tag_defs_array.size()); + tag_by_id.reserve(html_tag_defs_array.size()); + + for (const auto &t: html_tag_defs_array) { + tag_by_name[t.name] = t; + tag_by_id[t.id] = t; + } + } + + auto 
by_name(std::string_view name) const -> const html_tag_def * + { + auto it = tag_by_name.find(name); + + if (it != tag_by_name.end()) { + return &(it->second); + } + + return nullptr; + } + + auto by_id(int id) const -> const html_tag_def * + { + auto it = tag_by_id.find(static_cast<tag_id_t>(id)); + if (it != tag_by_id.end()) { + return &(it->second); + } + + return nullptr; + } + + auto name_by_id_safe(int id) const -> std::string_view + { + auto it = tag_by_id.find(static_cast<tag_id_t>(id)); + if (it != tag_by_id.end()) { + return it->second.name; + } + + return "unknown"; + } +}; + +}// namespace rspamd::html + +#endif//RSPAMD_HTML_TAG_DEFS_HXX diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h new file mode 100644 index 0000000..c186314 --- /dev/null +++ b/src/libserver/html/html_tags.h @@ -0,0 +1,176 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_HTML_TAGS_H_ +#define SRC_LIBSERVER_HTML_TAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Known HTML tags */ +typedef enum { + Tag_UNKNOWN = 0, /**< Unknown tag! 
*/ + Tag_A, /**< A */ + Tag_ABBR, /**< ABBR */ + Tag_ACRONYM, /**< ACRONYM */ + Tag_ADDRESS, /**< ADDRESS */ + Tag_APPLET, /**< APPLET */ + Tag_AREA, /**< AREA */ + Tag_B, /**< B */ + Tag_BASE, /**< BASE */ + Tag_BASEFONT, /**< BASEFONT */ + Tag_BDO, /**< BDO */ + Tag_BIG, /**< BIG */ + Tag_BLOCKQUOTE, /**< BLOCKQUOTE */ + Tag_BODY, /**< BODY */ + Tag_BR, /**< BR */ + Tag_BUTTON, /**< BUTTON */ + Tag_CAPTION, /**< CAPTION */ + Tag_CENTER, /**< CENTER */ + Tag_CITE, /**< CITE */ + Tag_CODE, /**< CODE */ + Tag_COL, /**< COL */ + Tag_COLGROUP, /**< COLGROUP */ + Tag_DD, /**< DD */ + Tag_DEL, /**< DEL */ + Tag_DFN, /**< DFN */ + Tag_DIR, /**< DIR */ + Tag_DIV, /**< DIF */ + Tag_DL, /**< DL */ + Tag_DT, /**< DT */ + Tag_EM, /**< EM */ + Tag_FIELDSET, /**< FIELDSET */ + Tag_FONT, /**< FONT */ + Tag_FORM, /**< FORM */ + Tag_FRAME, /**< FRAME */ + Tag_FRAMESET, /**< FRAMESET */ + Tag_H1, /**< H1 */ + Tag_H2, /**< H2 */ + Tag_H3, /**< H3 */ + Tag_H4, /**< H4 */ + Tag_H5, /**< H5 */ + Tag_H6, /**< H6 */ + Tag_HEAD, /**< HEAD */ + Tag_HR, /**< HR */ + Tag_HTML, /**< HTML */ + Tag_I, /**< I */ + Tag_IFRAME, /**< IFRAME */ + Tag_IMG, /**< IMG */ + Tag_INPUT, /**< INPUT */ + Tag_INS, /**< INS */ + Tag_ISINDEX, /**< ISINDEX */ + Tag_KBD, /**< KBD */ + Tag_KEYGEN, /**< KEYGEN */ + Tag_LABEL, /**< LABEL */ + Tag_LEGEND, /**< LEGEND */ + Tag_LI, /**< LI */ + Tag_LINK, /**< LINK */ + Tag_LISTING, /**< LISTING */ + Tag_MAP, /**< MAP */ + Tag_MENU, /**< MENU */ + Tag_META, /**< META */ + Tag_NOFRAMES, /**< NOFRAMES */ + Tag_NOSCRIPT, /**< NOSCRIPT */ + Tag_OBJECT, /**< OBJECT */ + Tag_OL, /**< OL */ + Tag_OPTGROUP, /**< OPTGROUP */ + Tag_OPTION, /**< OPTION */ + Tag_P, /**< P */ + Tag_PARAM, /**< PARAM */ + Tag_PLAINTEXT, /**< PLAINTEXT */ + Tag_PRE, /**< PRE */ + Tag_Q, /**< Q */ + Tag_RB, /**< RB */ + Tag_RBC, /**< RBC */ + Tag_RP, /**< RP */ + Tag_RT, /**< RT */ + Tag_RTC, /**< RTC */ + Tag_RUBY, /**< RUBY */ + Tag_S, /**< S */ + Tag_SAMP, /**< SAMP */ + Tag_SCRIPT, /**< SCRIPT */ + 
Tag_SELECT, /**< SELECT */ + Tag_SMALL, /**< SMALL */ + Tag_SPAN, /**< SPAN */ + Tag_STRIKE, /**< STRIKE */ + Tag_STRONG, /**< STRONG */ + Tag_STYLE, /**< STYLE */ + Tag_SUB, /**< SUB */ + Tag_SUP, /**< SUP */ + Tag_TABLE, /**< TABLE */ + Tag_TBODY, /**< TBODY */ + Tag_TD, /**< TD */ + Tag_TEXTAREA, /**< TEXTAREA */ + Tag_TFOOT, /**< TFOOT */ + Tag_TH, /**< TH */ + Tag_THEAD, /**< THEAD */ + Tag_TITLE, /**< TITLE */ + Tag_TR, /**< TR */ + Tag_TT, /**< TT */ + Tag_U, /**< U */ + Tag_UL, /**< UL */ + Tag_VAR, /**< VAR */ + Tag_XMP, /**< XMP */ + Tag_NEXTID, /**< NEXTID */ + Tag_MAX, + + N_TAGS = -1 /**< Must be -1 */ +} tag_id_t; + +#define CM_UNKNOWN 0 +/* Elements with no content. Map to HTML specification. */ +#define CM_EMPTY (1 << 0) +/* Elements that appear outside of "BODY". */ +#define CM_HTML (1 << 1) +/* Elements that can appear within HEAD. */ +#define CM_HEAD (1 << 2) +/* HTML "block" elements. */ +#define CM_BLOCK (1 << 3) +/* HTML "inline" elements. */ +#define CM_INLINE (1 << 4) +/* Elements that mark list item ("LI"). */ +#define CM_LIST (1 << 5) +/* Elements that mark definition list item ("DL", "DT"). */ +#define CM_DEFLIST (1 << 6) +/* Elements that can appear inside TABLE. */ +#define CM_TABLE (1 << 7) +/* Used for "THEAD", "TFOOT" or "TBODY". */ +#define CM_ROWGRP (1 << 8) +/* Used for "TD", "TH" */ +#define CM_ROW (1 << 9) +/* Elements whose content must be protected against white space movement. + Includes some elements that can found in forms. */ +#define CM_FIELD (1 << 10) +#define CM_RAW (1 << 11) +/* Elements that allows "PARAM". */ +#define CM_PARAM (1 << 12) +/* Elements with an optional end tag. */ +#define CM_OPT (1 << 13) +/* Elements that use "align" attribute for vertical position. */ +#define CM_IMG (1 << 14) +#define CM_NO_INDENT (1 << 15) +/* Elements that cannot be omitted. 
*/ +#define CM_OMITST (1 << 16) +/* Unique elements */ +#define CM_UNIQUE (1 << 17) + +#define CM_USER_SHIFT (18) + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_HTML_TAGS_H_ */ diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx new file mode 100644 index 0000000..2fe6702 --- /dev/null +++ b/src/libserver/html/html_tests.cxx @@ -0,0 +1,304 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "html.hxx" +#include "libserver/task.h" + +#include <vector> +#include <fmt/core.h> + + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +namespace rspamd::html { + +/* + * Tests part + */ + +TEST_SUITE("html") +{ + TEST_CASE("html parsing") + { + + const std::vector<std::pair<std::string, std::string>> cases{ + {"<html><!DOCTYPE html><body>", "+html;++xml;++body;"}, + {"<html><div><div></div></div></html>", "+html;++div;+++div;"}, + {"<html><div><div></div></html>", "+html;++div;+++div;"}, + {"<html><div><div></div></html></div>", "+html;++div;+++div;"}, + {"<p><p><a></p></a></a>", "+p;++p;+++a;"}, + {"<div><a href=\"http://example.com\"></div></a>", "+div;++a;"}, + /* Broken, as I don't know how the hell this should be really parsed */ + //{"<html><!DOCTYPE html><body><head><body></body></html></body></html>", + // "+html;++xml;++body;+++head;+++body;"} + }; + + rspamd_url_init(NULL); + auto *pool = 
rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + for (const auto &c: cases) { + SUBCASE((std::string("extract tags from: ") + c.first).c_str()) + { + GByteArray *tmp = g_byte_array_sized_new(c.first.size()); + g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr); + CHECK(hc != nullptr); + auto dump = html_debug_structure(*hc); + CHECK(c.second == dump); + g_byte_array_free(tmp, TRUE); + } + } + + rspamd_mempool_delete(pool); + } + + TEST_CASE("html text extraction") + { + using namespace std::string_literals; + const std::vector<std::pair<std::string, std::string>> cases{ + {"test", "test"}, + {"test\0"s, "test\uFFFD"s}, + {"test\0test"s, "test\uFFFDtest"s}, + {"test\0\0test"s, "test\uFFFD\uFFFDtest"s}, + {"test ", "test"}, + {"test foo, bar", "test foo, bar"}, + {"<p>text</p>", "text\n"}, + {"olo<p>text</p>lolo", "olo\ntext\nlolo"}, + {"<div>foo</div><div>bar</div>", "foo\nbar\n"}, + {"<b>foo<i>bar</b>baz</i>", "foobarbaz"}, + {"<b>foo<i>bar</i>baz</b>", "foobarbaz"}, + {"foo<br>baz", "foo\nbaz"}, + {"<a href=https://example.com>test</a>", "test"}, + {"<img alt=test>", "test"}, + {" <body>\n" + " <!-- escape content -->\n" + " a b a > b a < b a & b 'a "a"\n" + " </body>", + R"|(a b a > b a < b a & b 'a "a")|"}, + /* XML tags */ + {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n" + " <!DOCTYPE html\n" + " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n" + " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" + "<body>test</body>", + "test"}, + {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>" + " <body>\n" + " <p><br>\n" + " </p>\n" + " <div class=\"moz-forward-container\"><br>\n" + " <br>\n" + " test</div>" + "</body>", + "\n\n\ntest\n"}, + {"<div>fi<span style=\"FONT-SIZE: 
0px\">le </span>" + "sh<span style=\"FONT-SIZE: 0px\">aring </span></div>", + "fish\n"}, + /* FIXME: broken until rework of css parser */ + //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>" + // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"}, + /* Complex html with bad tags */ + {"<!DOCTYPE html>\n" + "<html lang=\"en\">\n" + " <head>\n" + " <meta charset=\"utf-8\">\n" + " <title>title</title>\n" + " <link rel=\"stylesheet\" href=\"style.css\">\n" + " <script src=\"script.js\"></script>\n" + " </head>\n" + " <body>\n" + " <!-- page content -->\n" + " Hello, world! <b>test</b>\n" + " <p>data<>\n" + " </P>\n" + " <b>stuff</p>?\n" + " </body>\n" + "</html>", + "Hello, world! test \ndata<>\nstuff\n?"}, + {"<p><!--comment-->test</br></hr><br>", "test\n"}, + /* Tables */ + {"<table>\n" + " <tr>\n" + " <th>heada</th>\n" + " <th>headb</th>\n" + " </tr>\n" + " <tr>\n" + " <td>data1</td>\n" + " <td>data2</td>\n" + " </tr>\n" + " </table>", + "heada headb\ndata1 data2\n"}, + /* Invalid closing br and hr + comment */ + {" <body>\n" + " <!-- page content -->\n" + " Hello, world!<br>test</br><br>content</hr>more content<br>\n" + " <div>\n" + " content inside div\n" + " </div>\n" + " </body>", + "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"}, + /* First closing tag */ + {"</head>\n" + "<body>\n" + "<p> Hello. I have some bad news.\n" + "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n" + "</body>\n" + "</html>", + "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"}, + /* Invalid tags */ + {"lol <sht> omg </sht> oh my!\n" + "<name>words words</name> goodbye", + "lol omg oh my! 
words words goodbye"}, + /* Invisible stuff */ + {"<div style=\"color:#555555;font-family:Arial, 'Helvetica Neue', Helvetica, sans-serif;line-height:1.2;padding-top:10px;padding-right:10px;padding-bottom:10px;padding-left:10px;font-style: italic;\">\n" + "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n" + "<span style=\"color:#FFFFFF; \">F</span>Sincerely,</p>\n" + "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n" + "<span style=\"color:#FFFFFF; \">8</span>Sky<span style=\"opacity:1;\"></span>pe<span style=\"color:#FFFFFF; \">F</span>Web<span style=\"color:#FFFFFF; \">F</span></p>\n" + "<span style=\"color:#FFFFFF; \">kreyes</span>\n" + "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n" + " </p>", + " Sincerely,\n Skype Web\n"}, + {"lala<p hidden>fafa</p>", "lala"}, + {"<table style=\"FONT-SIZE: 0px;\"><tbody><tr><td>\n" + "DONKEY\n" + "</td></tr></tbody></table>", + ""}, + /* bgcolor propagation */ + {"<a style=\"display: inline-block; color: #ffffff; background-color: #00aff0;\">\n" + "<span style=\"color: #00aff0;\">F</span>Rev<span style=\"opacity: 1;\"></span></span>ie<span style=\"opacity: 1;\"></span>" + "</span>w<span style=\"color: #00aff0;\">F<span style=\"opacity: 1;\">̹</span></span>", + " Review"}, + {"<td style=\"color:#ffffff\" bgcolor=\"#005595\">\n" + "hello world\n" + "</td>", + "hello world"}, + /* Colors */ + {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>" + "<span>world</span>", + "goodbye cruelworld"}, + /* Font-size propagation */ + {"<p style=\"font-size: 11pt;line-height:22px\">goodbye <span style=\"font-size:0px\">cruel</span>world</p>", + "goodbye world\n"}, + /* Newline before 
tag -> must be space */ + {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>\n" + "<span>world</span>", + "goodbye cruel world"}, + /* Head tag with some stuff */ + {"<html><head><p>oh my god</head><body></body></html>", "oh my god\n"}, + {"<html><head><title>oh my god</head><body></body></html>", ""}, + {"<html><body><html><head>displayed</body></html></body></html>", "displayed"}, + + }; + + rspamd_url_init(NULL); + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + auto replace_newlines = [](std::string &str) { + auto start_pos = 0; + while ((start_pos = str.find("\n", start_pos, 1)) != std::string::npos) { + str.replace(start_pos, 1, "\\n", 2); + start_pos += 2; + } + }; + + auto i = 1; + for (const auto &c: cases) { + SUBCASE((fmt::format("html extraction case {}", i)).c_str()) + { + GByteArray *tmp = g_byte_array_sized_new(c.first.size()); + g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr); + CHECK(hc != nullptr); + replace_newlines(hc->parsed); + auto expected = c.second; + replace_newlines(expected); + CHECK(hc->parsed == expected); + g_byte_array_free(tmp, TRUE); + } + i++; + } + + rspamd_mempool_delete(pool); + } + + TEST_CASE("html urls extraction") + { + using namespace std::string_literals; + const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{ + {"<style></style><a href=\"https://www.example.com\">yolo</a>", + {"https://www.example.com"}, + "yolo"}, + {"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"}, + {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"}, + {"<html>\n" + "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n" + "<body>\n" + "<a 
href=\"https://www.example.com\">hello</a>\n" + "</body>\n" + "</html>", + {"https://www.example.com"}, + "hello"}, + }; + + rspamd_url_init(NULL); + auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "html", 0); + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + auto i = 1; + for (const auto &c: cases) { + SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) + { + GPtrArray *purls = g_ptr_array_new(); + auto input = std::get<0>(c); + GByteArray *tmp = g_byte_array_sized_new(input.size()); + g_byte_array_append(tmp, (const guint8 *) input.data(), input.size()); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr); + CHECK(hc != nullptr); + auto &expected_text = std::get<2>(c); + if (expected_text.has_value()) { + CHECK(hc->parsed == expected_text.value()); + } + const auto &expected_urls = std::get<1>(c); + CHECK(expected_urls.size() == purls->len); + for (auto j = 0; j < expected_urls.size(); ++j) { + auto *url = (rspamd_url *) g_ptr_array_index(purls, j); + CHECK(expected_urls[j] == std::string{url->string, url->urllen}); + } + g_byte_array_free(tmp, TRUE); + g_ptr_array_free(purls, TRUE); + } + ++i; + } + + rspamd_mempool_delete(pool); + } +} + +} /* namespace rspamd::html */ diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx new file mode 100644 index 0000000..8f29f2c --- /dev/null +++ b/src/libserver/html/html_url.cxx @@ -0,0 +1,496 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "html_url.hxx" +#include "libutil/str_util.h" +#include "libserver/url.h" +#include "libserver/logger.h" +#include "rspamd.h" + +#include <unicode/idna.h> + +namespace rspamd::html { + +static auto +rspamd_url_is_subdomain(std::string_view t1, std::string_view t2) -> bool +{ + const auto *p1 = t1.data() + t1.size() - 1; + const auto *p2 = t2.data() + t2.size() - 1; + + /* Skip trailing dots */ + while (p1 > t1.data()) { + if (*p1 != '.') { + break; + } + + p1--; + } + + while (p2 > t2.data()) { + if (*p2 != '.') { + break; + } + + p2--; + } + + while (p1 > t1.data() && p2 > t2.data()) { + if (*p1 != *p2) { + break; + } + + p1--; + p2--; + } + + if (p2 == t2.data()) { + /* p2 can be subdomain of p1 if *p1 is '.' */ + if (p1 != t1.data() && *(p1 - 1) == '.') { + return true; + } + } + else if (p1 == t1.data()) { + if (p2 != t2.data() && *(p2 - 1) == '.') { + return true; + } + } + + return false; +} + + +static auto +get_icu_idna_instance(void) -> auto +{ + auto uc_err = U_ZERO_ERROR; + static auto *udn = icu::IDNA::createUTS46Instance(UIDNA_DEFAULT, uc_err); + + return udn; +} + +static auto +convert_idna_hostname_maybe(rspamd_mempool_t *pool, struct rspamd_url *url, bool use_tld) + -> std::string_view +{ + std::string_view ret = use_tld ? 
std::string_view{rspamd_url_tld_unsafe(url), url->tldlen} : std::string_view{rspamd_url_host_unsafe(url), url->hostlen}; + + /* Handle IDN url's */ + if (ret.size() > 4 && + rspamd_substring_search_caseless(ret.data(), ret.size(), "xn--", 4) != -1) { + + const auto buf_capacity = ret.size() * 2 + 1; + auto *idn_hbuf = (char *) rspamd_mempool_alloc(pool, buf_capacity); + icu::CheckedArrayByteSink byte_sink{idn_hbuf, (int) buf_capacity}; + + /* We need to convert it to the normal value first */ + icu::IDNAInfo info; + auto uc_err = U_ZERO_ERROR; + auto *udn = get_icu_idna_instance(); + udn->nameToUnicodeUTF8(icu::StringPiece(ret.data(), ret.size()), + byte_sink, info, uc_err); + + if (uc_err == U_ZERO_ERROR && !info.hasErrors()) { + /* idn_hbuf is allocated in mempool, so it is safe to use */ + ret = std::string_view{idn_hbuf, (std::size_t) byte_sink.NumberOfBytesWritten()}; + } + else { + msg_err_pool("cannot convert to IDN: %s (0x%xd)", + u_errorName(uc_err), info.getErrors()); + } + } + + return ret; +}; + +constexpr auto sv_equals(std::string_view s1, std::string_view s2) -> auto +{ + return (s1.size() == s2.size()) && + std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(), + [](const auto c1, const auto c2) { + return g_ascii_tolower(c1) == g_ascii_tolower(c2); + }); +} + +constexpr auto +is_transfer_proto(struct rspamd_url *u) -> bool +{ + return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0; +} + +auto html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional<rspamd_url *> +{ + struct rspamd_url *text_url; + std::string_view disp_tok, href_tok; + goffset url_pos; + gchar *url_str = NULL; + + auto sz = text_data.size(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(text_data.data(), &sz); + text_data = std::string_view(trimmed, sz); + + if (text_data.size() > 4 && + rspamd_url_find(pool, text_data.data(), text_data.size(), &url_str, + RSPAMD_URL_FIND_ALL, + 
&url_pos, NULL) && + url_str != nullptr) { + + if (url_pos > 0) { + /* + * We have some url at some offset, so we need to check what is + * at the start of the text + */ + return std::nullopt; + } + + text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); + auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK) { + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + href_url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL; + + /* Check for phishing */ + if (is_transfer_proto(text_url) == is_transfer_proto(href_url)) { + disp_tok = convert_idna_hostname_maybe(pool, text_url, false); + href_tok = convert_idna_hostname_maybe(pool, href_url, false); + + if (!sv_equals(disp_tok, href_tok) && + text_url->tldlen > 0 && href_url->tldlen > 0) { + + /* Apply the same logic for TLD */ + disp_tok = convert_idna_hostname_maybe(pool, text_url, true); + href_tok = convert_idna_hostname_maybe(pool, href_url, true); + + if (!sv_equals(disp_tok, href_tok)) { + /* Check if one url is a subdomain for another */ + + if (!rspamd_url_is_subdomain(disp_tok, href_tok)) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED; + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + + if (href_url->ext == nullptr) { + href_url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext); + } + href_url->ext->linked_url = text_url; + } + } + } + } + + return text_url; + } + else { + /* + * We have found something that looks like an url but it was + * not parsed correctly. 
+ * Sometimes it means an obfuscation attempt, so we have to check + * what's inside of the text + */ + gboolean obfuscation_found = FALSE; + + if (text_data.size() > 4 && g_ascii_strncasecmp(text_data.begin(), "http", 4) == 0 && + rspamd_substring_search(text_data.begin(), text_data.size(), "://", 3) != -1) { + /* Clearly an obfuscation attempt */ + obfuscation_found = TRUE; + } + + msg_info_pool("extract of url '%s' failed: %s; obfuscation detected: %s", + url_str, + rspamd_url_strerror(rc), + obfuscation_found ? "yes" : "no"); + + if (obfuscation_found) { + href_url->flags |= RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED; + } + } + } + + return std::nullopt; +} + +void html_check_displayed_url(rspamd_mempool_t *pool, + GList **exceptions, + void *url_set, + std::string_view visible_part, + goffset href_offset, + struct rspamd_url *url) +{ + struct rspamd_url *displayed_url = nullptr; + struct rspamd_url *turl; + struct rspamd_process_exception *ex; + guint saved_flags = 0; + gsize dlen; + + if (visible_part.empty()) { + /* No displayed url, just some text within <a> tag */ + return; + } + + if (url->ext == nullptr) { + url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext); + } + url->ext->visible_part = rspamd_mempool_alloc_buffer(pool, visible_part.size() + 1); + rspamd_strlcpy(url->ext->visible_part, + visible_part.data(), + visible_part.size() + 1); + dlen = visible_part.size(); + + /* Strip unicode spaces from the start and the end */ + url->ext->visible_part = const_cast<char *>( + rspamd_string_unicode_trim_inplace(url->ext->visible_part, + &dlen)); + auto maybe_url = html_url_is_phished(pool, url, + {url->ext->visible_part, dlen}); + + if (maybe_url) { + url->flags |= saved_flags; + displayed_url = maybe_url.value(); + } + + if (exceptions && displayed_url != nullptr) { + ex = rspamd_mempool_alloc_type(pool, struct rspamd_process_exception); + ex->pos = href_offset; + ex->len = dlen; + ex->type = RSPAMD_EXCEPTION_URL; + ex->ptr = url; + + 
*exceptions = g_list_prepend(*exceptions, ex); + } + + if (displayed_url && url_set) { + turl = rspamd_url_set_add_or_return((khash_t(rspamd_url_hash) *) url_set, displayed_url); + + if (turl != nullptr) { + /* Here, we assume the following: + * if we have a URL in the text part which + * is the same as displayed URL in the + * HTML part, we assume that it is also + * hint only. + */ + if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) { + + /* + * We have the same URL for href and displayed url, so we + * know that this url cannot be both target and display (as + * it breaks logic in many places), so we do not + * propagate html flags + */ + if (!(turl->flags & RSPAMD_URL_FLAG_DISPLAY_URL)) { + turl->flags |= displayed_url->flags; + } + turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; + } + + turl->count++; + } + else { + /* Already inserted by `rspamd_url_set_add_or_return` */ + } + } + + rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen); +} + +auto html_process_url(rspamd_mempool_t *pool, std::string_view &input) + -> std::optional<struct rspamd_url *> +{ + struct rspamd_url *url; + guint saved_flags = 0; + gint rc; + const gchar *s, *prefix = "http://"; + gchar *d; + gsize dlen; + gboolean has_bad_chars = FALSE, no_prefix = FALSE; + static const gchar hexdigests[] = "0123456789abcdef"; + + auto sz = input.length(); + const auto *trimmed = rspamd_string_unicode_trim_inplace(input.data(), &sz); + input = {trimmed, sz}; + + const auto *start = input.data(); + s = start; + dlen = 0; + + for (auto i = 0; i < sz; i++) { + if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { + dlen += 3; + } + else { + dlen++; + } + } + + if (rspamd_substring_search(start, sz, "://", 3) == -1) { + if (sz >= sizeof("mailto:") && + (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 || + memcmp(start, "tel:", sizeof("tel:") - 1) == 0 || + memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) { + /* Exclusion, has valid but 'strange' prefix */ + } + else { + for (auto 
i = 0; i < sz; i++) { + if (!((s[i] & 0x80) || g_ascii_isalnum(s[i]))) { + if (i == 0 && sz > 2 && s[i] == '/' && s[i + 1] == '/') { + prefix = "http:"; + dlen += sizeof("http:") - 1; + no_prefix = TRUE; + } + else if (s[i] == '@') { + /* Likely email prefix */ + prefix = "mailto://"; + dlen += sizeof("mailto://") - 1; + no_prefix = TRUE; + } + else if (s[i] == ':' && i != 0) { + /* Special case */ + no_prefix = FALSE; + } + else { + if (i == 0) { + /* No valid data */ + return std::nullopt; + } + else { + no_prefix = TRUE; + dlen += strlen(prefix); + } + } + + break; + } + } + } + } + + auto *decoded = rspamd_mempool_alloc_buffer(pool, dlen + 1); + d = decoded; + + if (no_prefix) { + gsize plen = strlen(prefix); + memcpy(d, prefix, plen); + d += plen; + } + + /* + * We also need to remove all internal newlines, spaces + * and encode unsafe characters + * Another obfuscation find in the wild was encoding of the SAFE url characters, + * including essential ones + */ + for (auto i = 0; i < sz; i++) { + if (G_UNLIKELY(g_ascii_isspace(s[i]))) { + continue; + } + else if (G_UNLIKELY(((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) { + /* URL encode */ + *d++ = '%'; + *d++ = hexdigests[(s[i] >> 4) & 0xf]; + *d++ = hexdigests[s[i] & 0xf]; + has_bad_chars = TRUE; + } + else if (G_UNLIKELY(s[i] == '%')) { + if (i + 2 < sz) { + auto c1 = s[i + 1]; + auto c2 = s[i + 2]; + + if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) { + auto codepoint = 0; + + if (c1 >= '0' && c1 <= '9') codepoint = c1 - '0'; + else if (c1 >= 'A' && c1 <= 'F') + codepoint = c1 - 'A' + 10; + else if (c1 >= 'a' && c1 <= 'f') + codepoint = c1 - 'a' + 10; + + codepoint <<= 4; + + if (c2 >= '0' && c2 <= '9') codepoint += c2 - '0'; + else if (c2 >= 'A' && c2 <= 'F') + codepoint += c2 - 'A' + 10; + else if (c2 >= 'a' && c2 <= 'f') + codepoint += c2 - 'a' + 10; + + /* Now check for 'interesting' codepoints */ + if (codepoint == '@' || codepoint == ':' || codepoint == '|' || + codepoint == '?' 
|| codepoint == '\\' || codepoint == '/') { + /* Replace it back */ + *d++ = (char) (codepoint & 0xff); + i += 2; + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + else { + *d++ = s[i]; + } + } + + *d = '\0'; + dlen = d - decoded; + + url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); + rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags); + rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); + + /* Filter some completely damaged urls */ + if (rc == URI_ERRNO_OK && url->hostlen > 0 && + !((url->protocol & PROTOCOL_UNKNOWN))) { + url->flags |= saved_flags; + + if (has_bad_chars) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + + if (no_prefix) { + url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + + if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) { + /* Ignore urls with both no schema and no tld */ + return std::nullopt; + } + } + + decoded = url->string; + + input = {decoded, url->urllen}; + + /* Spaces in href usually mean an attempt to obfuscate URL */ + /* See https://github.com/vstakhov/rspamd/issues/593 */ +#if 0 + if (has_spaces) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } +#endif + + return url; + } + + return std::nullopt; +} + +}// namespace rspamd::html
\ No newline at end of file diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx new file mode 100644 index 0000000..46dde6d --- /dev/null +++ b/src/libserver/html/html_url.hxx @@ -0,0 +1,68 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_URL_HXX +#define RSPAMD_HTML_URL_HXX +#pragma once + +#include "libutil/mem_pool.h" + +#include <string_view> +#include <optional> + +struct rspamd_url; /* Forward declaration */ + +namespace rspamd::html { + + +/** + * Checks if an html url is likely phished by some displayed url + * @param pool + * @param href_url + * @param text_data + * @return + */ +auto html_url_is_phished(rspamd_mempool_t *pool, + struct rspamd_url *href_url, + std::string_view text_data) -> std::optional<rspamd_url *>; + +/** + * Check displayed part of the url at specified offset + * @param pool + * @param exceptions + * @param url_set + * @param visible_part + * @param href_offset + * @param url + */ +auto html_check_displayed_url(rspamd_mempool_t *pool, + GList **exceptions, + void *url_set, + std::string_view visible_part, + goffset href_offset, + struct rspamd_url *url) -> void; + +/** + * Process HTML url (e.g. 
for href component) + * @param pool + * @param input may be modified during the process + * @return + */ +auto html_process_url(rspamd_mempool_t *pool, std::string_view &input) + -> std::optional<struct rspamd_url *>; +}// namespace rspamd::html + +#endif//RSPAMD_HTML_URL_HXX
\ No newline at end of file diff --git a/src/libserver/http/http_connection.c b/src/libserver/http/http_connection.c new file mode 100644 index 0000000..5557fbf --- /dev/null +++ b/src/libserver/http/http_connection.c @@ -0,0 +1,2649 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "http_connection.h" +#include "http_private.h" +#include "http_message.h" +#include "utlist.h" +#include "util.h" +#include "printf.h" +#include "logger.h" +#include "ref.h" +#include "ottery.h" +#include "keypair_private.h" +#include "cryptobox.h" +#include "libutil/libev_helper.h" +#include "libserver/ssl_util.h" +#include "libserver/url.h" + +#include "contrib/mumhash/mum.h" +#include "contrib/http-parser/http_parser.h" +#include "unix-std.h" + +#include <openssl/err.h> + +#define ENCRYPTED_VERSION " HTTP/1.0" + +struct _rspamd_http_privbuf { + rspamd_fstring_t *data; + const gchar *zc_buf; + gsize zc_remain; + ref_entry_t ref; +}; + +enum rspamd_http_priv_flags { + RSPAMD_HTTP_CONN_FLAG_ENCRYPTED = 1u << 0u, + RSPAMD_HTTP_CONN_FLAG_NEW_HEADER = 1u << 1u, + RSPAMD_HTTP_CONN_FLAG_RESETED = 1u << 2u, + RSPAMD_HTTP_CONN_FLAG_TOO_LARGE = 1u << 3u, + RSPAMD_HTTP_CONN_FLAG_ENCRYPTION_NEEDED = 1u << 4u, + RSPAMD_HTTP_CONN_FLAG_PROXY = 1u << 5u, + RSPAMD_HTTP_CONN_FLAG_PROXY_REQUEST = 1u << 6u, + RSPAMD_HTTP_CONN_OWN_SOCKET = 1u << 7u, +}; + +#define IS_CONN_ENCRYPTED(c) ((c)->flags & 
RSPAMD_HTTP_CONN_FLAG_ENCRYPTED) +#define IS_CONN_RESETED(c) ((c)->flags & RSPAMD_HTTP_CONN_FLAG_RESETED) + +struct rspamd_http_connection_private { + struct rspamd_http_context *ctx; + struct rspamd_ssl_connection *ssl; + struct _rspamd_http_privbuf *buf; + struct rspamd_keypair_cache *cache; + struct rspamd_cryptobox_pubkey *peer_key; + struct rspamd_cryptobox_keypair *local_key; + struct rspamd_http_header *header; + struct http_parser parser; + struct http_parser_settings parser_cb; + struct rspamd_io_ev ev; + ev_tstamp timeout; + struct rspamd_http_message *msg; + struct iovec *out; + guint outlen; + enum rspamd_http_priv_flags flags; + gsize wr_pos; + gsize wr_total; +}; + +static const rspamd_ftok_t key_header = { + .begin = "Key", + .len = 3}; +static const rspamd_ftok_t date_header = { + .begin = "Date", + .len = 4}; +static const rspamd_ftok_t last_modified_header = { + .begin = "Last-Modified", + .len = 13}; + +static void rspamd_http_event_handler(int fd, short what, gpointer ud); +static void rspamd_http_ssl_err_handler(gpointer ud, GError *err); + + +#define HTTP_ERROR http_error_quark() +GQuark +http_error_quark(void) +{ + return g_quark_from_static_string("http-error-quark"); +} + +static void +rspamd_http_privbuf_dtor(gpointer ud) +{ + struct _rspamd_http_privbuf *p = (struct _rspamd_http_privbuf *) ud; + + if (p->data) { + rspamd_fstring_free(p->data); + } + + g_free(p); +} + +static const gchar * +rspamd_http_code_to_str(gint code) +{ + if (code == 200) { + return "OK"; + } + else if (code == 404) { + return "Not found"; + } + else if (code == 403 || code == 401) { + return "Not authorized"; + } + else if (code >= 400 && code < 500) { + return "Bad request"; + } + else if (code >= 300 && code < 400) { + return "See Other"; + } + else if (code >= 500 && code < 600) { + return "Internal server error"; + } + + return "Unknown error"; +} + +static void +rspamd_http_parse_key(rspamd_ftok_t *data, struct rspamd_http_connection *conn, + struct 
rspamd_http_connection_private *priv) +{ + guchar *decoded_id; + const gchar *eq_pos; + gsize id_len; + struct rspamd_cryptobox_pubkey *pk; + + if (priv->local_key == NULL) { + /* In this case we cannot do anything, e.g. we cannot decrypt payload */ + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_ENCRYPTED; + } + else { + /* Check sanity of what we have */ + eq_pos = memchr(data->begin, '=', data->len); + if (eq_pos != NULL) { + decoded_id = rspamd_decode_base32(data->begin, eq_pos - data->begin, + &id_len, RSPAMD_BASE32_DEFAULT); + + if (decoded_id != NULL && id_len >= RSPAMD_KEYPAIR_SHORT_ID_LEN) { + pk = rspamd_pubkey_from_base32(eq_pos + 1, + data->begin + data->len - eq_pos - 1, + RSPAMD_KEYPAIR_KEX, + RSPAMD_CRYPTOBOX_MODE_25519); + if (pk != NULL) { + if (memcmp(rspamd_keypair_get_id(priv->local_key), + decoded_id, + RSPAMD_KEYPAIR_SHORT_ID_LEN) == 0) { + priv->msg->peer_key = pk; + + if (priv->cache && priv->msg->peer_key) { + rspamd_keypair_cache_process(priv->cache, + priv->local_key, + priv->msg->peer_key); + } + } + else { + rspamd_pubkey_unref(pk); + } + } + } + + priv->flags |= RSPAMD_HTTP_CONN_FLAG_ENCRYPTED; + g_free(decoded_id); + } + } +} + +static inline void +rspamd_http_check_special_header(struct rspamd_http_connection *conn, + struct rspamd_http_connection_private *priv) +{ + if (rspamd_ftok_casecmp(&priv->header->name, &date_header) == 0) { + priv->msg->date = rspamd_http_parse_date(priv->header->value.begin, + priv->header->value.len); + } + else if (rspamd_ftok_casecmp(&priv->header->name, &key_header) == 0) { + rspamd_http_parse_key(&priv->header->value, conn, priv); + } + else if (rspamd_ftok_casecmp(&priv->header->name, &last_modified_header) == 0) { + priv->msg->last_modified = rspamd_http_parse_date( + priv->header->value.begin, + priv->header->value.len); + } +} + +static gint +rspamd_http_on_url(http_parser *parser, const gchar *at, size_t length) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + 
struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + priv->msg->url = rspamd_fstring_append(priv->msg->url, at, length); + + return 0; +} + +static gint +rspamd_http_on_status(http_parser *parser, const gchar *at, size_t length) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + if (parser->status_code != 200) { + if (priv->msg->status == NULL) { + priv->msg->status = rspamd_fstring_new(); + } + + priv->msg->status = rspamd_fstring_append(priv->msg->status, at, length); + } + + return 0; +} + +static void +rspamd_http_finish_header(struct rspamd_http_connection *conn, + struct rspamd_http_connection_private *priv) +{ + struct rspamd_http_header *hdr; + khiter_t k; + gint r; + + priv->header->combined = rspamd_fstring_append(priv->header->combined, + "\r\n", 2); + priv->header->value.len = priv->header->combined->len - + priv->header->name.len - 4; + priv->header->value.begin = priv->header->combined->str + + priv->header->name.len + 2; + priv->header->name.begin = priv->header->combined->str; + + k = kh_put(rspamd_http_headers_hash, priv->msg->headers, &priv->header->name, + &r); + + if (r != 0) { + kh_value(priv->msg->headers, k) = priv->header; + hdr = NULL; + } + else { + hdr = kh_value(priv->msg->headers, k); + } + + DL_APPEND(hdr, priv->header); + + rspamd_http_check_special_header(conn, priv); +} + +static void +rspamd_http_init_header(struct rspamd_http_connection_private *priv) +{ + priv->header = g_malloc0(sizeof(struct rspamd_http_header)); + priv->header->combined = rspamd_fstring_new(); +} + +static gint +rspamd_http_on_header_field(http_parser *parser, + const gchar *at, + size_t length) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + if (priv->header == NULL) { + rspamd_http_init_header(priv); + } + 
else if (priv->flags & RSPAMD_HTTP_CONN_FLAG_NEW_HEADER) { + rspamd_http_finish_header(conn, priv); + rspamd_http_init_header(priv); + } + + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_NEW_HEADER; + priv->header->combined = rspamd_fstring_append(priv->header->combined, + at, length); + + return 0; +} + +static gint +rspamd_http_on_header_value(http_parser *parser, + const gchar *at, + size_t length) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + if (priv->header == NULL) { + /* Should not happen */ + return -1; + } + + if (!(priv->flags & RSPAMD_HTTP_CONN_FLAG_NEW_HEADER)) { + priv->flags |= RSPAMD_HTTP_CONN_FLAG_NEW_HEADER; + priv->header->combined = rspamd_fstring_append(priv->header->combined, + ": ", 2); + priv->header->name.len = priv->header->combined->len - 2; + } + + priv->header->combined = rspamd_fstring_append(priv->header->combined, + at, length); + + return 0; +} + +static int +rspamd_http_on_headers_complete(http_parser *parser) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + struct rspamd_http_message *msg; + int ret; + + priv = conn->priv; + msg = priv->msg; + + if (priv->header != NULL) { + rspamd_http_finish_header(conn, priv); + + priv->header = NULL; + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_NEW_HEADER; + } + + if (msg->method == HTTP_HEAD) { + /* We don't care about the rest */ + rspamd_ev_watcher_stop(priv->ctx->event_loop, &priv->ev); + + msg->code = parser->status_code; + rspamd_http_connection_ref(conn); + ret = conn->finish_handler(conn, msg); + + if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) { + rspamd_http_context_push_keepalive(conn->priv->ctx, conn, + msg, conn->priv->ctx->event_loop); + rspamd_http_connection_reset(conn); + } + else { + conn->finished = TRUE; + } + + rspamd_http_connection_unref(conn); + + return ret; + } + + /* + 
* HTTP parser sets content length to (-1) when it doesn't know the real + * length, for example, in case of chunked encoding. + * + * Hence, we skip body setup here + */ + if (parser->content_length != ULLONG_MAX && parser->content_length != 0 && + msg->method != HTTP_HEAD) { + if (conn->max_size > 0 && + parser->content_length > conn->max_size) { + /* Too large message */ + priv->flags |= RSPAMD_HTTP_CONN_FLAG_TOO_LARGE; + return -1; + } + + if (!rspamd_http_message_set_body(msg, NULL, parser->content_length)) { + return -1; + } + } + + if (parser->flags & F_SPAMC) { + msg->flags |= RSPAMD_HTTP_FLAG_SPAMC; + } + + + msg->method = parser->method; + msg->code = parser->status_code; + + return 0; +} + +static void +rspamd_http_switch_zc(struct _rspamd_http_privbuf *pbuf, + struct rspamd_http_message *msg) +{ + pbuf->zc_buf = msg->body_buf.begin + msg->body_buf.len; + pbuf->zc_remain = msg->body_buf.allocated_len - msg->body_buf.len; +} + +static int +rspamd_http_on_body(http_parser *parser, const gchar *at, size_t length) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + struct rspamd_http_message *msg; + struct _rspamd_http_privbuf *pbuf; + const gchar *p; + + priv = conn->priv; + msg = priv->msg; + pbuf = priv->buf; + p = at; + + if (!(msg->flags & RSPAMD_HTTP_FLAG_HAS_BODY)) { + if (!rspamd_http_message_set_body(msg, NULL, parser->content_length)) { + return -1; + } + } + + if (conn->finished) { + return 0; + } + + if (conn->max_size > 0 && + msg->body_buf.len + length > conn->max_size) { + /* Body length overflow */ + priv->flags |= RSPAMD_HTTP_CONN_FLAG_TOO_LARGE; + return -1; + } + + if (!pbuf->zc_buf) { + if (!rspamd_http_message_append_body(msg, at, length)) { + return -1; + } + + /* We might have some leftover in our private buffer */ + if (pbuf->data->len == length) { + /* Switch to zero-copy mode */ + rspamd_http_switch_zc(pbuf, msg); + } + } + else { + if 
(msg->body_buf.begin + msg->body_buf.len != at) { + /* Likely chunked encoding */ + memmove((gchar *) msg->body_buf.begin + msg->body_buf.len, at, length); + p = msg->body_buf.begin + msg->body_buf.len; + } + + /* Adjust zero-copy buf */ + msg->body_buf.len += length; + + if (!(msg->flags & RSPAMD_HTTP_FLAG_SHMEM)) { + msg->body_buf.c.normal->len += length; + } + + pbuf->zc_buf = msg->body_buf.begin + msg->body_buf.len; + pbuf->zc_remain = msg->body_buf.allocated_len - msg->body_buf.len; + } + + if ((conn->opts & RSPAMD_HTTP_BODY_PARTIAL) && !IS_CONN_ENCRYPTED(priv)) { + /* Incremental update is impossible for encrypted requests so far */ + return (conn->body_handler(conn, msg, p, length)); + } + + return 0; +} + +static int +rspamd_http_on_body_decrypted(http_parser *parser, const gchar *at, size_t length) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + if (priv->header != NULL) { + rspamd_http_finish_header(conn, priv); + priv->header = NULL; + } + + if (conn->finished) { + return 0; + } + + if (priv->msg->body_buf.len == 0) { + + priv->msg->body_buf.begin = at; + priv->msg->method = parser->method; + priv->msg->code = parser->status_code; + } + + priv->msg->body_buf.len += length; + + return 0; +} + +static int +rspamd_http_on_headers_complete_decrypted(http_parser *parser) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + struct rspamd_http_message *msg; + int ret; + + priv = conn->priv; + msg = priv->msg; + + if (priv->header != NULL) { + rspamd_http_finish_header(conn, priv); + + priv->header = NULL; + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_NEW_HEADER; + } + + if (parser->flags & F_SPAMC) { + priv->msg->flags |= RSPAMD_HTTP_FLAG_SPAMC; + } + + if (msg->method == HTTP_HEAD) { + /* We don't care about the rest */ + 
rspamd_ev_watcher_stop(priv->ctx->event_loop, &priv->ev); + msg->code = parser->status_code; + rspamd_http_connection_ref(conn); + ret = conn->finish_handler(conn, msg); + + if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) { + rspamd_http_context_push_keepalive(conn->priv->ctx, conn, + msg, conn->priv->ctx->event_loop); + rspamd_http_connection_reset(conn); + } + else { + conn->finished = TRUE; + } + + rspamd_http_connection_unref(conn); + + return ret; + } + + priv->msg->method = parser->method; + priv->msg->code = parser->status_code; + + return 0; +} + +static int +rspamd_http_decrypt_message(struct rspamd_http_connection *conn, + struct rspamd_http_connection_private *priv, + struct rspamd_cryptobox_pubkey *peer_key) +{ + guchar *nonce, *m; + const guchar *nm; + gsize dec_len; + struct rspamd_http_message *msg = priv->msg; + struct rspamd_http_header *hdr, *hcur, *hcurtmp; + struct http_parser decrypted_parser; + struct http_parser_settings decrypted_cb; + enum rspamd_cryptobox_mode mode; + + mode = rspamd_keypair_alg(priv->local_key); + nonce = msg->body_buf.str; + m = msg->body_buf.str + rspamd_cryptobox_nonce_bytes(mode) + + rspamd_cryptobox_mac_bytes(mode); + dec_len = msg->body_buf.len - rspamd_cryptobox_nonce_bytes(mode) - + rspamd_cryptobox_mac_bytes(mode); + + if ((nm = rspamd_pubkey_get_nm(peer_key, priv->local_key)) == NULL) { + nm = rspamd_pubkey_calculate_nm(peer_key, priv->local_key); + } + + if (!rspamd_cryptobox_decrypt_nm_inplace(m, dec_len, nonce, + nm, m - rspamd_cryptobox_mac_bytes(mode), mode)) { + msg_err("cannot verify encrypted message, first bytes of the input: %*xs", + (gint) MIN(msg->body_buf.len, 64), msg->body_buf.begin); + return -1; + } + + /* Cleanup message */ + kh_foreach_value (msg->headers, hdr, { + DL_FOREACH_SAFE (hdr, hcur, hcurtmp) { + rspamd_fstring_free (hcur->combined); + g_free (hcur); +} +}); + +kh_destroy(rspamd_http_headers_hash, msg->headers); +msg->headers = kh_init(rspamd_http_headers_hash); + +if (msg->url != 
NULL) { + msg->url = rspamd_fstring_assign(msg->url, "", 0); +} + +msg->body_buf.len = 0; + +memset(&decrypted_parser, 0, sizeof(decrypted_parser)); +http_parser_init(&decrypted_parser, + conn->type == RSPAMD_HTTP_SERVER ? HTTP_REQUEST : HTTP_RESPONSE); + +memset(&decrypted_cb, 0, sizeof(decrypted_cb)); +decrypted_cb.on_url = rspamd_http_on_url; +decrypted_cb.on_status = rspamd_http_on_status; +decrypted_cb.on_header_field = rspamd_http_on_header_field; +decrypted_cb.on_header_value = rspamd_http_on_header_value; +decrypted_cb.on_headers_complete = rspamd_http_on_headers_complete_decrypted; +decrypted_cb.on_body = rspamd_http_on_body_decrypted; +decrypted_parser.data = conn; +decrypted_parser.content_length = dec_len; + +if (http_parser_execute(&decrypted_parser, &decrypted_cb, m, + dec_len) != (size_t) dec_len) { + msg_err("HTTP parser error: %s when parsing encrypted request", + http_errno_description(decrypted_parser.http_errno)); + return -1; +} + +return 0; +} + +static int +rspamd_http_on_message_complete(http_parser *parser) +{ + struct rspamd_http_connection *conn = + (struct rspamd_http_connection *) parser->data; + struct rspamd_http_connection_private *priv; + int ret = 0; + enum rspamd_cryptobox_mode mode; + + if (conn->finished) { + return 0; + } + + priv = conn->priv; + + if ((conn->opts & RSPAMD_HTTP_REQUIRE_ENCRYPTION) && !IS_CONN_ENCRYPTED(priv)) { + priv->flags |= RSPAMD_HTTP_CONN_FLAG_ENCRYPTION_NEEDED; + msg_err("unencrypted connection when encryption has been requested"); + return -1; + } + + if ((conn->opts & RSPAMD_HTTP_BODY_PARTIAL) == 0 && IS_CONN_ENCRYPTED(priv)) { + mode = rspamd_keypair_alg(priv->local_key); + + if (priv->local_key == NULL || priv->msg->peer_key == NULL || + priv->msg->body_buf.len < rspamd_cryptobox_nonce_bytes(mode) + + rspamd_cryptobox_mac_bytes(mode)) { + msg_err("cannot decrypt message"); + return -1; + } + + /* We have keys, so we can decrypt message */ + ret = rspamd_http_decrypt_message(conn, priv, 
priv->msg->peer_key); + + if (ret != 0) { + return ret; + } + + if (conn->body_handler != NULL) { + rspamd_http_connection_ref(conn); + ret = conn->body_handler(conn, + priv->msg, + priv->msg->body_buf.begin, + priv->msg->body_buf.len); + rspamd_http_connection_unref(conn); + } + } + else if ((conn->opts & RSPAMD_HTTP_BODY_PARTIAL) == 0 && conn->body_handler) { + g_assert(conn->body_handler != NULL); + rspamd_http_connection_ref(conn); + ret = conn->body_handler(conn, + priv->msg, + priv->msg->body_buf.begin, + priv->msg->body_buf.len); + rspamd_http_connection_unref(conn); + } + + if (ret == 0) { + rspamd_ev_watcher_stop(priv->ctx->event_loop, &priv->ev); + rspamd_http_connection_ref(conn); + ret = conn->finish_handler(conn, priv->msg); + + if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) { + rspamd_http_context_push_keepalive(conn->priv->ctx, conn, + priv->msg, conn->priv->ctx->event_loop); + rspamd_http_connection_reset(conn); + } + else { + conn->finished = TRUE; + } + + rspamd_http_connection_unref(conn); + } + + return ret; +} + +static void +rspamd_http_simple_client_helper(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv; + gpointer ssl; + gint request_method; + GString *prev_host = NULL; + + priv = conn->priv; + ssl = priv->ssl; + priv->ssl = NULL; + + /* Preserve data */ + if (priv->msg) { + request_method = priv->msg->method; + /* Preserve host for keepalive */ + prev_host = priv->msg->host; + priv->msg->host = NULL; + } + + rspamd_http_connection_reset(conn); + priv->ssl = ssl; + + /* Plan read message */ + + if (conn->opts & RSPAMD_HTTP_CLIENT_SHARED) { + rspamd_http_connection_read_message_shared(conn, conn->ud, + conn->priv->timeout); + } + else { + rspamd_http_connection_read_message(conn, conn->ud, + conn->priv->timeout); + } + + if (priv->msg) { + priv->msg->method = request_method; + priv->msg->host = prev_host; + } + else { + if (prev_host) { + g_string_free(prev_host, TRUE); + } + } +} + +static void 
+rspamd_http_write_helper(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv; + struct iovec *start; + guint niov, i; + gint flags = 0; + gsize remain; + gssize r; + GError *err; + struct iovec *cur_iov; + struct msghdr msg; + + priv = conn->priv; + + if (priv->wr_pos == priv->wr_total) { + goto call_finish_handler; + } + + start = &priv->out[0]; + niov = priv->outlen; + remain = priv->wr_pos; + /* We know that niov is small enough for that */ + if (priv->ssl) { + /* Might be recursive! */ + cur_iov = g_malloc(niov * sizeof(struct iovec)); + } + else { + cur_iov = alloca(niov * sizeof(struct iovec)); + } + memcpy(cur_iov, priv->out, niov * sizeof(struct iovec)); + for (i = 0; i < priv->outlen && remain > 0; i++) { + /* Find out the first iov required */ + start = &cur_iov[i]; + if (start->iov_len <= remain) { + remain -= start->iov_len; + start = &cur_iov[i + 1]; + niov--; + } + else { + start->iov_base = (void *) ((char *) start->iov_base + remain); + start->iov_len -= remain; + remain = 0; + } + } + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = start; + msg.msg_iovlen = MIN(IOV_MAX, niov); + g_assert(niov > 0); +#ifdef MSG_NOSIGNAL + flags = MSG_NOSIGNAL; +#endif + + if (priv->ssl) { + r = rspamd_ssl_writev(priv->ssl, msg.msg_iov, msg.msg_iovlen); + g_free(cur_iov); + } + else { + r = sendmsg(conn->fd, &msg, flags); + } + + if (r == -1) { + if (!priv->ssl) { + err = g_error_new(HTTP_ERROR, 500, "IO write error: %s", strerror(errno)); + rspamd_http_connection_ref(conn); + conn->error_handler(conn, err); + rspamd_http_connection_unref(conn); + g_error_free(err); + } + + return; + } + else { + priv->wr_pos += r; + } + + if (priv->wr_pos >= priv->wr_total) { + goto call_finish_handler; + } + else { + /* Want to write more */ + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_RESETED; + + if (priv->ssl && r > 0) { + /* We can write more data... 
*/ + rspamd_http_write_helper(conn); + return; + } + } + + return; + +call_finish_handler: + rspamd_ev_watcher_stop(priv->ctx->event_loop, &priv->ev); + + if ((conn->opts & RSPAMD_HTTP_CLIENT_SIMPLE) == 0) { + rspamd_http_connection_ref(conn); + conn->finished = TRUE; + conn->finish_handler(conn, priv->msg); + rspamd_http_connection_unref(conn); + } + else { + /* Plan read message */ + rspamd_http_simple_client_helper(conn); + } +} + +static gssize +rspamd_http_try_read(gint fd, + struct rspamd_http_connection *conn, + struct rspamd_http_connection_private *priv, + struct _rspamd_http_privbuf *pbuf, + const gchar **buf_ptr) +{ + gssize r; + gchar *data; + gsize len; + struct rspamd_http_message *msg; + + msg = priv->msg; + + if (pbuf->zc_buf == NULL) { + data = priv->buf->data->str; + len = priv->buf->data->allocated; + } + else { + data = (gchar *) pbuf->zc_buf; + len = pbuf->zc_remain; + + if (len == 0) { + rspamd_http_message_grow_body(priv->msg, priv->buf->data->allocated); + rspamd_http_switch_zc(pbuf, msg); + data = (gchar *) pbuf->zc_buf; + len = pbuf->zc_remain; + } + } + + if (priv->ssl) { + r = rspamd_ssl_read(priv->ssl, data, len); + } + else { + r = read(fd, data, len); + } + + if (r <= 0) { + return r; + } + else { + if (pbuf->zc_buf == NULL) { + priv->buf->data->len = r; + } + else { + pbuf->zc_remain -= r; + pbuf->zc_buf += r; + } + } + + if (buf_ptr) { + *buf_ptr = data; + } + + return r; +} + +static void +rspamd_http_ssl_err_handler(gpointer ud, GError *err) +{ + struct rspamd_http_connection *conn = (struct rspamd_http_connection *) ud; + + rspamd_http_connection_ref(conn); + conn->error_handler(conn, err); + rspamd_http_connection_unref(conn); +} + +static void +rspamd_http_event_handler(int fd, short what, gpointer ud) +{ + struct rspamd_http_connection *conn = (struct rspamd_http_connection *) ud; + struct rspamd_http_connection_private *priv; + struct _rspamd_http_privbuf *pbuf; + const gchar *d; + gssize r; + GError *err; + + priv = 
conn->priv; + pbuf = priv->buf; + REF_RETAIN(pbuf); + rspamd_http_connection_ref(conn); + + if (what == EV_READ) { + r = rspamd_http_try_read(fd, conn, priv, pbuf, &d); + + if (r > 0) { + if (http_parser_execute(&priv->parser, &priv->parser_cb, + d, r) != (size_t) r || + priv->parser.http_errno != 0) { + if (priv->flags & RSPAMD_HTTP_CONN_FLAG_TOO_LARGE) { + err = g_error_new(HTTP_ERROR, 413, + "Request entity too large: %zu", + (size_t) priv->parser.content_length); + } + else if (priv->flags & RSPAMD_HTTP_CONN_FLAG_ENCRYPTION_NEEDED) { + err = g_error_new(HTTP_ERROR, 400, + "Encryption required"); + } + else if (priv->parser.http_errno == HPE_CLOSED_CONNECTION) { + msg_err("got garbage after end of the message, ignore it"); + + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + else { + if (priv->parser.http_errno > HPE_CB_status) { + err = g_error_new(HTTP_ERROR, 400, + "HTTP parser error: %s", + http_errno_description(priv->parser.http_errno)); + } + else { + err = g_error_new(HTTP_ERROR, 500, + "HTTP parser internal error: %s", + http_errno_description(priv->parser.http_errno)); + } + } + + if (!conn->finished) { + conn->error_handler(conn, err); + } + else { + msg_err("got error after HTTP request is finished: %e", err); + } + + g_error_free(err); + + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + } + else if (r == 0) { + /* We can still call http parser */ + http_parser_execute(&priv->parser, &priv->parser_cb, d, r); + + if (!conn->finished) { + err = g_error_new(HTTP_ERROR, + 400, + "IO read error: unexpected EOF"); + conn->error_handler(conn, err); + g_error_free(err); + } + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + else { + if (!priv->ssl) { + err = g_error_new(HTTP_ERROR, + 500, + "HTTP IO read error: %s", + strerror(errno)); + conn->error_handler(conn, err); + g_error_free(err); + } + + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + } + else 
if (what == EV_TIMEOUT) { + if (!priv->ssl) { + /* Let's try to read from the socket first */ + r = rspamd_http_try_read(fd, conn, priv, pbuf, &d); + + if (r > 0) { + if (http_parser_execute(&priv->parser, &priv->parser_cb, + d, r) != (size_t) r || + priv->parser.http_errno != 0) { + err = g_error_new(HTTP_ERROR, 400, + "HTTP parser error: %s", + http_errno_description(priv->parser.http_errno)); + + if (!conn->finished) { + conn->error_handler(conn, err); + } + else { + msg_err("got error after HTTP request is finished: %e", err); + } + + g_error_free(err); + + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + } + else { + err = g_error_new(HTTP_ERROR, 408, + "IO timeout"); + conn->error_handler(conn, err); + g_error_free(err); + + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + } + else { + /* In case of SSL we disable this logic as we already came from SSL handler */ + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); + + return; + } + } + else if (what == EV_WRITE) { + rspamd_http_write_helper(conn); + } + + REF_RELEASE(pbuf); + rspamd_http_connection_unref(conn); +} + +static void +rspamd_http_parser_reset(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv = conn->priv; + + http_parser_init(&priv->parser, + conn->type == RSPAMD_HTTP_SERVER ? 
HTTP_REQUEST : HTTP_RESPONSE); + + priv->parser_cb.on_url = rspamd_http_on_url; + priv->parser_cb.on_status = rspamd_http_on_status; + priv->parser_cb.on_header_field = rspamd_http_on_header_field; + priv->parser_cb.on_header_value = rspamd_http_on_header_value; + priv->parser_cb.on_headers_complete = rspamd_http_on_headers_complete; + priv->parser_cb.on_body = rspamd_http_on_body; + priv->parser_cb.on_message_complete = rspamd_http_on_message_complete; +} + +static struct rspamd_http_connection * +rspamd_http_connection_new_common(struct rspamd_http_context *ctx, + gint fd, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + enum rspamd_http_connection_type type, + enum rspamd_http_priv_flags priv_flags, + struct upstream *proxy_upstream) +{ + struct rspamd_http_connection *conn; + struct rspamd_http_connection_private *priv; + + g_assert(error_handler != NULL && finish_handler != NULL); + + if (ctx == NULL) { + ctx = rspamd_http_context_default(); + } + + conn = g_malloc0(sizeof(struct rspamd_http_connection)); + conn->opts = opts; + conn->type = type; + conn->body_handler = body_handler; + conn->error_handler = error_handler; + conn->finish_handler = finish_handler; + conn->fd = fd; + conn->ref = 1; + conn->finished = FALSE; + + /* Init priv */ + priv = g_malloc0(sizeof(struct rspamd_http_connection_private)); + conn->priv = priv; + priv->ctx = ctx; + priv->flags = priv_flags; + + if (type == RSPAMD_HTTP_SERVER) { + priv->cache = ctx->server_kp_cache; + } + else { + priv->cache = ctx->client_kp_cache; + if (ctx->client_kp) { + priv->local_key = rspamd_keypair_ref(ctx->client_kp); + } + } + + rspamd_http_parser_reset(conn); + priv->parser.data = conn; + + return conn; +} + +struct rspamd_http_connection * +rspamd_http_connection_new_server(struct rspamd_http_context *ctx, + gint fd, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t 
error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts) +{ + return rspamd_http_connection_new_common(ctx, fd, body_handler, + error_handler, finish_handler, opts, RSPAMD_HTTP_SERVER, 0, NULL); +} + +struct rspamd_http_connection * +rspamd_http_connection_new_client_socket(struct rspamd_http_context *ctx, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + gint fd) +{ + return rspamd_http_connection_new_common(ctx, fd, body_handler, + error_handler, finish_handler, opts, RSPAMD_HTTP_CLIENT, 0, NULL); +} + +struct rspamd_http_connection * +rspamd_http_connection_new_client(struct rspamd_http_context *ctx, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + rspamd_inet_addr_t *addr) +{ + gint fd; + + if (ctx == NULL) { + ctx = rspamd_http_context_default(); + } + + if (ctx->http_proxies) { + struct upstream *up = rspamd_upstream_get(ctx->http_proxies, + RSPAMD_UPSTREAM_ROUND_ROBIN, NULL, 0); + + if (up) { + rspamd_inet_addr_t *proxy_addr = rspamd_upstream_addr_next(up); + + fd = rspamd_inet_address_connect(proxy_addr, SOCK_STREAM, TRUE); + + if (fd == -1) { + msg_info("cannot connect to http proxy %s: %s", + rspamd_inet_address_to_string_pretty(proxy_addr), + strerror(errno)); + rspamd_upstream_fail(up, TRUE, strerror(errno)); + + return NULL; + } + + return rspamd_http_connection_new_common(ctx, fd, body_handler, + error_handler, finish_handler, opts, + RSPAMD_HTTP_CLIENT, + RSPAMD_HTTP_CONN_OWN_SOCKET | RSPAMD_HTTP_CONN_FLAG_PROXY, + up); + } + } + + /* Unproxied version */ + fd = rspamd_inet_address_connect(addr, SOCK_STREAM, TRUE); + + if (fd == -1) { + msg_info("cannot connect make http connection to %s: %s", + rspamd_inet_address_to_string_pretty(addr), + strerror(errno)); + + return NULL; + } + + return 
rspamd_http_connection_new_common(ctx, fd, body_handler, + error_handler, finish_handler, opts, + RSPAMD_HTTP_CLIENT, + RSPAMD_HTTP_CONN_OWN_SOCKET, + NULL); +} + +struct rspamd_http_connection * +rspamd_http_connection_new_client_keepalive(struct rspamd_http_context *ctx, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + rspamd_inet_addr_t *addr, + const gchar *host) +{ + struct rspamd_http_connection *conn; + + if (ctx == NULL) { + ctx = rspamd_http_context_default(); + } + + conn = rspamd_http_context_check_keepalive(ctx, addr, host, + opts & RSPAMD_HTTP_CLIENT_SSL); + + if (conn) { + return conn; + } + + conn = rspamd_http_connection_new_client(ctx, + body_handler, error_handler, finish_handler, + opts | RSPAMD_HTTP_CLIENT_SIMPLE | RSPAMD_HTTP_CLIENT_KEEP_ALIVE, + addr); + + if (conn) { + rspamd_http_context_prepare_keepalive(ctx, conn, addr, host, + opts & RSPAMD_HTTP_CLIENT_SSL); + } + + return conn; +} + +void rspamd_http_connection_reset(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv; + struct rspamd_http_message *msg; + + priv = conn->priv; + msg = priv->msg; + + /* Clear request */ + if (msg != NULL) { + if (msg->peer_key) { + priv->peer_key = msg->peer_key; + msg->peer_key = NULL; + } + rspamd_http_message_unref(msg); + priv->msg = NULL; + } + + conn->finished = FALSE; + /* Clear priv */ + rspamd_ev_watcher_stop(priv->ctx->event_loop, &priv->ev); + + if (!(priv->flags & RSPAMD_HTTP_CONN_FLAG_RESETED)) { + rspamd_http_parser_reset(conn); + } + + if (priv->buf != NULL) { + REF_RELEASE(priv->buf); + priv->buf = NULL; + } + + if (priv->out != NULL) { + g_free(priv->out); + priv->out = NULL; + } + + priv->flags |= RSPAMD_HTTP_CONN_FLAG_RESETED; +} + +struct rspamd_http_message * +rspamd_http_connection_steal_msg(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv; + struct 
rspamd_http_message *msg; + + priv = conn->priv; + msg = priv->msg; + + /* Clear request */ + if (msg != NULL) { + if (msg->peer_key) { + priv->peer_key = msg->peer_key; + msg->peer_key = NULL; + } + priv->msg = NULL; + } + + return msg; +} + +struct rspamd_http_message * +rspamd_http_connection_copy_msg(struct rspamd_http_message *msg, GError **err) +{ + struct rspamd_http_message *new_msg; + struct rspamd_http_header *hdr, *nhdr, *nhdrs, *hcur; + const gchar *old_body; + gsize old_len; + struct stat st; + union _rspamd_storage_u *storage; + + new_msg = rspamd_http_new_message(msg->type); + new_msg->flags = msg->flags; + + if (msg->body_buf.len > 0) { + + if (msg->flags & RSPAMD_HTTP_FLAG_SHMEM) { + /* Avoid copying by just mapping a shared segment */ + new_msg->flags |= RSPAMD_HTTP_FLAG_SHMEM_IMMUTABLE; + + storage = &new_msg->body_buf.c; + storage->shared.shm_fd = dup(msg->body_buf.c.shared.shm_fd); + + if (storage->shared.shm_fd == -1) { + rspamd_http_message_unref(new_msg); + g_set_error(err, http_error_quark(), errno, + "cannot dup shmem fd: %d: %s", + msg->body_buf.c.shared.shm_fd, strerror(errno)); + + return NULL; + } + + if (fstat(storage->shared.shm_fd, &st) == -1) { + g_set_error(err, http_error_quark(), errno, + "cannot stat shmem fd: %d: %s", + storage->shared.shm_fd, strerror(errno)); + rspamd_http_message_unref(new_msg); + + return NULL; + } + + /* We don't own segment, so do not try to touch it */ + + if (msg->body_buf.c.shared.name) { + storage->shared.name = msg->body_buf.c.shared.name; + REF_RETAIN(storage->shared.name); + } + + new_msg->body_buf.str = mmap(NULL, st.st_size, + PROT_READ, MAP_SHARED, + storage->shared.shm_fd, 0); + + if (new_msg->body_buf.str == MAP_FAILED) { + g_set_error(err, http_error_quark(), errno, + "cannot mmap shmem fd: %d: %s", + storage->shared.shm_fd, strerror(errno)); + rspamd_http_message_unref(new_msg); + + return NULL; + } + + new_msg->body_buf.begin = new_msg->body_buf.str; + new_msg->body_buf.len = 
msg->body_buf.len; + new_msg->body_buf.begin = new_msg->body_buf.str + + (msg->body_buf.begin - msg->body_buf.str); + } + else { + old_body = rspamd_http_message_get_body(msg, &old_len); + + if (!rspamd_http_message_set_body(new_msg, old_body, old_len)) { + g_set_error(err, http_error_quark(), errno, + "cannot set body for message, length: %zd", + old_len); + rspamd_http_message_unref(new_msg); + + return NULL; + } + } + } + + if (msg->url) { + if (new_msg->url) { + new_msg->url = rspamd_fstring_append(new_msg->url, msg->url->str, + msg->url->len); + } + else { + new_msg->url = rspamd_fstring_new_init(msg->url->str, + msg->url->len); + } + } + + if (msg->host) { + new_msg->host = g_string_new_len(msg->host->str, msg->host->len); + } + + new_msg->method = msg->method; + new_msg->port = msg->port; + new_msg->date = msg->date; + new_msg->last_modified = msg->last_modified; + + kh_foreach_value(msg->headers, hdr, { + nhdrs = NULL; + + DL_FOREACH(hdr, hcur) + { + nhdr = g_malloc(sizeof(struct rspamd_http_header)); + + nhdr->combined = rspamd_fstring_new_init(hcur->combined->str, + hcur->combined->len); + nhdr->name.begin = nhdr->combined->str + + (hcur->name.begin - hcur->combined->str); + nhdr->name.len = hcur->name.len; + nhdr->value.begin = nhdr->combined->str + + (hcur->value.begin - hcur->combined->str); + nhdr->value.len = hcur->value.len; + DL_APPEND(nhdrs, nhdr); + } + + gint r; + khiter_t k = kh_put(rspamd_http_headers_hash, new_msg->headers, + &nhdrs->name, &r); + + if (r != 0) { + kh_value(new_msg->headers, k) = nhdrs; + } + else { + DL_CONCAT(kh_value(new_msg->headers, k), nhdrs); + } + }); + + return new_msg; +} + +void rspamd_http_connection_free(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + if (priv != NULL) { + rspamd_http_connection_reset(conn); + + if (priv->ssl) { + rspamd_ssl_connection_free(priv->ssl); + priv->ssl = NULL; + } + + if (priv->local_key) { + 
rspamd_keypair_unref(priv->local_key); + } + if (priv->peer_key) { + rspamd_pubkey_unref(priv->peer_key); + } + + if (priv->flags & RSPAMD_HTTP_CONN_OWN_SOCKET) { + /* Fd is owned by a connection */ + close(conn->fd); + } + + g_free(priv); + } + + g_free(conn); +} + +static void +rspamd_http_connection_read_message_common(struct rspamd_http_connection *conn, + gpointer ud, ev_tstamp timeout, + gint flags) +{ + struct rspamd_http_connection_private *priv = conn->priv; + struct rspamd_http_message *req; + + conn->ud = ud; + req = rspamd_http_new_message( + conn->type == RSPAMD_HTTP_SERVER ? HTTP_REQUEST : HTTP_RESPONSE); + priv->msg = req; + req->flags = flags; + + if (flags & RSPAMD_HTTP_FLAG_SHMEM) { + req->body_buf.c.shared.shm_fd = -1; + } + + if (priv->peer_key) { + priv->msg->peer_key = priv->peer_key; + priv->peer_key = NULL; + priv->flags |= RSPAMD_HTTP_CONN_FLAG_ENCRYPTED; + } + + priv->timeout = timeout; + priv->header = NULL; + priv->buf = g_malloc0(sizeof(*priv->buf)); + REF_INIT_RETAIN(priv->buf, rspamd_http_privbuf_dtor); + priv->buf->data = rspamd_fstring_sized_new(8192); + priv->flags |= RSPAMD_HTTP_CONN_FLAG_NEW_HEADER; + + if (!priv->ssl) { + rspamd_ev_watcher_init(&priv->ev, conn->fd, EV_READ, + rspamd_http_event_handler, conn); + rspamd_ev_watcher_start(priv->ctx->event_loop, &priv->ev, priv->timeout); + } + else { + rspamd_ssl_connection_restore_handlers(priv->ssl, + rspamd_http_event_handler, + rspamd_http_ssl_err_handler, + conn, + EV_READ); + } + + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_RESETED; +} + +void rspamd_http_connection_read_message(struct rspamd_http_connection *conn, + gpointer ud, ev_tstamp timeout) +{ + rspamd_http_connection_read_message_common(conn, ud, timeout, 0); +} + +void rspamd_http_connection_read_message_shared(struct rspamd_http_connection *conn, + gpointer ud, ev_tstamp timeout) +{ + rspamd_http_connection_read_message_common(conn, ud, timeout, + RSPAMD_HTTP_FLAG_SHMEM); +} + +static void 
+rspamd_http_connection_encrypt_message( + struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + struct rspamd_http_connection_private *priv, + guchar *pbody, + guint bodylen, + guchar *pmethod, + guint methodlen, + guint preludelen, + gint hdrcount, + guchar *np, + guchar *mp, + struct rspamd_cryptobox_pubkey *peer_key) +{ + struct rspamd_cryptobox_segment *segments; + guchar *crlfp; + const guchar *nm; + gint i, cnt; + guint outlen; + struct rspamd_http_header *hdr, *hcur; + enum rspamd_cryptobox_mode mode; + + mode = rspamd_keypair_alg(priv->local_key); + crlfp = mp + rspamd_cryptobox_mac_bytes(mode); + + outlen = priv->out[0].iov_len + priv->out[1].iov_len; + /* + * Create segments from the following: + * Method, [URL], CRLF, nheaders, CRLF, body + */ + segments = g_new(struct rspamd_cryptobox_segment, hdrcount + 5); + + segments[0].data = pmethod; + segments[0].len = methodlen; + + if (conn->type != RSPAMD_HTTP_SERVER) { + segments[1].data = msg->url->str; + segments[1].len = msg->url->len; + /* space + HTTP version + crlf */ + segments[2].data = crlfp; + segments[2].len = preludelen - 2; + crlfp += segments[2].len; + i = 3; + } + else { + /* Here we send just CRLF */ + segments[1].data = crlfp; + segments[1].len = 2; + crlfp += segments[1].len; + + i = 2; + } + + + kh_foreach_value (msg->headers, hdr, { + DL_FOREACH (hdr, hcur) { + segments[i].data = hcur->combined->str; + segments[i++].len = hcur->combined->len; +} +}); + +/* crlfp should point now at the second crlf */ +segments[i].data = crlfp; +segments[i++].len = 2; + +if (pbody) { + segments[i].data = pbody; + segments[i++].len = bodylen; +} + +cnt = i; + +if ((nm = rspamd_pubkey_get_nm(peer_key, priv->local_key)) == NULL) { + nm = rspamd_pubkey_calculate_nm(peer_key, priv->local_key); +} + +rspamd_cryptobox_encryptv_nm_inplace(segments, cnt, np, nm, mp, mode); + +/* + * iov[0] = base HTTP request + * iov[1] = CRLF + * iov[2] = nonce + * iov[3] = mac + * iov[4..i] = encrypted HTTP 
request/reply + */ +priv->out[2].iov_base = np; +priv->out[2].iov_len = rspamd_cryptobox_nonce_bytes(mode); +priv->out[3].iov_base = mp; +priv->out[3].iov_len = rspamd_cryptobox_mac_bytes(mode); + +outlen += rspamd_cryptobox_nonce_bytes(mode) + + rspamd_cryptobox_mac_bytes(mode); + +for (i = 0; i < cnt; i++) { + priv->out[i + 4].iov_base = segments[i].data; + priv->out[i + 4].iov_len = segments[i].len; + outlen += segments[i].len; +} + +priv->wr_total = outlen; + +g_free(segments); +} + +static void +rspamd_http_detach_shared(struct rspamd_http_message *msg) +{ + rspamd_fstring_t *cpy_str; + + cpy_str = rspamd_fstring_new_init(msg->body_buf.begin, msg->body_buf.len); + rspamd_http_message_set_body_from_fstring_steal(msg, cpy_str); +} + +gint rspamd_http_message_write_header(const gchar *mime_type, gboolean encrypted, + gchar *repbuf, gsize replen, gsize bodylen, gsize enclen, const gchar *host, + struct rspamd_http_connection *conn, struct rspamd_http_message *msg, + rspamd_fstring_t **buf, + struct rspamd_http_connection_private *priv, + struct rspamd_cryptobox_pubkey *peer_key) +{ + gchar datebuf[64]; + gint meth_len = 0; + const gchar *conn_type = "close"; + + if (conn->type == RSPAMD_HTTP_SERVER) { + /* Format reply */ + if (msg->method < HTTP_SYMBOLS) { + rspamd_ftok_t status; + + rspamd_http_date_format(datebuf, sizeof(datebuf), msg->date); + + if (mime_type == NULL) { + mime_type = + encrypted ? 
"application/octet-stream" : "text/plain"; + } + + if (msg->status == NULL || msg->status->len == 0) { + if (msg->code == 200) { + RSPAMD_FTOK_ASSIGN(&status, "OK"); + } + else if (msg->code == 404) { + RSPAMD_FTOK_ASSIGN(&status, "Not Found"); + } + else if (msg->code == 403) { + RSPAMD_FTOK_ASSIGN(&status, "Forbidden"); + } + else if (msg->code >= 500 && msg->code < 600) { + RSPAMD_FTOK_ASSIGN(&status, "Internal Server Error"); + } + else { + RSPAMD_FTOK_ASSIGN(&status, "Undefined Error"); + } + } + else { + status.begin = msg->status->str; + status.len = msg->status->len; + } + + if (encrypted) { + /* Internal reply (encrypted) */ + if (mime_type) { + meth_len = + rspamd_snprintf(repbuf, replen, + "HTTP/1.1 %d %T\r\n" + "Connection: close\r\n" + "Server: %s\r\n" + "Date: %s\r\n" + "Content-Length: %z\r\n" + "Content-Type: %s", /* NO \r\n at the end ! */ + msg->code, &status, priv->ctx->config.server_hdr, + datebuf, + bodylen, mime_type); + } + else { + meth_len = + rspamd_snprintf(repbuf, replen, + "HTTP/1.1 %d %T\r\n" + "Connection: close\r\n" + "Server: %s\r\n" + "Date: %s\r\n" + "Content-Length: %z", /* NO \r\n at the end ! 
*/ + msg->code, &status, priv->ctx->config.server_hdr, + datebuf, + bodylen); + } + enclen += meth_len; + /* External reply */ + rspamd_printf_fstring(buf, + "HTTP/1.1 200 OK\r\n" + "Connection: close\r\n" + "Server: %s\r\n" + "Date: %s\r\n" + "Content-Length: %z\r\n" + "Content-Type: application/octet-stream\r\n", + priv->ctx->config.server_hdr, + datebuf, enclen); + } + else { + if (mime_type) { + meth_len = + rspamd_printf_fstring(buf, + "HTTP/1.1 %d %T\r\n" + "Connection: close\r\n" + "Server: %s\r\n" + "Date: %s\r\n" + "Content-Length: %z\r\n" + "Content-Type: %s\r\n", + msg->code, &status, priv->ctx->config.server_hdr, + datebuf, + bodylen, mime_type); + } + else { + meth_len = + rspamd_printf_fstring(buf, + "HTTP/1.1 %d %T\r\n" + "Connection: close\r\n" + "Server: %s\r\n" + "Date: %s\r\n" + "Content-Length: %z\r\n", + msg->code, &status, priv->ctx->config.server_hdr, + datebuf, + bodylen); + } + } + } + else { + /* Legacy spamd reply */ + if (msg->flags & RSPAMD_HTTP_FLAG_SPAMC) { + gsize real_bodylen; + goffset eoh_pos; + GString tmp; + + /* Unfortunately, spamc protocol is deadly brain damaged */ + tmp.str = (gchar *) msg->body_buf.begin; + tmp.len = msg->body_buf.len; + + if (rspamd_string_find_eoh(&tmp, &eoh_pos) != -1 && + bodylen > eoh_pos) { + real_bodylen = bodylen - eoh_pos; + } + else { + real_bodylen = bodylen; + } + + rspamd_printf_fstring(buf, "SPAMD/1.1 0 EX_OK\r\n" + "Content-length: %z\r\n", + real_bodylen); + } + else { + rspamd_printf_fstring(buf, "RSPAMD/1.3 0 EX_OK\r\n"); + } + } + } + else { + + /* Client request */ + if (conn->opts & RSPAMD_HTTP_CLIENT_KEEP_ALIVE) { + conn_type = "keep-alive"; + } + + /* Format request */ + enclen += RSPAMD_FSTRING_LEN(msg->url) + + strlen(http_method_str(msg->method)) + 1; + + if (host == NULL && msg->host == NULL) { + /* Fallback to HTTP/1.0 */ + if (encrypted) { + rspamd_printf_fstring(buf, + "%s %s HTTP/1.0\r\n" + "Content-Length: %z\r\n" + "Content-Type: application/octet-stream\r\n" + "Connection: 
%s\r\n", + "POST", + "/post", + enclen, + conn_type); + } + else { + rspamd_printf_fstring(buf, + "%s %V HTTP/1.0\r\n" + "Content-Length: %z\r\n" + "Connection: %s\r\n", + http_method_str(msg->method), + msg->url, + bodylen, + conn_type); + + if (bodylen > 0) { + if (mime_type == NULL) { + mime_type = "text/plain"; + } + + rspamd_printf_fstring(buf, + "Content-Type: %s\r\n", + mime_type); + } + } + } + else { + /* Normal HTTP/1.1 with Host */ + if (host == NULL) { + host = msg->host->str; + } + + if (encrypted) { + /* TODO: Add proxy support to HTTPCrypt */ + if (rspamd_http_message_is_standard_port(msg)) { + rspamd_printf_fstring(buf, + "%s %s HTTP/1.1\r\n" + "Connection: %s\r\n" + "Host: %s\r\n" + "Content-Length: %z\r\n" + "Content-Type: application/octet-stream\r\n", + "POST", + "/post", + conn_type, + host, + enclen); + } + else { + rspamd_printf_fstring(buf, + "%s %s HTTP/1.1\r\n" + "Connection: %s\r\n" + "Host: %s:%d\r\n" + "Content-Length: %z\r\n" + "Content-Type: application/octet-stream\r\n", + "POST", + "/post", + conn_type, + host, + msg->port, + enclen); + } + } + else { + if (conn->priv->flags & RSPAMD_HTTP_CONN_FLAG_PROXY) { + /* Write proxied request */ + if ((msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER)) { + rspamd_printf_fstring(buf, + "%s %s://%s:%d/%V HTTP/1.1\r\n" + "Connection: %s\r\n" + "Content-Length: %z\r\n", + http_method_str(msg->method), + (conn->opts & RSPAMD_HTTP_CLIENT_SSL) ? "https" : "http", + host, + msg->port, + msg->url, + conn_type, + bodylen); + } + else { + if (rspamd_http_message_is_standard_port(msg)) { + rspamd_printf_fstring(buf, + "%s %s://%s:%d/%V HTTP/1.1\r\n" + "Connection: %s\r\n" + "Host: %s\r\n" + "Content-Length: %z\r\n", + http_method_str(msg->method), + (conn->opts & RSPAMD_HTTP_CLIENT_SSL) ? 
"https" : "http", + host, + msg->port, + msg->url, + conn_type, + host, + bodylen); + } + else { + rspamd_printf_fstring(buf, + "%s %s://%s:%d/%V HTTP/1.1\r\n" + "Connection: %s\r\n" + "Host: %s:%d\r\n" + "Content-Length: %z\r\n", + http_method_str(msg->method), + (conn->opts & RSPAMD_HTTP_CLIENT_SSL) ? "https" : "http", + host, + msg->port, + msg->url, + conn_type, + host, + msg->port, + bodylen); + } + } + } + else { + /* Unproxied version */ + if ((msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER)) { + rspamd_printf_fstring(buf, + "%s %V HTTP/1.1\r\n" + "Connection: %s\r\n" + "Content-Length: %z\r\n", + http_method_str(msg->method), + msg->url, + conn_type, + bodylen); + } + else { + if (rspamd_http_message_is_standard_port(msg)) { + rspamd_printf_fstring(buf, + "%s %V HTTP/1.1\r\n" + "Connection: %s\r\n" + "Host: %s\r\n" + "Content-Length: %z\r\n", + http_method_str(msg->method), + msg->url, + conn_type, + host, + bodylen); + } + else { + rspamd_printf_fstring(buf, + "%s %V HTTP/1.1\r\n" + "Connection: %s\r\n" + "Host: %s:%d\r\n" + "Content-Length: %z\r\n", + http_method_str(msg->method), + msg->url, + conn_type, + host, + msg->port, + bodylen); + } + } + } + + if (bodylen > 0) { + if (mime_type != NULL) { + rspamd_printf_fstring(buf, + "Content-Type: %s\r\n", + mime_type); + } + } + } + } + + if (encrypted) { + GString *b32_key, *b32_id; + + b32_key = rspamd_keypair_print(priv->local_key, + RSPAMD_KEYPAIR_PUBKEY | RSPAMD_KEYPAIR_BASE32); + b32_id = rspamd_pubkey_print(peer_key, + RSPAMD_KEYPAIR_ID_SHORT | RSPAMD_KEYPAIR_BASE32); + /* XXX: add some fuzz here */ + rspamd_printf_fstring(&*buf, "Key: %v=%v\r\n", b32_id, b32_key); + g_string_free(b32_key, TRUE); + g_string_free(b32_id, TRUE); + } + } + + return meth_len; +} + +static gboolean +rspamd_http_connection_write_message_common(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *host, + const gchar *mime_type, + gpointer ud, + ev_tstamp timeout, + gboolean allow_shared) +{ + 
struct rspamd_http_connection_private *priv = conn->priv; + struct rspamd_http_header *hdr, *hcur; + gchar repbuf[512], *pbody; + gint i, hdrcount, meth_len = 0, preludelen = 0; + gsize bodylen, enclen = 0; + rspamd_fstring_t *buf; + gboolean encrypted = FALSE; + guchar nonce[rspamd_cryptobox_MAX_NONCEBYTES], mac[rspamd_cryptobox_MAX_MACBYTES]; + guchar *np = NULL, *mp = NULL, *meth_pos = NULL; + struct rspamd_cryptobox_pubkey *peer_key = NULL; + enum rspamd_cryptobox_mode mode; + GError *err; + + conn->ud = ud; + priv->msg = msg; + priv->timeout = timeout; + + priv->header = NULL; + priv->buf = g_malloc0(sizeof(*priv->buf)); + REF_INIT_RETAIN(priv->buf, rspamd_http_privbuf_dtor); + priv->buf->data = rspamd_fstring_sized_new(512); + buf = priv->buf->data; + + if ((msg->flags & RSPAMD_HTTP_FLAG_WANT_SSL) && !(conn->opts & RSPAMD_HTTP_CLIENT_SSL)) { + err = g_error_new(HTTP_ERROR, 400, + "SSL connection requested but not created properly, internal error"); + rspamd_http_connection_ref(conn); + conn->error_handler(conn, err); + rspamd_http_connection_unref(conn); + g_error_free(err); + return FALSE; + } + + if (priv->peer_key && priv->local_key) { + priv->msg->peer_key = priv->peer_key; + priv->peer_key = NULL; + priv->flags |= RSPAMD_HTTP_CONN_FLAG_ENCRYPTED; + } + + if (msg->peer_key != NULL) { + if (priv->local_key == NULL) { + /* Automatically generate a temporary keypair */ + priv->local_key = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX, + RSPAMD_CRYPTOBOX_MODE_25519); + } + + encrypted = TRUE; + + if (priv->cache) { + rspamd_keypair_cache_process(priv->cache, + priv->local_key, priv->msg->peer_key); + } + } + + if (encrypted && (msg->flags & + (RSPAMD_HTTP_FLAG_SHMEM_IMMUTABLE | RSPAMD_HTTP_FLAG_SHMEM))) { + /* We cannot use immutable body to encrypt message in place */ + allow_shared = FALSE; + rspamd_http_detach_shared(msg); + } + + if (allow_shared) { + gchar tmpbuf[64]; + + if (!(msg->flags & RSPAMD_HTTP_FLAG_SHMEM) || + msg->body_buf.c.shared.name == NULL) { + 
allow_shared = FALSE; + } + else { + /* Insert new headers */ + rspamd_http_message_add_header(msg, "Shm", + msg->body_buf.c.shared.name->shm_name); + rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "%d", + (int) (msg->body_buf.begin - msg->body_buf.str)); + rspamd_http_message_add_header(msg, "Shm-Offset", + tmpbuf); + rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "%z", + msg->body_buf.len); + rspamd_http_message_add_header(msg, "Shm-Length", + tmpbuf); + } + } + + if (priv->ctx->config.user_agent && conn->type == RSPAMD_HTTP_CLIENT) { + rspamd_ftok_t srch; + khiter_t k; + gint r; + + RSPAMD_FTOK_ASSIGN(&srch, "User-Agent"); + + k = kh_put(rspamd_http_headers_hash, msg->headers, &srch, &r); + + if (r != 0) { + hdr = g_malloc0(sizeof(struct rspamd_http_header)); + guint vlen = strlen(priv->ctx->config.user_agent); + hdr->combined = rspamd_fstring_sized_new(srch.len + vlen + 4); + rspamd_printf_fstring(&hdr->combined, "%T: %*s\r\n", &srch, vlen, + priv->ctx->config.user_agent); + hdr->name.begin = hdr->combined->str; + hdr->name.len = srch.len; + hdr->value.begin = hdr->combined->str + srch.len + 2; + hdr->value.len = vlen; + hdr->prev = hdr; /* for utlists */ + + kh_value(msg->headers, k) = hdr; + /* as we searched using static buffer */ + kh_key(msg->headers, k) = &hdr->name; + } + } + + if (encrypted) { + mode = rspamd_keypair_alg(priv->local_key); + + if (msg->body_buf.len == 0) { + pbody = NULL; + bodylen = 0; + msg->method = HTTP_GET; + } + else { + pbody = (gchar *) msg->body_buf.begin; + bodylen = msg->body_buf.len; + msg->method = HTTP_POST; + } + + if (conn->type == RSPAMD_HTTP_SERVER) { + /* + * iov[0] = base reply + * iov[1] = CRLF + * iov[2] = nonce + * iov[3] = mac + * iov[4] = encrypted reply + * iov[6] = encrypted crlf + * iov[7..n] = encrypted headers + * iov[n + 1] = encrypted crlf + * [iov[n + 2] = encrypted body] + */ + priv->outlen = 7; + enclen = rspamd_cryptobox_nonce_bytes(mode) + + rspamd_cryptobox_mac_bytes(mode) + + 4 + /* 2 * CRLF */ + bodylen; + } + 
else { + /* + * iov[0] = base request + * iov[1] = CRLF + * iov[2] = nonce + * iov[3] = mac + * iov[4] = encrypted method + space + * iov[5] = encrypted url + * iov[7] = encrypted prelude + * iov[8..n] = encrypted headers + * iov[n + 1] = encrypted crlf + * [iov[n + 2] = encrypted body] + */ + priv->outlen = 8; + + if (bodylen > 0) { + if (mime_type != NULL) { + preludelen = rspamd_snprintf(repbuf, sizeof(repbuf), "%s\r\n" + "Content-Length: %z\r\n" + "Content-Type: %s\r\n" + "\r\n", + ENCRYPTED_VERSION, bodylen, + mime_type); + } + else { + preludelen = rspamd_snprintf(repbuf, sizeof(repbuf), "%s\r\n" + "Content-Length: %z\r\n" + "" + "\r\n", + ENCRYPTED_VERSION, bodylen); + } + } + else { + preludelen = rspamd_snprintf(repbuf, sizeof(repbuf), + "%s\r\n\r\n", + ENCRYPTED_VERSION); + } + + enclen = rspamd_cryptobox_nonce_bytes(mode) + + rspamd_cryptobox_mac_bytes(mode) + + preludelen + /* version [content-length] + 2 * CRLF */ + bodylen; + } + + if (bodylen > 0) { + priv->outlen++; + } + } + else { + if (msg->method < HTTP_SYMBOLS) { + if (msg->body_buf.len == 0 || allow_shared) { + pbody = NULL; + bodylen = 0; + priv->outlen = 2; + + if (msg->method == HTTP_INVALID) { + msg->method = HTTP_GET; + } + } + else { + pbody = (gchar *) msg->body_buf.begin; + bodylen = msg->body_buf.len; + priv->outlen = 3; + + if (msg->method == HTTP_INVALID) { + msg->method = HTTP_POST; + } + } + } + else if (msg->body_buf.len > 0) { + allow_shared = FALSE; + pbody = (gchar *) msg->body_buf.begin; + bodylen = msg->body_buf.len; + priv->outlen = 2; + } + else { + /* Invalid body for spamc method */ + abort(); + } + } + + peer_key = msg->peer_key; + + priv->wr_total = bodylen + 2; + + hdrcount = 0; + + if (msg->method < HTTP_SYMBOLS) { + kh_foreach_value (msg->headers, hdr, { + DL_FOREACH (hdr, hcur) { + /* <name: value\r\n> */ + priv->wr_total += hcur->combined->len; + enclen += hcur->combined->len; + priv->outlen ++; + hdrcount ++; + } +}); +} + +/* Allocate iov */ +priv->out = 
g_malloc0(sizeof(struct iovec) * priv->outlen); +priv->wr_pos = 0; + +meth_len = rspamd_http_message_write_header(mime_type, encrypted, + repbuf, sizeof(repbuf), bodylen, enclen, + host, conn, msg, + &buf, priv, peer_key); +priv->wr_total += buf->len; + +/* Setup external request body */ +priv->out[0].iov_base = buf->str; +priv->out[0].iov_len = buf->len; + +/* Buf will be used eventually for encryption */ +if (encrypted) { + gint meth_offset, nonce_offset, mac_offset; + mode = rspamd_keypair_alg(priv->local_key); + + ottery_rand_bytes(nonce, rspamd_cryptobox_nonce_bytes(mode)); + memset(mac, 0, rspamd_cryptobox_mac_bytes(mode)); + meth_offset = buf->len; + + if (conn->type == RSPAMD_HTTP_SERVER) { + buf = rspamd_fstring_append(buf, repbuf, meth_len); + } + else { + meth_len = strlen(http_method_str(msg->method)) + 1; /* + space */ + buf = rspamd_fstring_append(buf, http_method_str(msg->method), + meth_len - 1); + buf = rspamd_fstring_append(buf, " ", 1); + } + + nonce_offset = buf->len; + buf = rspamd_fstring_append(buf, nonce, + rspamd_cryptobox_nonce_bytes(mode)); + mac_offset = buf->len; + buf = rspamd_fstring_append(buf, mac, + rspamd_cryptobox_mac_bytes(mode)); + + /* Need to be encrypted */ + if (conn->type == RSPAMD_HTTP_SERVER) { + buf = rspamd_fstring_append(buf, "\r\n\r\n", 4); + } + else { + buf = rspamd_fstring_append(buf, repbuf, preludelen); + } + + meth_pos = buf->str + meth_offset; + np = buf->str + nonce_offset; + mp = buf->str + mac_offset; +} + +/* During previous writes, buf might be reallocated and changed */ +priv->buf->data = buf; + +if (encrypted) { + /* Finish external HTTP request */ + priv->out[1].iov_base = "\r\n"; + priv->out[1].iov_len = 2; + /* Encrypt the real request */ + rspamd_http_connection_encrypt_message(conn, msg, priv, pbody, bodylen, + meth_pos, meth_len, preludelen, hdrcount, np, mp, peer_key); +} +else { + i = 1; + if (msg->method < HTTP_SYMBOLS) { + kh_foreach_value (msg->headers, hdr, { + DL_FOREACH (hdr, hcur) { + 
priv->out[i].iov_base = hcur->combined->str; + priv->out[i++].iov_len = hcur->combined->len; + } +}); + +priv->out[i].iov_base = "\r\n"; +priv->out[i++].iov_len = 2; +} +else +{ + /* No CRLF for compatibility reply */ + priv->wr_total -= 2; +} + +if (pbody != NULL) { + priv->out[i].iov_base = pbody; + priv->out[i++].iov_len = bodylen; +} +} + +priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_RESETED; + +if ((priv->flags & RSPAMD_HTTP_CONN_FLAG_PROXY) && (conn->opts & RSPAMD_HTTP_CLIENT_SSL)) { + /* We need to disable SSL flag! */ + err = g_error_new(HTTP_ERROR, 400, "cannot use proxy for SSL connections"); + rspamd_http_connection_ref(conn); + conn->error_handler(conn, err); + rspamd_http_connection_unref(conn); + g_error_free(err); + return FALSE; +} + +rspamd_ev_watcher_stop(priv->ctx->event_loop, &priv->ev); + +if (conn->opts & RSPAMD_HTTP_CLIENT_SSL) { + gpointer ssl_ctx = (msg->flags & RSPAMD_HTTP_FLAG_SSL_NOVERIFY) ? priv->ctx->ssl_ctx_noverify : priv->ctx->ssl_ctx; + + if (!ssl_ctx) { + err = g_error_new(HTTP_ERROR, 400, "ssl message requested " + "with no ssl ctx"); + rspamd_http_connection_ref(conn); + conn->error_handler(conn, err); + rspamd_http_connection_unref(conn); + g_error_free(err); + return FALSE; + } + else { + if (!priv->ssl) { + priv->ssl = rspamd_ssl_connection_new(ssl_ctx, priv->ctx->event_loop, + !(msg->flags & RSPAMD_HTTP_FLAG_SSL_NOVERIFY), + conn->log_tag); + g_assert(priv->ssl != NULL); + + if (!rspamd_ssl_connect_fd(priv->ssl, conn->fd, host, &priv->ev, + priv->timeout, rspamd_http_event_handler, + rspamd_http_ssl_err_handler, conn)) { + + err = g_error_new(HTTP_ERROR, 400, + "ssl connection error: ssl error=%s, errno=%s", + ERR_error_string(ERR_get_error(), NULL), + strerror(errno)); + rspamd_http_connection_ref(conn); + conn->error_handler(conn, err); + rspamd_http_connection_unref(conn); + g_error_free(err); + return FALSE; + } + } + else { + /* Just restore SSL handlers */ + rspamd_ssl_connection_restore_handlers(priv->ssl, + 
rspamd_http_event_handler, + rspamd_http_ssl_err_handler, + conn, + EV_WRITE); + } + } +} +else { + rspamd_ev_watcher_init(&priv->ev, conn->fd, EV_WRITE, + rspamd_http_event_handler, conn); + rspamd_ev_watcher_start(priv->ctx->event_loop, &priv->ev, priv->timeout); +} + +return TRUE; +} + +gboolean +rspamd_http_connection_write_message(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *host, + const gchar *mime_type, + gpointer ud, + ev_tstamp timeout) +{ + return rspamd_http_connection_write_message_common(conn, msg, host, mime_type, + ud, timeout, FALSE); +} + +gboolean +rspamd_http_connection_write_message_shared(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *host, + const gchar *mime_type, + gpointer ud, + ev_tstamp timeout) +{ + return rspamd_http_connection_write_message_common(conn, msg, host, mime_type, + ud, timeout, TRUE); +} + + +void rspamd_http_connection_set_max_size(struct rspamd_http_connection *conn, + gsize sz) +{ + conn->max_size = sz; +} + +void rspamd_http_connection_set_key(struct rspamd_http_connection *conn, + struct rspamd_cryptobox_keypair *key) +{ + struct rspamd_http_connection_private *priv = conn->priv; + + g_assert(key != NULL); + priv->local_key = rspamd_keypair_ref(key); +} + +void rspamd_http_connection_own_socket(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv = conn->priv; + + priv->flags |= RSPAMD_HTTP_CONN_OWN_SOCKET; +} + +const struct rspamd_cryptobox_pubkey * +rspamd_http_connection_get_peer_key(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv = conn->priv; + + if (priv->peer_key) { + return priv->peer_key; + } + else if (priv->msg) { + return priv->msg->peer_key; + } + + return NULL; +} + +gboolean +rspamd_http_connection_is_encrypted(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv = conn->priv; + + if (priv->peer_key != NULL) { + return 
TRUE; + } + else if (priv->msg) { + return priv->msg->peer_key != NULL; + } + + return FALSE; +} + +GHashTable * +rspamd_http_message_parse_query(struct rspamd_http_message *msg) +{ + GHashTable *res; + rspamd_fstring_t *key = NULL, *value = NULL; + rspamd_ftok_t *key_tok = NULL, *value_tok = NULL; + const gchar *p, *c, *end; + struct http_parser_url u; + enum { + parse_key, + parse_eqsign, + parse_value, + parse_ampersand + } state = parse_key; + + res = g_hash_table_new_full(rspamd_ftok_icase_hash, + rspamd_ftok_icase_equal, + rspamd_fstring_mapped_ftok_free, + rspamd_fstring_mapped_ftok_free); + + if (msg->url && msg->url->len > 0) { + http_parser_parse_url(msg->url->str, msg->url->len, TRUE, &u); + + if (u.field_set & (1 << UF_QUERY)) { + p = msg->url->str + u.field_data[UF_QUERY].off; + c = p; + end = p + u.field_data[UF_QUERY].len; + + while (p <= end) { + switch (state) { + case parse_key: + if ((p == end || *p == '&') && p > c) { + /* We have a single parameter without a value */ + key = rspamd_fstring_new_init(c, p - c); + key_tok = rspamd_ftok_map(key); + key_tok->len = rspamd_url_decode(key->str, key->str, + key->len); + + value = rspamd_fstring_new_init("", 0); + value_tok = rspamd_ftok_map(value); + + g_hash_table_replace(res, key_tok, value_tok); + state = parse_ampersand; + } + else if (*p == '=' && p > c) { + /* We have something like key=value */ + key = rspamd_fstring_new_init(c, p - c); + key_tok = rspamd_ftok_map(key); + key_tok->len = rspamd_url_decode(key->str, key->str, + key->len); + + state = parse_eqsign; + } + else { + p++; + } + break; + + case parse_eqsign: + if (*p != '=') { + c = p; + state = parse_value; + } + else { + p++; + } + break; + + case parse_value: + if ((p == end || *p == '&') && p >= c) { + g_assert(key != NULL); + if (p > c) { + value = rspamd_fstring_new_init(c, p - c); + value_tok = rspamd_ftok_map(value); + value_tok->len = rspamd_url_decode(value->str, + value->str, + value->len); + /* Detect quotes for value */ + if 
(value_tok->begin[0] == '"') { + memmove(value->str, value->str + 1, + value_tok->len - 1); + value_tok->len--; + } + if (value_tok->begin[value_tok->len - 1] == '"') { + value_tok->len--; + } + } + else { + value = rspamd_fstring_new_init("", 0); + value_tok = rspamd_ftok_map(value); + } + + g_hash_table_replace(res, key_tok, value_tok); + key = value = NULL; + key_tok = value_tok = NULL; + state = parse_ampersand; + } + else { + p++; + } + break; + + case parse_ampersand: + if (p != end && *p != '&') { + c = p; + state = parse_key; + } + else { + p++; + } + break; + } + } + } + + if (state != parse_ampersand && key != NULL) { + rspamd_fstring_free(key); + } + } + + return res; +} + + +struct rspamd_http_message * +rspamd_http_message_ref(struct rspamd_http_message *msg) +{ + REF_RETAIN(msg); + + return msg; +} + +void rspamd_http_message_unref(struct rspamd_http_message *msg) +{ + REF_RELEASE(msg); +} + +void rspamd_http_connection_disable_encryption(struct rspamd_http_connection *conn) +{ + struct rspamd_http_connection_private *priv; + + priv = conn->priv; + + if (priv) { + if (priv->local_key) { + rspamd_keypair_unref(priv->local_key); + } + if (priv->peer_key) { + rspamd_pubkey_unref(priv->peer_key); + } + + priv->local_key = NULL; + priv->peer_key = NULL; + priv->flags &= ~RSPAMD_HTTP_CONN_FLAG_ENCRYPTED; + } +}
\ No newline at end of file diff --git a/src/libserver/http/http_connection.h b/src/libserver/http/http_connection.h new file mode 100644 index 0000000..e98d164 --- /dev/null +++ b/src/libserver/http/http_connection.h @@ -0,0 +1,320 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef HTTP_H_ +#define HTTP_H_ + +/** + * @file http.h + * + * This is an interface for HTTP client and conn. + * This code uses HTTP parser written by Joyent Inc based on nginx code. 
+ */ + +#include "config.h" +#include "http_context.h" +#include "fstring.h" +#include "ref.h" +#include "http_message.h" +#include "http_util.h" +#include "addr.h" + +#include "contrib/libev/ev.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum rspamd_http_connection_type { + RSPAMD_HTTP_SERVER, + RSPAMD_HTTP_CLIENT +}; + +struct rspamd_http_header; +struct rspamd_http_message; +struct rspamd_http_connection_private; +struct rspamd_http_connection; +struct rspamd_http_connection_router; +struct rspamd_http_connection_entry; +struct rspamd_keepalive_hash_key; + +struct rspamd_storage_shmem { + gchar *shm_name; + ref_entry_t ref; +}; + +/** + * Legacy spamc protocol + */ +#define RSPAMD_HTTP_FLAG_SPAMC (1 << 0) +/** + * Store body of the message in a shared memory segment + */ +#define RSPAMD_HTTP_FLAG_SHMEM (1 << 2) +/** + * Store body of the message in an immutable shared memory segment + */ +#define RSPAMD_HTTP_FLAG_SHMEM_IMMUTABLE (1 << 3) +/** + * Body has been set for a message + */ +#define RSPAMD_HTTP_FLAG_HAS_BODY (1 << 5) +/** + * Do not verify server's certificate + */ +#define RSPAMD_HTTP_FLAG_SSL_NOVERIFY (1 << 6) +/** + * Body has been set for a message + */ +#define RSPAMD_HTTP_FLAG_HAS_HOST_HEADER (1 << 7) +/** + * Message is intended for SSL connection + */ +#define RSPAMD_HTTP_FLAG_WANT_SSL (1 << 8) +/** + * Options for HTTP connection + */ +enum rspamd_http_options { + RSPAMD_HTTP_BODY_PARTIAL = 1, /**< Call body handler on all body data portions */ + RSPAMD_HTTP_CLIENT_SIMPLE = 1u << 1, /**< Read HTTP client reply automatically */ + RSPAMD_HTTP_CLIENT_ENCRYPTED = 1u << 2, /**< Encrypt data for client */ + RSPAMD_HTTP_CLIENT_SHARED = 1u << 3, /**< Store reply in shared memory */ + RSPAMD_HTTP_REQUIRE_ENCRYPTION = 1u << 4, + RSPAMD_HTTP_CLIENT_KEEP_ALIVE = 1u << 5, + RSPAMD_HTTP_CLIENT_SSL = 1u << 6u, +}; + +typedef int (*rspamd_http_body_handler_t)(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *chunk, + 
gsize len); + +typedef void (*rspamd_http_error_handler_t)(struct rspamd_http_connection *conn, + GError *err); + +typedef int (*rspamd_http_finish_handler_t)(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg); + +/** + * HTTP connection structure + */ +struct rspamd_http_connection { + struct rspamd_http_connection_private *priv; + rspamd_http_body_handler_t body_handler; + rspamd_http_error_handler_t error_handler; + rspamd_http_finish_handler_t finish_handler; + gpointer ud; + const gchar *log_tag; + /* Used for keepalive */ + struct rspamd_keepalive_hash_key *keepalive_hash_key; + gsize max_size; + unsigned opts; + enum rspamd_http_connection_type type; + gboolean finished; + gint fd; + gint ref; +}; + +/** + * Creates a new HTTP server connection from an opened FD returned by accept function + * @param ctx + * @param fd + * @param body_handler + * @param error_handler + * @param finish_handler + * @param opts + * @return + */ +struct rspamd_http_connection *rspamd_http_connection_new_server( + struct rspamd_http_context *ctx, + gint fd, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts); + +/** + * Creates or reuses a new keepalive client connection identified by hostname and inet_addr + * @param ctx + * @param body_handler + * @param error_handler + * @param finish_handler + * @param addr + * @param host + * @return + */ +struct rspamd_http_connection *rspamd_http_connection_new_client_keepalive( + struct rspamd_http_context *ctx, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + rspamd_inet_addr_t *addr, + const gchar *host); + +/** + * Creates an ordinary connection using the address specified (if proxy is not set) + * @param ctx + * @param body_handler + * @param error_handler + * @param finish_handler + * @param opts + * @param addr 
+ * @return + */ +struct rspamd_http_connection *rspamd_http_connection_new_client( + struct rspamd_http_context *ctx, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + rspamd_inet_addr_t *addr); + +/** + * Creates an ordinary client connection using ready file descriptor (ignores proxy) + * @param ctx + * @param body_handler + * @param error_handler + * @param finish_handler + * @param opts + * @param addr + * @return + */ +struct rspamd_http_connection *rspamd_http_connection_new_client_socket( + struct rspamd_http_context *ctx, + rspamd_http_body_handler_t body_handler, + rspamd_http_error_handler_t error_handler, + rspamd_http_finish_handler_t finish_handler, + unsigned opts, + gint fd); + +/** + * Set key pointed by an opaque pointer + * @param conn connection structure + * @param key opaque key structure + */ +void rspamd_http_connection_set_key(struct rspamd_http_connection *conn, + struct rspamd_cryptobox_keypair *key); + +/** + * Transfer ownership on socket to an HTTP connection + * @param conn + */ +void rspamd_http_connection_own_socket(struct rspamd_http_connection *conn); + +/** + * Get peer's public key + * @param conn connection structure + * @return pubkey structure or NULL + */ +const struct rspamd_cryptobox_pubkey *rspamd_http_connection_get_peer_key( + struct rspamd_http_connection *conn); + +/** + * Returns TRUE if a connection is encrypted + * @param conn + * @return + */ +gboolean rspamd_http_connection_is_encrypted(struct rspamd_http_connection *conn); + +/** + * Handle a request using socket fd and user data ud + * @param conn connection structure + * @param ud opaque user data + * @param fd fd to read/write + */ +void rspamd_http_connection_read_message( + struct rspamd_http_connection *conn, + gpointer ud, + ev_tstamp timeout); + +void rspamd_http_connection_read_message_shared( + struct rspamd_http_connection *conn, + gpointer ud, + 
ev_tstamp timeout); + +/** + * Send reply using initialised connection + * @param conn connection structure + * @param msg HTTP message + * @param ud opaque user data + * @param fd fd to read/write + */ +gboolean rspamd_http_connection_write_message( + struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *host, + const gchar *mime_type, + gpointer ud, + ev_tstamp timeout); + +gboolean rspamd_http_connection_write_message_shared( + struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + const gchar *host, + const gchar *mime_type, + gpointer ud, + ev_tstamp timeout); + +/** + * Free connection structure + * @param conn + */ +void rspamd_http_connection_free(struct rspamd_http_connection *conn); + +/** + * Increase refcount for a connection + * @param conn + * @return + */ +static inline struct rspamd_http_connection * +rspamd_http_connection_ref(struct rspamd_http_connection *conn) +{ + conn->ref++; + return conn; +} + +/** + * Decrease a refcount for a connection and free it if refcount is equal to zero + * @param conn + */ +static void +rspamd_http_connection_unref(struct rspamd_http_connection *conn) +{ + if (--conn->ref <= 0) { + rspamd_http_connection_free(conn); + } +} + +/** + * Reset connection for a new request + * @param conn + */ +void rspamd_http_connection_reset(struct rspamd_http_connection *conn); + +/** + * Sets global maximum size for HTTP message being processed + * @param sz + */ +void rspamd_http_connection_set_max_size(struct rspamd_http_connection *conn, + gsize sz); + +void rspamd_http_connection_disable_encryption(struct rspamd_http_connection *conn); + +#ifdef __cplusplus +} +#endif + +#endif /* HTTP_H_ */ diff --git a/src/libserver/http/http_context.c b/src/libserver/http/http_context.c new file mode 100644 index 0000000..f08e33b --- /dev/null +++ b/src/libserver/http/http_context.c @@ -0,0 +1,670 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "http_context.h" +#include "http_private.h" +#include "keypair.h" +#include "keypairs_cache.h" +#include "cfg_file.h" +#include "contrib/libottery/ottery.h" +#include "contrib/http-parser/http_parser.h" +#include "ssl_util.h" +#include "rspamd.h" +#include "libev_helper.h" + +INIT_LOG_MODULE(http_context) + +#define msg_debug_http_context(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_http_context_log_id, "http_context", NULL, \ + G_STRFUNC, \ + __VA_ARGS__) + +static struct rspamd_http_context *default_ctx = NULL; + +struct rspamd_http_keepalive_cbdata { + struct rspamd_http_connection *conn; + struct rspamd_http_context *ctx; + GQueue *queue; + GList *link; + struct rspamd_io_ev ev; +}; + +static void +rspamd_http_keepalive_queue_cleanup(GQueue *conns) +{ + GList *cur; + + cur = conns->head; + + while (cur) { + struct rspamd_http_keepalive_cbdata *cbd; + + cbd = (struct rspamd_http_keepalive_cbdata *) cur->data; + /* unref call closes fd, so we need to remove ev watcher first! 
*/ + rspamd_ev_watcher_stop(cbd->ctx->event_loop, &cbd->ev); + rspamd_http_connection_unref(cbd->conn); + g_free(cbd); + + cur = cur->next; + } + + g_queue_clear(conns); +} + +static void +rspamd_http_context_client_rotate_ev(struct ev_loop *loop, ev_timer *w, int revents) +{ + struct rspamd_http_context *ctx = (struct rspamd_http_context *) w->data; + gpointer kp; + + w->repeat = rspamd_time_jitter(ctx->config.client_key_rotate_time, 0); + msg_debug_http_context("rotate local keypair, next rotate in %.0f seconds", + w->repeat); + + ev_timer_again(loop, w); + + kp = ctx->client_kp; + ctx->client_kp = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX, + RSPAMD_CRYPTOBOX_MODE_25519); + rspamd_keypair_unref(kp); +} + +static struct rspamd_http_context * +rspamd_http_context_new_default(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct upstream_ctx *ups_ctx) +{ + struct rspamd_http_context *ctx; + + static const int default_kp_size = 1024; + static const gdouble default_rotate_time = 120; + static const gdouble default_keepalive_interval = 65; + static const gchar *default_user_agent = "rspamd-" RSPAMD_VERSION_FULL; + static const gchar *default_server_hdr = "rspamd/" RSPAMD_VERSION_FULL; + + ctx = g_malloc0(sizeof(*ctx)); + ctx->config.kp_cache_size_client = default_kp_size; + ctx->config.kp_cache_size_server = default_kp_size; + ctx->config.client_key_rotate_time = default_rotate_time; + ctx->config.user_agent = default_user_agent; + ctx->config.keepalive_interval = default_keepalive_interval; + ctx->config.server_hdr = default_server_hdr; + ctx->ups_ctx = ups_ctx; + + if (cfg) { + ctx->ssl_ctx = cfg->libs_ctx->ssl_ctx; + ctx->ssl_ctx_noverify = cfg->libs_ctx->ssl_ctx_noverify; + } + else { + ctx->ssl_ctx = rspamd_init_ssl_ctx(); + ctx->ssl_ctx_noverify = rspamd_init_ssl_ctx_noverify(); + } + + ctx->event_loop = ev_base; + + ctx->keep_alive_hash = kh_init(rspamd_keep_alive_hash); + + return ctx; +} + +static void +rspamd_http_context_parse_proxy(struct 
rspamd_http_context *ctx, + const gchar *name, + struct upstream_list **pls) +{ + struct http_parser_url u; + struct upstream_list *uls; + + if (!ctx->ups_ctx) { + msg_err("cannot parse http_proxy %s - upstreams context is undefined", name); + return; + } + + memset(&u, 0, sizeof(u)); + + if (http_parser_parse_url(name, strlen(name), 1, &u) == 0) { + if (!(u.field_set & (1u << UF_HOST)) || u.port == 0) { + msg_err("cannot parse http(s) proxy %s - invalid host or port", name); + + return; + } + + uls = rspamd_upstreams_create(ctx->ups_ctx); + + if (!rspamd_upstreams_parse_line_len(uls, + name + u.field_data[UF_HOST].off, + u.field_data[UF_HOST].len, u.port, NULL)) { + msg_err("cannot parse http(s) proxy %s - invalid data", name); + + rspamd_upstreams_destroy(uls); + } + else { + *pls = uls; + msg_info("set http(s) proxy to %s", name); + } + } + else { + uls = rspamd_upstreams_create(ctx->ups_ctx); + + if (!rspamd_upstreams_parse_line(uls, + name, 3128, NULL)) { + msg_err("cannot parse http(s) proxy %s - invalid data", name); + + rspamd_upstreams_destroy(uls); + } + else { + *pls = uls; + msg_info("set http(s) proxy to %s", name); + } + } +} + +static void +rspamd_http_context_init(struct rspamd_http_context *ctx) +{ + if (ctx->config.kp_cache_size_client > 0) { + ctx->client_kp_cache = rspamd_keypair_cache_new(ctx->config.kp_cache_size_client); + } + + if (ctx->config.kp_cache_size_server > 0) { + ctx->server_kp_cache = rspamd_keypair_cache_new(ctx->config.kp_cache_size_server); + } + + if (ctx->config.client_key_rotate_time > 0 && ctx->event_loop) { + double jittered = rspamd_time_jitter(ctx->config.client_key_rotate_time, + 0); + + ev_timer_init(&ctx->client_rotate_ev, + rspamd_http_context_client_rotate_ev, jittered, 0); + ev_timer_start(ctx->event_loop, &ctx->client_rotate_ev); + ctx->client_rotate_ev.data = ctx; + } + + if (ctx->config.http_proxy) { + rspamd_http_context_parse_proxy(ctx, ctx->config.http_proxy, + &ctx->http_proxies); + } + + default_ctx = ctx; 
+} + +struct rspamd_http_context * +rspamd_http_context_create(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct upstream_ctx *ups_ctx) +{ + struct rspamd_http_context *ctx; + const ucl_object_t *http_obj; + + ctx = rspamd_http_context_new_default(cfg, ev_base, ups_ctx); + http_obj = ucl_object_lookup(cfg->cfg_ucl_obj, "http"); + + if (http_obj) { + const ucl_object_t *server_obj, *client_obj; + + client_obj = ucl_object_lookup(http_obj, "client"); + + if (client_obj) { + const ucl_object_t *kp_size; + + kp_size = ucl_object_lookup(client_obj, "cache_size"); + + if (kp_size) { + ctx->config.kp_cache_size_client = ucl_object_toint(kp_size); + } + + const ucl_object_t *rotate_time; + + rotate_time = ucl_object_lookup(client_obj, "rotate_time"); + + if (rotate_time) { + ctx->config.client_key_rotate_time = ucl_object_todouble(rotate_time); + } + + const ucl_object_t *user_agent; + + user_agent = ucl_object_lookup(client_obj, "user_agent"); + + if (user_agent) { + ctx->config.user_agent = ucl_object_tostring(user_agent); + + if (ctx->config.user_agent && strlen(ctx->config.user_agent) == 0) { + ctx->config.user_agent = NULL; + } + } + + const ucl_object_t *server_hdr; + server_hdr = ucl_object_lookup(client_obj, "server_hdr"); + + if (server_hdr) { + ctx->config.server_hdr = ucl_object_tostring(server_hdr); + + if (ctx->config.server_hdr && strlen(ctx->config.server_hdr) == 0) { + ctx->config.server_hdr = ""; + } + } + + const ucl_object_t *keepalive_interval; + + keepalive_interval = ucl_object_lookup(client_obj, "keepalive_interval"); + + if (keepalive_interval) { + ctx->config.keepalive_interval = ucl_object_todouble(keepalive_interval); + } + + const ucl_object_t *http_proxy; + http_proxy = ucl_object_lookup(client_obj, "http_proxy"); + + if (http_proxy) { + ctx->config.http_proxy = ucl_object_tostring(http_proxy); + } + } + + server_obj = ucl_object_lookup(http_obj, "server"); + + if (server_obj) { + const ucl_object_t *kp_size; + + kp_size = 
ucl_object_lookup(server_obj, "cache_size"); + + if (kp_size) { + ctx->config.kp_cache_size_server = ucl_object_toint(kp_size); + } + } + } + + rspamd_http_context_init(ctx); + + return ctx; +} + + +void rspamd_http_context_free(struct rspamd_http_context *ctx) +{ + if (ctx == default_ctx) { + default_ctx = NULL; + } + + if (ctx->client_kp_cache) { + rspamd_keypair_cache_destroy(ctx->client_kp_cache); + } + + if (ctx->server_kp_cache) { + rspamd_keypair_cache_destroy(ctx->server_kp_cache); + } + + if (ctx->config.client_key_rotate_time > 0) { + ev_timer_stop(ctx->event_loop, &ctx->client_rotate_ev); + + if (ctx->client_kp) { + rspamd_keypair_unref(ctx->client_kp); + } + } + + struct rspamd_keepalive_hash_key *hk; + + kh_foreach_key(ctx->keep_alive_hash, hk, { + msg_debug_http_context("cleanup keepalive elt %s (%s)", + rspamd_inet_address_to_string_pretty(hk->addr), + hk->host); + + if (hk->host) { + g_free(hk->host); + } + + rspamd_inet_address_free(hk->addr); + rspamd_http_keepalive_queue_cleanup(&hk->conns); + g_free(hk); + }); + + kh_destroy(rspamd_keep_alive_hash, ctx->keep_alive_hash); + + if (ctx->http_proxies) { + rspamd_upstreams_destroy(ctx->http_proxies); + } + + g_free(ctx); +} + +struct rspamd_http_context * +rspamd_http_context_create_config(struct rspamd_http_context_cfg *cfg, + struct ev_loop *ev_base, + struct upstream_ctx *ups_ctx) +{ + struct rspamd_http_context *ctx; + + ctx = rspamd_http_context_new_default(NULL, ev_base, ups_ctx); + memcpy(&ctx->config, cfg, sizeof(*cfg)); + rspamd_http_context_init(ctx); + + return ctx; +} + +struct rspamd_http_context * +rspamd_http_context_default(void) +{ + g_assert(default_ctx != NULL); + + return default_ctx; +} + +gint32 +rspamd_keep_alive_key_hash(struct rspamd_keepalive_hash_key *k) +{ + rspamd_cryptobox_fast_hash_state_t hst; + + rspamd_cryptobox_fast_hash_init(&hst, 0); + + if (k->host) { + rspamd_cryptobox_fast_hash_update(&hst, k->host, strlen(k->host)); + } + + 
rspamd_cryptobox_fast_hash_update(&hst, &k->port, sizeof(k->port)); + rspamd_cryptobox_fast_hash_update(&hst, &k->is_ssl, sizeof(k->is_ssl)); + + return rspamd_cryptobox_fast_hash_final(&hst); +} + +bool rspamd_keep_alive_key_equal(struct rspamd_keepalive_hash_key *k1, + struct rspamd_keepalive_hash_key *k2) +{ + if (k1->is_ssl != k2->is_ssl) { + return false; + } + + if (k1->host && k2->host) { + if (k1->port == k2->port) { + return strcmp(k1->host, k2->host) == 0; + } + } + else if (!k1->host && !k2->host) { + return (k1->port == k2->port); + } + + /* One has host and another has no host */ + return false; +} + +struct rspamd_http_connection * +rspamd_http_context_check_keepalive(struct rspamd_http_context *ctx, + const rspamd_inet_addr_t *addr, + const gchar *host, + bool is_ssl) +{ + struct rspamd_keepalive_hash_key hk, *phk; + khiter_t k; + + if (ctx == NULL) { + ctx = rspamd_http_context_default(); + } + + hk.addr = (rspamd_inet_addr_t *) addr; + hk.host = (gchar *) host; + hk.port = rspamd_inet_address_get_port(addr); + hk.is_ssl = is_ssl; + + k = kh_get(rspamd_keep_alive_hash, ctx->keep_alive_hash, &hk); + + if (k != kh_end(ctx->keep_alive_hash)) { + phk = kh_key(ctx->keep_alive_hash, k); + GQueue *conns = &phk->conns; + + /* Use stack based approach */ + + if (g_queue_get_length(conns) > 0) { + struct rspamd_http_keepalive_cbdata *cbd; + struct rspamd_http_connection *conn; + gint err; + socklen_t len = sizeof(gint); + + cbd = g_queue_pop_head(conns); + rspamd_ev_watcher_stop(ctx->event_loop, &cbd->ev); + conn = cbd->conn; + g_free(cbd); + + if (getsockopt(conn->fd, SOL_SOCKET, SO_ERROR, (void *) &err, &len) == -1) { + err = errno; + } + + if (err != 0) { + rspamd_http_connection_unref(conn); + + msg_debug_http_context("invalid reused keepalive element %s (%s, ssl=%d); " + "%s error; " + "%d connections queued", + rspamd_inet_address_to_string_pretty(phk->addr), + phk->host, + (int) phk->is_ssl, + g_strerror(err), + conns->length); + + return NULL; + } + + 
msg_debug_http_context("reused keepalive element %s (%s, ssl=%d), %d connections queued", + rspamd_inet_address_to_string_pretty(phk->addr), + phk->host, + (int) phk->is_ssl, + conns->length); + + /* We transfer refcount here! */ + return conn; + } + else { + msg_debug_http_context("found empty keepalive element %s (%s), cannot reuse", + rspamd_inet_address_to_string_pretty(phk->addr), + phk->host); + } + } + + return NULL; +} + +const rspamd_inet_addr_t * +rspamd_http_context_has_keepalive(struct rspamd_http_context *ctx, + const gchar *host, + unsigned port, + bool is_ssl) +{ + struct rspamd_keepalive_hash_key hk, *phk; + khiter_t k; + + if (ctx == NULL) { + ctx = rspamd_http_context_default(); + } + + hk.host = (gchar *) host; + hk.port = port; + hk.is_ssl = is_ssl; + + k = kh_get(rspamd_keep_alive_hash, ctx->keep_alive_hash, &hk); + + if (k != kh_end(ctx->keep_alive_hash)) { + phk = kh_key(ctx->keep_alive_hash, k); + GQueue *conns = &phk->conns; + + if (g_queue_get_length(conns) > 0) { + return phk->addr; + } + } + + return NULL; +} + +void rspamd_http_context_prepare_keepalive(struct rspamd_http_context *ctx, + struct rspamd_http_connection *conn, + const rspamd_inet_addr_t *addr, + const gchar *host, + bool is_ssl) +{ + struct rspamd_keepalive_hash_key hk, *phk; + khiter_t k; + + hk.addr = (rspamd_inet_addr_t *) addr; + hk.host = (gchar *) host; + hk.is_ssl = is_ssl; + hk.port = rspamd_inet_address_get_port(addr); + + k = kh_get(rspamd_keep_alive_hash, ctx->keep_alive_hash, &hk); + + if (k != kh_end(ctx->keep_alive_hash)) { + /* Reuse existing */ + conn->keepalive_hash_key = kh_key(ctx->keep_alive_hash, k); + msg_debug_http_context("use existing keepalive element %s (%s)", + rspamd_inet_address_to_string_pretty(conn->keepalive_hash_key->addr), + conn->keepalive_hash_key->host); + } + else { + /* Create new one */ + GQueue empty_init = G_QUEUE_INIT; + gint r; + + phk = g_malloc(sizeof(*phk)); + phk->conns = empty_init; + phk->host = g_strdup(host); + 
phk->is_ssl = is_ssl; + phk->addr = rspamd_inet_address_copy(addr, NULL); + phk->port = hk.port; + + + kh_put(rspamd_keep_alive_hash, ctx->keep_alive_hash, phk, &r); + conn->keepalive_hash_key = phk; + + msg_debug_http_context("create new keepalive element %s (%s)", + rspamd_inet_address_to_string_pretty(conn->keepalive_hash_key->addr), + conn->keepalive_hash_key->host); + } +} + +static void +rspamd_http_keepalive_handler(gint fd, short what, gpointer ud) +{ + struct rspamd_http_keepalive_cbdata *cbdata = + (struct rspamd_http_keepalive_cbdata *) ud; /* + * We can get here if a remote side reported something or it has + * timed out. In both cases we just terminate keepalive connection. + */ + + g_queue_delete_link(cbdata->queue, cbdata->link); + msg_debug_http_context("remove keepalive element %s (%s), %d connections left", + rspamd_inet_address_to_string_pretty(cbdata->conn->keepalive_hash_key->addr), + cbdata->conn->keepalive_hash_key->host, + cbdata->queue->length); + /* unref call closes fd, so we need to remove ev watcher first! 
*/ + rspamd_ev_watcher_stop(cbdata->ctx->event_loop, &cbdata->ev); + rspamd_http_connection_unref(cbdata->conn); + g_free(cbdata); +} + +/* Non-static for unit testing */ +long rspamd_http_parse_keepalive_timeout(const rspamd_ftok_t *tok) +{ + long timeout = -1; + goffset pos = rspamd_substring_search(tok->begin, + tok->len, "timeout", sizeof("timeout") - 1); + + if (pos != -1) { + pos += sizeof("timeout") - 1; + + /* Skip spaces and equal sign */ + while (pos < tok->len) { + if (tok->begin[pos] != '=' && !g_ascii_isspace(tok->begin[pos])) { + break; + } + pos++; + } + + gsize ndigits = rspamd_memspn(tok->begin + pos, "0123456789", tok->len - pos); + glong real_timeout; + + if (ndigits > 0) { + if (rspamd_strtoul(tok->begin + pos, ndigits, &real_timeout)) { + timeout = real_timeout; + msg_debug_http_context("got timeout attr %l", timeout); + } + } + } + + return timeout; +} + +void rspamd_http_context_push_keepalive(struct rspamd_http_context *ctx, + struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + struct ev_loop *event_loop) +{ + struct rspamd_http_keepalive_cbdata *cbdata; + gdouble timeout = ctx->config.keepalive_interval; + + g_assert(conn->keepalive_hash_key != NULL); + + if (msg) { + const rspamd_ftok_t *tok; + rspamd_ftok_t cmp; + + tok = rspamd_http_message_find_header(msg, "Connection"); + + if (!tok) { + /* Server has not stated that it can do keep alive */ + conn->finished = TRUE; + msg_debug_http_context("no Connection header"); + return; + } + + RSPAMD_FTOK_ASSIGN(&cmp, "keep-alive"); + + if (rspamd_ftok_casecmp(&cmp, tok) != 0) { + conn->finished = TRUE; + msg_debug_http_context("connection header is not `keep-alive`"); + return; + } + + /* We can proceed, check timeout */ + + tok = rspamd_http_message_find_header(msg, "Keep-Alive"); + + if (tok) { + long maybe_timeout = rspamd_http_parse_keepalive_timeout(tok); + + if (maybe_timeout > 0) { + timeout = maybe_timeout; + } + } + } + + /* Move connection to the keepalive pool */ 
+ cbdata = g_malloc0(sizeof(*cbdata)); + + cbdata->conn = rspamd_http_connection_ref(conn); + /* Use stack like approach to that would easy reading */ + g_queue_push_head(&conn->keepalive_hash_key->conns, cbdata); + cbdata->link = conn->keepalive_hash_key->conns.head; + + cbdata->queue = &conn->keepalive_hash_key->conns; + cbdata->ctx = ctx; + conn->finished = FALSE; + + rspamd_ev_watcher_init(&cbdata->ev, conn->fd, EV_READ, + rspamd_http_keepalive_handler, + cbdata); + rspamd_ev_watcher_start(event_loop, &cbdata->ev, timeout); + + msg_debug_http_context("push keepalive element %s (%s), %d connections queued, %.1f timeout", + rspamd_inet_address_to_string_pretty(cbdata->conn->keepalive_hash_key->addr), + cbdata->conn->keepalive_hash_key->host, + cbdata->queue->length, + timeout); +}
\ No newline at end of file diff --git a/src/libserver/http/http_context.h b/src/libserver/http/http_context.h new file mode 100644 index 0000000..f3622ae --- /dev/null +++ b/src/libserver/http/http_context.h @@ -0,0 +1,122 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTTP_CONTEXT_H +#define RSPAMD_HTTP_CONTEXT_H + +#include "config.h" +#include "ucl.h" +#include "addr.h" + +#include "contrib/libev/ev.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_http_context; +struct rspamd_config; +struct rspamd_http_message; +struct upstream_ctx; + +struct rspamd_http_context_cfg { + guint kp_cache_size_client; + guint kp_cache_size_server; + guint ssl_cache_size; + gdouble keepalive_interval; + gdouble client_key_rotate_time; + const gchar *user_agent; + const gchar *http_proxy; + const gchar *server_hdr; +}; + +/** + * Creates and configures new HTTP context + * @param root_conf configuration object + * @param ev_base event base + * @return new context used for both client and server HTTP connections + */ +struct rspamd_http_context *rspamd_http_context_create(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct upstream_ctx *ctx); + +struct rspamd_http_context *rspamd_http_context_create_config( + struct rspamd_http_context_cfg *cfg, + struct ev_loop *ev_base, + struct upstream_ctx *ctx); + +/** + * Destroys context + * @param ctx + */ +void 
rspamd_http_context_free(struct rspamd_http_context *ctx); + +struct rspamd_http_context *rspamd_http_context_default(void); + +/** + * Returns preserved keepalive connection if it's available. + * Refcount is transferred to caller! + * @param ctx + * @param addr + * @param host + * @return + */ +struct rspamd_http_connection *rspamd_http_context_check_keepalive(struct rspamd_http_context *ctx, + const rspamd_inet_addr_t *addr, + const gchar *host, + bool is_ssl); + +/** + * Checks if there is a valid keepalive connection + * @param ctx + * @param addr + * @param host + * @param is_ssl + * @return + */ +const rspamd_inet_addr_t *rspamd_http_context_has_keepalive(struct rspamd_http_context *ctx, + const gchar *host, + unsigned port, + bool is_ssl); + +/** + * Prepares keepalive key for a connection by creating a new entry or by reusing existent + * Bear in mind, that keepalive pool has currently no cleanup methods! + * @param ctx + * @param conn + * @param addr + * @param host + */ +void rspamd_http_context_prepare_keepalive(struct rspamd_http_context *ctx, struct rspamd_http_connection *conn, + const rspamd_inet_addr_t *addr, const gchar *host, bool is_ssl); + +/** + * Pushes a connection to keepalive pool after client request is finished, + * keepalive key *must* be prepared before using of this function + * @param ctx + * @param conn + * @param msg + */ +void rspamd_http_context_push_keepalive(struct rspamd_http_context *ctx, + struct rspamd_http_connection *conn, + struct rspamd_http_message *msg, + struct ev_loop *ev_base); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/http/http_message.c b/src/libserver/http/http_message.c new file mode 100644 index 0000000..670122d --- /dev/null +++ b/src/libserver/http/http_message.c @@ -0,0 +1,725 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "http_message.h" +#include "http_connection.h" +#include "http_private.h" +#include "libutil/printf.h" +#include "libserver/logger.h" +#include "utlist.h" +#include "unix-std.h" + +struct rspamd_http_message * +rspamd_http_new_message(enum rspamd_http_message_type type) +{ + struct rspamd_http_message *new; + + new = g_malloc0(sizeof(struct rspamd_http_message)); + + if (type == HTTP_REQUEST) { + new->url = rspamd_fstring_new(); + } + else { + new->url = NULL; + new->code = 200; + } + + new->port = 80; + new->type = type; + new->method = HTTP_INVALID; + new->headers = kh_init(rspamd_http_headers_hash); + + REF_INIT_RETAIN(new, rspamd_http_message_free); + + return new; +} + +struct rspamd_http_message * +rspamd_http_message_from_url(const gchar *url) +{ + struct http_parser_url pu; + struct rspamd_http_message *msg; + const gchar *host, *path; + size_t pathlen, urllen; + guint flags = 0; + + if (url == NULL) { + return NULL; + } + + urllen = strlen(url); + memset(&pu, 0, sizeof(pu)); + + if (http_parser_parse_url(url, urllen, FALSE, &pu) != 0) { + msg_warn("cannot parse URL: %s", url); + return NULL; + } + + if ((pu.field_set & (1 << UF_HOST)) == 0) { + msg_warn("no host argument in URL: %s", url); + return NULL; + } + + if ((pu.field_set & (1 << UF_SCHEMA))) { + if (pu.field_data[UF_SCHEMA].len == sizeof("https") - 1 && + memcmp(url + pu.field_data[UF_SCHEMA].off, "https", 5) == 0) { + flags |= RSPAMD_HTTP_FLAG_WANT_SSL; + } + } + + if ((pu.field_set & (1 << UF_PATH)) == 0) { + path = "/"; + pathlen = 1; + } + else { + path = url + 
pu.field_data[UF_PATH].off; + pathlen = urllen - pu.field_data[UF_PATH].off; + } + + msg = rspamd_http_new_message(HTTP_REQUEST); + host = url + pu.field_data[UF_HOST].off; + msg->flags = flags; + + if ((pu.field_set & (1 << UF_PORT)) != 0) { + msg->port = pu.port; + } + else { + /* XXX: magic constant */ + if (flags & RSPAMD_HTTP_FLAG_WANT_SSL) { + msg->port = 443; + } + else { + msg->port = 80; + } + } + + msg->host = g_string_new_len(host, pu.field_data[UF_HOST].len); + msg->url = rspamd_fstring_append(msg->url, path, pathlen); + + REF_INIT_RETAIN(msg, rspamd_http_message_free); + + return msg; +} + +const gchar * +rspamd_http_message_get_body(struct rspamd_http_message *msg, + gsize *blen) +{ + const gchar *ret = NULL; + + if (msg->body_buf.len > 0) { + ret = msg->body_buf.begin; + } + + if (blen) { + *blen = msg->body_buf.len; + } + + return ret; +} + +static void +rspamd_http_shname_dtor(void *p) +{ + struct rspamd_storage_shmem *n = p; + +#ifdef HAVE_SANE_SHMEM + shm_unlink(n->shm_name); +#else + unlink(n->shm_name); +#endif + g_free(n->shm_name); + g_free(n); +} + +struct rspamd_storage_shmem * +rspamd_http_message_shmem_ref(struct rspamd_http_message *msg) +{ + if ((msg->flags & RSPAMD_HTTP_FLAG_SHMEM) && msg->body_buf.c.shared.name) { + REF_RETAIN(msg->body_buf.c.shared.name); + return msg->body_buf.c.shared.name; + } + + return NULL; +} + +guint rspamd_http_message_get_flags(struct rspamd_http_message *msg) +{ + return msg->flags; +} + +void rspamd_http_message_shmem_unref(struct rspamd_storage_shmem *p) +{ + REF_RELEASE(p); +} + +gboolean +rspamd_http_message_set_body(struct rspamd_http_message *msg, + const gchar *data, gsize len) +{ + union _rspamd_storage_u *storage; + storage = &msg->body_buf.c; + + rspamd_http_message_storage_cleanup(msg); + + if (msg->flags & RSPAMD_HTTP_FLAG_SHMEM) { + storage->shared.name = g_malloc(sizeof(*storage->shared.name)); + REF_INIT_RETAIN(storage->shared.name, rspamd_http_shname_dtor); +#ifdef HAVE_SANE_SHMEM +#if 
defined(__DragonFly__) + // DragonFly uses regular files for shm. User rspamd is not allowed to create + // files in the root. + storage->shared.name->shm_name = g_strdup("/tmp/rhm.XXXXXXXXXXXXXXXXXXXX"); +#else + storage->shared.name->shm_name = g_strdup("/rhm.XXXXXXXXXXXXXXXXXXXX"); +#endif + storage->shared.shm_fd = rspamd_shmem_mkstemp(storage->shared.name->shm_name); +#else + /* XXX: assume that tempdir is /tmp */ + storage->shared.name->shm_name = g_strdup("/tmp/rhm.XXXXXXXXXXXXXXXXXXXX"); + storage->shared.shm_fd = mkstemp(storage->shared.name->shm_name); +#endif + + if (storage->shared.shm_fd == -1) { + return FALSE; + } + + if (len != 0 && len != G_MAXSIZE) { + if (ftruncate(storage->shared.shm_fd, len) == -1) { + return FALSE; + } + + msg->body_buf.str = mmap(NULL, len, + PROT_WRITE | PROT_READ, MAP_SHARED, + storage->shared.shm_fd, 0); + + if (msg->body_buf.str == MAP_FAILED) { + return FALSE; + } + + msg->body_buf.begin = msg->body_buf.str; + msg->body_buf.allocated_len = len; + + if (data != NULL) { + memcpy(msg->body_buf.str, data, len); + msg->body_buf.len = len; + } + } + else { + msg->body_buf.len = 0; + msg->body_buf.begin = NULL; + msg->body_buf.str = NULL; + msg->body_buf.allocated_len = 0; + } + } + else { + if (len != 0 && len != G_MAXSIZE) { + if (data == NULL) { + storage->normal = rspamd_fstring_sized_new(len); + msg->body_buf.len = 0; + } + else { + storage->normal = rspamd_fstring_new_init(data, len); + msg->body_buf.len = len; + } + } + else { + storage->normal = rspamd_fstring_new(); + } + + msg->body_buf.begin = storage->normal->str; + msg->body_buf.str = storage->normal->str; + msg->body_buf.allocated_len = storage->normal->allocated; + } + + msg->flags |= RSPAMD_HTTP_FLAG_HAS_BODY; + + return TRUE; +} + +void rspamd_http_message_set_method(struct rspamd_http_message *msg, + const gchar *method) +{ + gint i; + + /* Linear search: not very efficient method */ + for (i = 0; i < HTTP_METHOD_MAX; i++) { + if (g_ascii_strcasecmp(method, 
http_method_str(i)) == 0) { + msg->method = i; + } + } +} + +gboolean +rspamd_http_message_set_body_from_fd(struct rspamd_http_message *msg, + gint fd) +{ + union _rspamd_storage_u *storage; + struct stat st; + + rspamd_http_message_storage_cleanup(msg); + + storage = &msg->body_buf.c; + msg->flags |= RSPAMD_HTTP_FLAG_SHMEM | RSPAMD_HTTP_FLAG_SHMEM_IMMUTABLE; + + storage->shared.shm_fd = dup(fd); + msg->body_buf.str = MAP_FAILED; + + if (storage->shared.shm_fd == -1) { + return FALSE; + } + + if (fstat(storage->shared.shm_fd, &st) == -1) { + return FALSE; + } + + msg->body_buf.str = mmap(NULL, st.st_size, + PROT_READ, MAP_SHARED, + storage->shared.shm_fd, 0); + + if (msg->body_buf.str == MAP_FAILED) { + return FALSE; + } + + msg->body_buf.begin = msg->body_buf.str; + msg->body_buf.len = st.st_size; + msg->body_buf.allocated_len = st.st_size; + + return TRUE; +} + +gboolean +rspamd_http_message_set_body_from_fstring_steal(struct rspamd_http_message *msg, + rspamd_fstring_t *fstr) +{ + union _rspamd_storage_u *storage; + + rspamd_http_message_storage_cleanup(msg); + + storage = &msg->body_buf.c; + msg->flags &= ~(RSPAMD_HTTP_FLAG_SHMEM | RSPAMD_HTTP_FLAG_SHMEM_IMMUTABLE); + + storage->normal = fstr; + msg->body_buf.str = fstr->str; + msg->body_buf.begin = msg->body_buf.str; + msg->body_buf.len = fstr->len; + msg->body_buf.allocated_len = fstr->allocated; + + return TRUE; +} + +gboolean +rspamd_http_message_set_body_from_fstring_copy(struct rspamd_http_message *msg, + const rspamd_fstring_t *fstr) +{ + union _rspamd_storage_u *storage; + + rspamd_http_message_storage_cleanup(msg); + + storage = &msg->body_buf.c; + msg->flags &= ~(RSPAMD_HTTP_FLAG_SHMEM | RSPAMD_HTTP_FLAG_SHMEM_IMMUTABLE); + + storage->normal = rspamd_fstring_new_init(fstr->str, fstr->len); + msg->body_buf.str = storage->normal->str; + msg->body_buf.begin = msg->body_buf.str; + msg->body_buf.len = storage->normal->len; + msg->body_buf.allocated_len = storage->normal->allocated; + + return TRUE; +} + + 
+gboolean +rspamd_http_message_grow_body(struct rspamd_http_message *msg, gsize len) +{ + struct stat st; + union _rspamd_storage_u *storage; + gsize newlen; + + storage = &msg->body_buf.c; + + if (msg->flags & RSPAMD_HTTP_FLAG_SHMEM) { + if (storage->shared.shm_fd == -1) { + return FALSE; + } + + if (fstat(storage->shared.shm_fd, &st) == -1) { + return FALSE; + } + + /* Check if we need to grow */ + if ((gsize) st.st_size < msg->body_buf.len + len) { + /* Need to grow */ + newlen = rspamd_fstring_suggest_size(msg->body_buf.len, st.st_size, + len); + /* Unmap as we need another size of segment */ + if (msg->body_buf.str != MAP_FAILED) { + munmap(msg->body_buf.str, st.st_size); + } + + if (ftruncate(storage->shared.shm_fd, newlen) == -1) { + return FALSE; + } + + msg->body_buf.str = mmap(NULL, newlen, + PROT_WRITE | PROT_READ, MAP_SHARED, + storage->shared.shm_fd, 0); + if (msg->body_buf.str == MAP_FAILED) { + return FALSE; + } + + msg->body_buf.begin = msg->body_buf.str; + msg->body_buf.allocated_len = newlen; + } + } + else { + storage->normal = rspamd_fstring_grow(storage->normal, len); + + /* Append might cause realloc */ + msg->body_buf.begin = storage->normal->str; + msg->body_buf.len = storage->normal->len; + msg->body_buf.str = storage->normal->str; + msg->body_buf.allocated_len = storage->normal->allocated; + } + + return TRUE; +} + +gboolean +rspamd_http_message_append_body(struct rspamd_http_message *msg, + const gchar *data, gsize len) +{ + union _rspamd_storage_u *storage; + + storage = &msg->body_buf.c; + + if (msg->flags & RSPAMD_HTTP_FLAG_SHMEM) { + if (!rspamd_http_message_grow_body(msg, len)) { + return FALSE; + } + + memcpy(msg->body_buf.str + msg->body_buf.len, data, len); + msg->body_buf.len += len; + } + else { + storage->normal = rspamd_fstring_append(storage->normal, data, len); + + /* Append might cause realloc */ + msg->body_buf.begin = storage->normal->str; + msg->body_buf.len = storage->normal->len; + msg->body_buf.str = 
storage->normal->str; + msg->body_buf.allocated_len = storage->normal->allocated; + } + + return TRUE; +} + +void rspamd_http_message_storage_cleanup(struct rspamd_http_message *msg) +{ + union _rspamd_storage_u *storage; + struct stat st; + + if (msg->flags & RSPAMD_HTTP_FLAG_SHMEM) { + storage = &msg->body_buf.c; + + if (storage->shared.shm_fd > 0) { + g_assert(fstat(storage->shared.shm_fd, &st) != -1); + + if (msg->body_buf.str != MAP_FAILED) { + munmap(msg->body_buf.str, st.st_size); + } + + close(storage->shared.shm_fd); + } + + if (storage->shared.name != NULL) { + REF_RELEASE(storage->shared.name); + } + + storage->shared.shm_fd = -1; + msg->body_buf.str = MAP_FAILED; + } + else { + if (msg->body_buf.c.normal) { + rspamd_fstring_free(msg->body_buf.c.normal); + } + + msg->body_buf.c.normal = NULL; + } + + msg->body_buf.len = 0; +} + +void rspamd_http_message_free(struct rspamd_http_message *msg) +{ + struct rspamd_http_header *hdr, *hcur, *hcurtmp; + + kh_foreach_value (msg->headers, hdr, { + DL_FOREACH_SAFE (hdr, hcur, hcurtmp) { + rspamd_fstring_free (hcur->combined); + g_free (hcur); +} +}); + +kh_destroy(rspamd_http_headers_hash, msg->headers); +rspamd_http_message_storage_cleanup(msg); + +if (msg->url != NULL) { + rspamd_fstring_free(msg->url); +} +if (msg->status != NULL) { + rspamd_fstring_free(msg->status); +} +if (msg->host != NULL) { + g_string_free(msg->host, TRUE); +} +if (msg->peer_key != NULL) { + rspamd_pubkey_unref(msg->peer_key); +} + +g_free(msg); +} + +void rspamd_http_message_set_peer_key(struct rspamd_http_message *msg, + struct rspamd_cryptobox_pubkey *pk) +{ + if (msg->peer_key != NULL) { + rspamd_pubkey_unref(msg->peer_key); + } + + if (pk) { + msg->peer_key = rspamd_pubkey_ref(pk); + } + else { + msg->peer_key = NULL; + } +} + +void rspamd_http_message_add_header_len(struct rspamd_http_message *msg, + const gchar *name, + const gchar *value, + gsize len) +{ + struct rspamd_http_header *hdr, *found; + guint nlen, vlen; + khiter_t k; + 
gint r; + + if (msg != NULL && name != NULL && value != NULL) { + hdr = g_malloc0(sizeof(struct rspamd_http_header)); + nlen = strlen(name); + vlen = len; + + if (g_ascii_strcasecmp(name, "host") == 0) { + msg->flags |= RSPAMD_HTTP_FLAG_HAS_HOST_HEADER; + } + + hdr->combined = rspamd_fstring_sized_new(nlen + vlen + 4); + rspamd_printf_fstring(&hdr->combined, "%s: %*s\r\n", name, (gint) vlen, + value); + hdr->name.begin = hdr->combined->str; + hdr->name.len = nlen; + hdr->value.begin = hdr->combined->str + nlen + 2; + hdr->value.len = vlen; + + k = kh_put(rspamd_http_headers_hash, msg->headers, &hdr->name, + &r); + + if (r != 0) { + kh_value(msg->headers, k) = hdr; + found = NULL; + } + else { + found = kh_value(msg->headers, k); + } + + DL_APPEND(found, hdr); + } +} + +void rspamd_http_message_add_header(struct rspamd_http_message *msg, + const gchar *name, + const gchar *value) +{ + if (value) { + rspamd_http_message_add_header_len(msg, name, value, strlen(value)); + } +} + +void rspamd_http_message_add_header_fstr(struct rspamd_http_message *msg, + const gchar *name, + rspamd_fstring_t *value) +{ + struct rspamd_http_header *hdr, *found = NULL; + guint nlen, vlen; + khiter_t k; + gint r; + + if (msg != NULL && name != NULL && value != NULL) { + hdr = g_malloc0(sizeof(struct rspamd_http_header)); + nlen = strlen(name); + vlen = value->len; + hdr->combined = rspamd_fstring_sized_new(nlen + vlen + 4); + rspamd_printf_fstring(&hdr->combined, "%s: %V\r\n", name, value); + hdr->name.begin = hdr->combined->str; + hdr->name.len = nlen; + hdr->value.begin = hdr->combined->str + nlen + 2; + hdr->value.len = vlen; + + k = kh_put(rspamd_http_headers_hash, msg->headers, &hdr->name, + &r); + + if (r != 0) { + kh_value(msg->headers, k) = hdr; + found = NULL; + } + else { + found = kh_value(msg->headers, k); + } + + DL_APPEND(found, hdr); + } +} + +const rspamd_ftok_t * +rspamd_http_message_find_header(struct rspamd_http_message *msg, + const gchar *name) +{ + const 
rspamd_ftok_t *res = NULL; + rspamd_ftok_t srch; + guint slen = strlen(name); + khiter_t k; + + if (msg != NULL) { + srch.begin = name; + srch.len = slen; + + k = kh_get(rspamd_http_headers_hash, msg->headers, &srch); + + if (k != kh_end(msg->headers)) { + res = &(kh_value(msg->headers, k)->value); + } + } + + return res; +} + +GPtrArray * +rspamd_http_message_find_header_multiple( + struct rspamd_http_message *msg, + const gchar *name) +{ + GPtrArray *res = NULL; + struct rspamd_http_header *hdr, *cur; + rspamd_ftok_t srch; + khiter_t k; + guint cnt = 0; + + guint slen = strlen(name); + + if (msg != NULL) { + srch.begin = name; + srch.len = slen; + + k = kh_get(rspamd_http_headers_hash, msg->headers, &srch); + + if (k != kh_end(msg->headers)) { + hdr = kh_value(msg->headers, k); + + LL_COUNT(hdr, cur, cnt); + res = g_ptr_array_sized_new(cnt); + + LL_FOREACH(hdr, cur) + { + g_ptr_array_add(res, &cur->value); + } + } + } + + + return res; +} + + +gboolean +rspamd_http_message_remove_header(struct rspamd_http_message *msg, + const gchar *name) +{ + struct rspamd_http_header *hdr, *hcur, *hcurtmp; + gboolean res = FALSE; + guint slen = strlen(name); + rspamd_ftok_t srch; + khiter_t k; + + if (msg != NULL) { + srch.begin = name; + srch.len = slen; + + k = kh_get(rspamd_http_headers_hash, msg->headers, &srch); + + if (k != kh_end(msg->headers)) { + hdr = kh_value(msg->headers, k); + kh_del(rspamd_http_headers_hash, msg->headers, k); + res = TRUE; + + DL_FOREACH_SAFE(hdr, hcur, hcurtmp) + { + rspamd_fstring_free(hcur->combined); + g_free(hcur); + } + } + } + + return res; +} + +const gchar * +rspamd_http_message_get_http_host(struct rspamd_http_message *msg, + gsize *hostlen) +{ + if (msg->flags & RSPAMD_HTTP_FLAG_HAS_HOST_HEADER) { + rspamd_ftok_t srch; + + RSPAMD_FTOK_ASSIGN(&srch, "Host"); + + khiter_t k = kh_get(rspamd_http_headers_hash, msg->headers, &srch); + + if (k != kh_end(msg->headers)) { + *hostlen = (kh_value(msg->headers, k)->value).len; + return 
(kh_value(msg->headers, k)->value).begin; + } + else if (msg->host) { + *hostlen = msg->host->len; + return msg->host->str; + } + } + else { + if (msg->host) { + *hostlen = msg->host->len; + return msg->host->str; + } + } + + return NULL; +} + +bool rspamd_http_message_is_standard_port(struct rspamd_http_message *msg) +{ + if (msg->flags & RSPAMD_HTTP_FLAG_WANT_SSL) { + return msg->port == 443; + } + + return msg->port == 80; +}
\ No newline at end of file diff --git a/src/libserver/http/http_message.h b/src/libserver/http/http_message.h new file mode 100644 index 0000000..fa8ed04 --- /dev/null +++ b/src/libserver/http/http_message.h @@ -0,0 +1,254 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_HTTP_MESSAGE_H +#define RSPAMD_HTTP_MESSAGE_H + +#include "config.h" +#include "keypair.h" +#include "keypairs_cache.h" +#include "fstring.h" +#include "ref.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_http_connection; + +enum rspamd_http_message_type { + HTTP_REQUEST = 0, + HTTP_RESPONSE +}; + +/** + * Extract the current message from a connection to deal with separately + * @param conn + * @return + */ +struct rspamd_http_message *rspamd_http_connection_steal_msg( + struct rspamd_http_connection *conn); + +/** + * Copy a message to deal with separately + * @param msg message to copy + * @param err location for a GError (presumably filled on failure - TODO confirm) + * @return copied message (presumably NULL on failure - TODO confirm) + */ +struct rspamd_http_message *rspamd_http_connection_copy_msg( + struct rspamd_http_message *msg, GError **err); + +/** + * Create new HTTP message + * @param type request or response + * @return new http message + */ +struct rspamd_http_message *rspamd_http_new_message(enum rspamd_http_message_type type); + +/** + * Increase refcount number for an HTTP message + * @param msg message to use + * @return + */ +struct rspamd_http_message *rspamd_http_message_ref(struct rspamd_http_message *msg); + 
+/** + * Decrease number of refcounts for http message + * @param msg + */ +void rspamd_http_message_unref(struct rspamd_http_message *msg); + +/** + * Sets a key for peer + * @param msg + * @param pk + */ +void rspamd_http_message_set_peer_key(struct rspamd_http_message *msg, + struct rspamd_cryptobox_pubkey *pk); + +/** + * Create HTTP message from URL + * @param url + * @return new message or NULL + */ +struct rspamd_http_message *rspamd_http_message_from_url(const gchar *url); + +/** + * Returns body for a message + * @param msg + * @param blen pointer where to save body length + * @return pointer to body start + */ +const gchar *rspamd_http_message_get_body(struct rspamd_http_message *msg, + gsize *blen); + +/** + * Set message's body from the string + * @param msg + * @param data + * @param len + * @return TRUE if a message's body has been set + */ +gboolean rspamd_http_message_set_body(struct rspamd_http_message *msg, + const gchar *data, gsize len); + +/** + * Set message's method by name + * @param msg + * @param method + */ +void rspamd_http_message_set_method(struct rspamd_http_message *msg, + const gchar *method); + +/** + * Maps fd as message's body + * @param msg + * @param fd + * @return TRUE if a message's body has been set + */ +gboolean rspamd_http_message_set_body_from_fd(struct rspamd_http_message *msg, + gint fd); + +/** + * Uses rspamd_fstring_t as message's body, string is consumed by this operation + * @param msg + * @param fstr + * @return TRUE if a message's body has been set + */ +gboolean rspamd_http_message_set_body_from_fstring_steal(struct rspamd_http_message *msg, + rspamd_fstring_t *fstr); + +/** + * Uses rspamd_fstring_t as message's body, string is copied by this operation + * @param msg + * @param fstr + * @return TRUE if a message's body has been set + */ +gboolean rspamd_http_message_set_body_from_fstring_copy(struct rspamd_http_message *msg, + const rspamd_fstring_t *fstr); + +/** + * Appends data to message's body + * @param 
msg + * @param data + * @param len + * @return TRUE if a message's body has been set + */ +gboolean rspamd_http_message_append_body(struct rspamd_http_message *msg, + const gchar *data, gsize len); + +/** + * Append a header to http message + * @param rep + * @param name + * @param value + */ +void rspamd_http_message_add_header(struct rspamd_http_message *msg, + const gchar *name, + const gchar *value); + +void rspamd_http_message_add_header_len(struct rspamd_http_message *msg, + const gchar *name, + const gchar *value, + gsize len); + +void rspamd_http_message_add_header_fstr(struct rspamd_http_message *msg, + const gchar *name, + rspamd_fstring_t *value); + +/** + * Search for a specified header in message + * @param msg message + * @param name name of header + */ +const rspamd_ftok_t *rspamd_http_message_find_header( + struct rspamd_http_message *msg, + const gchar *name); + +/** + * Search for a header that has multiple values + * @param msg + * @param name + * @return list of rspamd_ftok_t * with values + */ +GPtrArray *rspamd_http_message_find_header_multiple( + struct rspamd_http_message *msg, + const gchar *name); + +/** + * Remove specific header from a message + * @param msg + * @param name + * @return + */ +gboolean rspamd_http_message_remove_header(struct rspamd_http_message *msg, + const gchar *name); + +/** + * Free HTTP message + * @param msg + */ +void rspamd_http_message_free(struct rspamd_http_message *msg); + +/** + * Extract arguments from a message's URI contained inside query string decoding + * them if needed + * @param msg HTTP request message + * @return new GHashTable which maps rspamd_ftok_t* to rspamd_ftok_t* + * (table must be freed by a caller) + */ +GHashTable *rspamd_http_message_parse_query(struct rspamd_http_message *msg); + +/** + * Increase refcount for shared file (if any) to prevent early memory unlinking + * @param msg + */ +struct rspamd_storage_shmem *rspamd_http_message_shmem_ref(struct rspamd_http_message *msg); + +/** + 
* Decrease external ref for shmem segment associated with a message + * @param msg + */ +void rspamd_http_message_shmem_unref(struct rspamd_storage_shmem *p); + +/** + * Returns message's flags + * @param msg + * @return + */ +guint rspamd_http_message_get_flags(struct rspamd_http_message *msg); + +/** + * Returns an HTTP hostname for a message, derived from a header if it has it + * or from a url if it doesn't + * @param msg + * @param hostlen output of the host length + * @return + */ +const gchar *rspamd_http_message_get_http_host(struct rspamd_http_message *msg, + gsize *hostlen); + +/** + * Returns true if a message has standard port (80 or 443 for https) + * @param msg + * @return + */ +bool rspamd_http_message_is_standard_port(struct rspamd_http_message *msg); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/http/http_private.h b/src/libserver/http/http_private.h new file mode 100644 index 0000000..096545e --- /dev/null +++ b/src/libserver/http/http_private.h @@ -0,0 +1,129 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SRC_LIBUTIL_HTTP_PRIVATE_H_ +#define SRC_LIBUTIL_HTTP_PRIVATE_H_ + +#include "http_connection.h" +#include "http_parser.h" +#include "str_util.h" +#include "keypair.h" +#include "keypairs_cache.h" +#include "ref.h" +#include "upstream.h" +#include "khash.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * HTTP header structure + */ +struct rspamd_http_header { + rspamd_fstring_t *combined; + rspamd_ftok_t name; + rspamd_ftok_t value; + struct rspamd_http_header *prev, *next; +}; + +KHASH_INIT(rspamd_http_headers_hash, rspamd_ftok_t *, + struct rspamd_http_header *, 1, + rspamd_ftok_icase_hash, rspamd_ftok_icase_equal); + +/** + * HTTP message structure, used for requests and replies + */ +struct rspamd_http_message { + rspamd_fstring_t *url; + GString *host; + rspamd_fstring_t *status; + khash_t(rspamd_http_headers_hash) * headers; + + struct _rspamd_body_buf_s { + /* Data start */ + const gchar *begin; + /* Data len */ + gsize len; + /* Allocated len */ + gsize allocated_len; + /* Data buffer (used to write data inside) */ + gchar *str; + + /* Internal storage */ + union _rspamd_storage_u { + rspamd_fstring_t *normal; + struct _rspamd_storage_shared_s { + struct rspamd_storage_shmem *name; + gint shm_fd; + } shared; + } c; + } body_buf; + + struct rspamd_cryptobox_pubkey *peer_key; + time_t date; + time_t last_modified; + unsigned port; + int type; + gint code; + enum http_method method; + gint flags; + ref_entry_t ref; +}; + +struct rspamd_keepalive_hash_key { + rspamd_inet_addr_t *addr; + gchar *host; + gboolean is_ssl; + unsigned port; + GQueue conns; +}; + +gint32 rspamd_keep_alive_key_hash(struct rspamd_keepalive_hash_key *k); + +bool rspamd_keep_alive_key_equal(struct rspamd_keepalive_hash_key *k1, + struct rspamd_keepalive_hash_key *k2); + +KHASH_INIT(rspamd_keep_alive_hash, struct rspamd_keepalive_hash_key *, + char, 0, rspamd_keep_alive_key_hash, rspamd_keep_alive_key_equal); + +struct rspamd_http_context { + struct 
rspamd_http_context_cfg config; + struct rspamd_keypair_cache *client_kp_cache; + struct rspamd_cryptobox_keypair *client_kp; + struct rspamd_keypair_cache *server_kp_cache; + struct upstream_ctx *ups_ctx; + struct upstream_list *http_proxies; + gpointer ssl_ctx; + gpointer ssl_ctx_noverify; + struct ev_loop *event_loop; + ev_timer client_rotate_ev; + khash_t(rspamd_keep_alive_hash) * keep_alive_hash; +}; + +#define HTTP_ERROR http_error_quark() + +GQuark http_error_quark(void); + +void rspamd_http_message_storage_cleanup(struct rspamd_http_message *msg); + +gboolean rspamd_http_message_grow_body(struct rspamd_http_message *msg, + gsize len); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBUTIL_HTTP_PRIVATE_H_ */ diff --git a/src/libserver/http/http_router.c b/src/libserver/http/http_router.c new file mode 100644 index 0000000..2fdfe48 --- /dev/null +++ b/src/libserver/http/http_router.c @@ -0,0 +1,559 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "http_router.h" +#include "http_connection.h" +#include "http_private.h" +#include "libutil/regexp.h" +#include "libutil/printf.h" +#include "libserver/logger.h" +#include "utlist.h" +#include "unix-std.h" + +enum http_magic_type { + HTTP_MAGIC_PLAIN = 0, + HTTP_MAGIC_HTML, + HTTP_MAGIC_CSS, + HTTP_MAGIC_JS, + HTTP_MAGIC_ICO, + HTTP_MAGIC_PNG, + HTTP_MAGIC_JPG +}; + +static const struct _rspamd_http_magic { + const gchar *ext; + const gchar *ct; +} http_file_types[] = { + [HTTP_MAGIC_PLAIN] = {"txt", "text/plain"}, + [HTTP_MAGIC_HTML] = {"html", "text/html"}, + [HTTP_MAGIC_CSS] = {"css", "text/css"}, + [HTTP_MAGIC_JS] = {"js", "application/javascript"}, + [HTTP_MAGIC_ICO] = {"ico", "image/x-icon"}, + [HTTP_MAGIC_PNG] = {"png", "image/png"}, + [HTTP_MAGIC_JPG] = {"jpg", "image/jpeg"}, +}; + +/* + * HTTP router functions + */ + +static void +rspamd_http_entry_free(struct rspamd_http_connection_entry *entry) +{ + if (entry != NULL) { + close(entry->conn->fd); + rspamd_http_connection_unref(entry->conn); + if (entry->rt->finish_handler) { + entry->rt->finish_handler(entry); + } + + DL_DELETE(entry->rt->conns, entry); + g_free(entry); + } +} + +static void +rspamd_http_router_error_handler(struct rspamd_http_connection *conn, + GError *err) +{ + struct rspamd_http_connection_entry *entry = conn->ud; + struct rspamd_http_message *msg; + + if (entry->is_reply) { + /* At this point we need to finish this session and close owned socket */ + if (entry->rt->error_handler != NULL) { + entry->rt->error_handler(entry, err); + } + rspamd_http_entry_free(entry); + } + else { + /* Here we can write a reply to a client */ + if (entry->rt->error_handler != NULL) { + entry->rt->error_handler(entry, err); + } + msg = rspamd_http_new_message(HTTP_RESPONSE); + msg->date = time(NULL); + msg->code = err->code; + rspamd_http_message_set_body(msg, err->message, strlen(err->message)); + rspamd_http_connection_reset(entry->conn); + 
rspamd_http_connection_write_message(entry->conn, + msg, + NULL, + "text/plain", + entry, + entry->rt->timeout); + entry->is_reply = TRUE; + } +} + +static const gchar * +rspamd_http_router_detect_ct(const gchar *path) +{ + const gchar *dot; + guint i; + + dot = strrchr(path, '.'); + if (dot == NULL) { + return http_file_types[HTTP_MAGIC_PLAIN].ct; + } + dot++; + + for (i = 0; i < G_N_ELEMENTS(http_file_types); i++) { + if (strcmp(http_file_types[i].ext, dot) == 0) { + return http_file_types[i].ct; + } + } + + return http_file_types[HTTP_MAGIC_PLAIN].ct; +} + +static gboolean +rspamd_http_router_is_subdir(const gchar *parent, const gchar *sub) +{ + if (parent == NULL || sub == NULL || *parent == '\0') { + return FALSE; + } + + while (*parent != '\0') { + if (*sub != *parent) { + return FALSE; + } + parent++; + sub++; + } + + parent--; + if (*parent == G_DIR_SEPARATOR) { + return TRUE; + } + + return (*sub == G_DIR_SEPARATOR || *sub == '\0'); +} + +static gboolean +rspamd_http_router_try_file(struct rspamd_http_connection_entry *entry, + rspamd_ftok_t *lookup, gboolean expand_path) +{ + struct stat st; + gint fd; + gchar filebuf[PATH_MAX], realbuf[PATH_MAX], *dir; + struct rspamd_http_message *reply_msg; + + rspamd_snprintf(filebuf, sizeof(filebuf), "%s%c%T", + entry->rt->default_fs_path, G_DIR_SEPARATOR, lookup); + + if (realpath(filebuf, realbuf) == NULL || + lstat(realbuf, &st) == -1) { + return FALSE; + } + + if (S_ISDIR(st.st_mode) && expand_path) { + /* Try to append 'index.html' to the url */ + rspamd_fstring_t *nlookup; + rspamd_ftok_t tok; + gboolean ret; + + nlookup = rspamd_fstring_sized_new(lookup->len + sizeof("index.html")); + rspamd_printf_fstring(&nlookup, "%T%c%s", lookup, G_DIR_SEPARATOR, + "index.html"); + tok.begin = nlookup->str; + tok.len = nlookup->len; + ret = rspamd_http_router_try_file(entry, &tok, FALSE); + rspamd_fstring_free(nlookup); + + return ret; + } + else if (!S_ISREG(st.st_mode)) { + return FALSE; + } + + /* We also need to ensure 
that file is inside the defined dir */ + rspamd_strlcpy(filebuf, realbuf, sizeof(filebuf)); + dir = dirname(filebuf); + + if (dir == NULL || + !rspamd_http_router_is_subdir(entry->rt->default_fs_path, + dir)) { + return FALSE; + } + + fd = open(realbuf, O_RDONLY); + if (fd == -1) { + return FALSE; + } + + reply_msg = rspamd_http_new_message(HTTP_RESPONSE); + reply_msg->date = time(NULL); + reply_msg->code = 200; + rspamd_http_router_insert_headers(entry->rt, reply_msg); + + if (!rspamd_http_message_set_body_from_fd(reply_msg, fd)) { + rspamd_http_message_free(reply_msg); + close(fd); + return FALSE; + } + + close(fd); + + rspamd_http_connection_reset(entry->conn); + + msg_debug("requested file %s", realbuf); + rspamd_http_connection_write_message(entry->conn, reply_msg, NULL, + rspamd_http_router_detect_ct(realbuf), entry, + entry->rt->timeout); + + return TRUE; +} + +static void +rspamd_http_router_send_error(GError *err, + struct rspamd_http_connection_entry *entry) +{ + struct rspamd_http_message *err_msg; + + err_msg = rspamd_http_new_message(HTTP_RESPONSE); + err_msg->date = time(NULL); + err_msg->code = err->code; + rspamd_http_message_set_body(err_msg, err->message, + strlen(err->message)); + entry->is_reply = TRUE; + err_msg->status = rspamd_fstring_new_init(err->message, strlen(err->message)); + rspamd_http_router_insert_headers(entry->rt, err_msg); + rspamd_http_connection_reset(entry->conn); + rspamd_http_connection_write_message(entry->conn, + err_msg, + NULL, + "text/plain", + entry, + entry->rt->timeout); +} + + +static int +rspamd_http_router_finish_handler(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg) +{ + struct rspamd_http_connection_entry *entry = conn->ud; + rspamd_http_router_handler_t handler = NULL; + gpointer found; + + GError *err; + rspamd_ftok_t lookup; + const rspamd_ftok_t *encoding; + struct http_parser_url u; + guint i; + rspamd_regexp_t *re; + struct rspamd_http_connection_router *router; + gchar *pathbuf = 
NULL; + + G_STATIC_ASSERT(sizeof(rspamd_http_router_handler_t) == + sizeof(gpointer)); + + memset(&lookup, 0, sizeof(lookup)); + router = entry->rt; + + if (entry->is_reply) { + /* Request is finished, it is safe to free a connection */ + rspamd_http_entry_free(entry); + } + else { + if (G_UNLIKELY(msg->method != HTTP_GET && msg->method != HTTP_POST)) { + if (router->unknown_method_handler) { + return router->unknown_method_handler(entry, msg); + } + else { + err = g_error_new(HTTP_ERROR, 500, + "Invalid method"); + if (entry->rt->error_handler != NULL) { + entry->rt->error_handler(entry, err); + } + + rspamd_http_router_send_error(err, entry); + g_error_free(err); + + return 0; + } + } + + /* Search for path */ + if (msg->url != NULL && msg->url->len != 0) { + + http_parser_parse_url(msg->url->str, msg->url->len, TRUE, &u); + + if (u.field_set & (1 << UF_PATH)) { + gsize unnorm_len; + + pathbuf = g_malloc(u.field_data[UF_PATH].len); + memcpy(pathbuf, msg->url->str + u.field_data[UF_PATH].off, + u.field_data[UF_PATH].len); + lookup.begin = pathbuf; + lookup.len = u.field_data[UF_PATH].len; + + rspamd_normalize_path_inplace(pathbuf, + lookup.len, + &unnorm_len); + lookup.len = unnorm_len; + } + else { + lookup.begin = msg->url->str; + lookup.len = msg->url->len; + } + + found = g_hash_table_lookup(entry->rt->paths, &lookup); + memcpy(&handler, &found, sizeof(found)); + msg_debug("requested known path: %T", &lookup); + } + else { + err = g_error_new(HTTP_ERROR, 404, + "Empty path requested"); + if (entry->rt->error_handler != NULL) { + entry->rt->error_handler(entry, err); + } + + rspamd_http_router_send_error(err, entry); + g_error_free(err); + + return 0; + } + + entry->is_reply = TRUE; + + encoding = rspamd_http_message_find_header(msg, "Accept-Encoding"); + + if (encoding && rspamd_substring_search(encoding->begin, encoding->len, + "gzip", 4) != -1) { + entry->support_gzip = TRUE; + } + + if (handler != NULL) { + if (pathbuf) { + g_free(pathbuf); + } + + return 
handler(entry, msg); + } + else { + /* Try regexps */ + for (i = 0; i < router->regexps->len; i++) { + re = g_ptr_array_index(router->regexps, i); + if (rspamd_regexp_match(re, lookup.begin, lookup.len, + TRUE)) { + found = rspamd_regexp_get_ud(re); + memcpy(&handler, &found, sizeof(found)); + + if (pathbuf) { + g_free(pathbuf); + } + + return handler(entry, msg); + } + } + + /* Now try plain file */ + if (entry->rt->default_fs_path == NULL || lookup.len == 0 || + !rspamd_http_router_try_file(entry, &lookup, TRUE)) { + + err = g_error_new(HTTP_ERROR, 404, + "Not found"); + if (entry->rt->error_handler != NULL) { + entry->rt->error_handler(entry, err); + } + + msg_info("path: %T not found", &lookup); + rspamd_http_router_send_error(err, entry); + g_error_free(err); + } + } + } + + if (pathbuf) { + g_free(pathbuf); + } + + return 0; +} + +struct rspamd_http_connection_router * +rspamd_http_router_new(rspamd_http_router_error_handler_t eh, + rspamd_http_router_finish_handler_t fh, + ev_tstamp timeout, + const char *default_fs_path, + struct rspamd_http_context *ctx) +{ + struct rspamd_http_connection_router *nrouter; + struct stat st; + + nrouter = g_malloc0(sizeof(struct rspamd_http_connection_router)); + nrouter->paths = g_hash_table_new_full(rspamd_ftok_icase_hash, + rspamd_ftok_icase_equal, rspamd_fstring_mapped_ftok_free, NULL); + nrouter->regexps = g_ptr_array_new(); + nrouter->conns = NULL; + nrouter->error_handler = eh; + nrouter->finish_handler = fh; + nrouter->response_headers = g_hash_table_new_full(rspamd_strcase_hash, + rspamd_strcase_equal, g_free, g_free); + nrouter->event_loop = ctx->event_loop; + nrouter->timeout = timeout; + nrouter->default_fs_path = NULL; + + if (default_fs_path != NULL) { + if (stat(default_fs_path, &st) == -1) { + msg_err("cannot stat %s", default_fs_path); + } + else { + if (!S_ISDIR(st.st_mode)) { + msg_err("path %s is not a directory", default_fs_path); + } + else { + nrouter->default_fs_path = realpath(default_fs_path, NULL); 
+ } + } + } + + nrouter->ctx = ctx; + + return nrouter; +} + +void rspamd_http_router_set_key(struct rspamd_http_connection_router *router, + struct rspamd_cryptobox_keypair *key) +{ + g_assert(key != NULL); + + router->key = rspamd_keypair_ref(key); +} + +void rspamd_http_router_add_path(struct rspamd_http_connection_router *router, + const gchar *path, rspamd_http_router_handler_t handler) +{ + gpointer ptr; + rspamd_ftok_t *key; + rspamd_fstring_t *storage; + G_STATIC_ASSERT(sizeof(rspamd_http_router_handler_t) == + sizeof(gpointer)); + + if (path != NULL && handler != NULL && router != NULL) { + memcpy(&ptr, &handler, sizeof(ptr)); + storage = rspamd_fstring_new_init(path, strlen(path)); + key = g_malloc0(sizeof(*key)); + key->begin = storage->str; + key->len = storage->len; + g_hash_table_insert(router->paths, key, ptr); + } +} + +void rspamd_http_router_set_unknown_handler(struct rspamd_http_connection_router *router, + rspamd_http_router_handler_t handler) +{ + if (router != NULL) { + router->unknown_method_handler = handler; + } +} + +void rspamd_http_router_add_header(struct rspamd_http_connection_router *router, + const gchar *name, const gchar *value) +{ + if (name != NULL && value != NULL && router != NULL) { + g_hash_table_replace(router->response_headers, g_strdup(name), + g_strdup(value)); + } +} + +void rspamd_http_router_insert_headers(struct rspamd_http_connection_router *router, + struct rspamd_http_message *msg) +{ + GHashTableIter it; + gpointer k, v; + + if (router && msg) { + g_hash_table_iter_init(&it, router->response_headers); + + while (g_hash_table_iter_next(&it, &k, &v)) { + rspamd_http_message_add_header(msg, k, v); + } + } +} + +void rspamd_http_router_add_regexp(struct rspamd_http_connection_router *router, + struct rspamd_regexp_s *re, rspamd_http_router_handler_t handler) +{ + gpointer ptr; + G_STATIC_ASSERT(sizeof(rspamd_http_router_handler_t) == + sizeof(gpointer)); + + if (re != NULL && handler != NULL && router != NULL) { + 
memcpy(&ptr, &handler, sizeof(ptr)); + rspamd_regexp_set_ud(re, ptr); + g_ptr_array_add(router->regexps, rspamd_regexp_ref(re)); + } +} + +void rspamd_http_router_handle_socket(struct rspamd_http_connection_router *router, + gint fd, gpointer ud) +{ + struct rspamd_http_connection_entry *conn; + + conn = g_malloc0(sizeof(struct rspamd_http_connection_entry)); + conn->rt = router; + conn->ud = ud; + conn->is_reply = FALSE; + + conn->conn = rspamd_http_connection_new_server(router->ctx, + fd, + NULL, + rspamd_http_router_error_handler, + rspamd_http_router_finish_handler, + 0); + + if (router->key) { + rspamd_http_connection_set_key(conn->conn, router->key); + } + + rspamd_http_connection_read_message(conn->conn, conn, router->timeout); + DL_PREPEND(router->conns, conn); +} + +void rspamd_http_router_free(struct rspamd_http_connection_router *router) +{ + struct rspamd_http_connection_entry *conn, *tmp; + rspamd_regexp_t *re; + guint i; + + if (router) { + DL_FOREACH_SAFE(router->conns, conn, tmp) + { + rspamd_http_entry_free(conn); + } + + if (router->key) { + rspamd_keypair_unref(router->key); + } + + if (router->default_fs_path != NULL) { + g_free(router->default_fs_path); + } + + for (i = 0; i < router->regexps->len; i++) { + re = g_ptr_array_index(router->regexps, i); + rspamd_regexp_unref(re); + } + + g_ptr_array_free(router->regexps, TRUE); + g_hash_table_unref(router->paths); + g_hash_table_unref(router->response_headers); + g_free(router); + } +} diff --git a/src/libserver/http/http_router.h b/src/libserver/http/http_router.h new file mode 100644 index 0000000..1bf70ed --- /dev/null +++ b/src/libserver/http/http_router.h @@ -0,0 +1,149 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_HTTP_ROUTER_H +#define RSPAMD_HTTP_ROUTER_H + +#include "config.h" +#include "http_connection.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_http_connection_router; +struct rspamd_http_connection_entry; + +typedef int (*rspamd_http_router_handler_t)(struct rspamd_http_connection_entry + *conn_ent, + struct rspamd_http_message *msg); + +typedef void (*rspamd_http_router_error_handler_t)(struct rspamd_http_connection_entry *conn_ent, + GError *err); + +typedef void (*rspamd_http_router_finish_handler_t)(struct rspamd_http_connection_entry *conn_ent); + + +struct rspamd_http_connection_entry { + struct rspamd_http_connection_router *rt; + struct rspamd_http_connection *conn; + gpointer ud; + gboolean is_reply; + gboolean support_gzip; + struct rspamd_http_connection_entry *prev, *next; +}; + +struct rspamd_http_connection_router { + struct rspamd_http_connection_entry *conns; + GHashTable *paths; + GHashTable *response_headers; + GPtrArray *regexps; + ev_tstamp timeout; + struct ev_loop *event_loop; + struct rspamd_http_context *ctx; + gchar *default_fs_path; + rspamd_http_router_handler_t unknown_method_handler; + struct rspamd_cryptobox_keypair *key; + rspamd_http_router_error_handler_t error_handler; + rspamd_http_router_finish_handler_t finish_handler; +}; + +/** + * Create new http connection router and the associated HTTP connection + * @param eh error handler callback + * @param fh finish handler callback + * @param default_fs_path if not NULL try to serve static files from + * the specified directory + * 
@return + */ +struct rspamd_http_connection_router *rspamd_http_router_new( + rspamd_http_router_error_handler_t eh, + rspamd_http_router_finish_handler_t fh, + ev_tstamp timeout, + const char *default_fs_path, + struct rspamd_http_context *ctx); + +/** + * Set encryption key for the HTTP router + * @param router router structure + * @param key opaque key structure + */ +void rspamd_http_router_set_key(struct rspamd_http_connection_router *router, + struct rspamd_cryptobox_keypair *key); + +/** + * Add new path to the router + */ +void rspamd_http_router_add_path(struct rspamd_http_connection_router *router, + const gchar *path, rspamd_http_router_handler_t handler); + +/** + * Add custom header to append to router replies + * @param router + * @param name + * @param value + */ +void rspamd_http_router_add_header(struct rspamd_http_connection_router *router, + const gchar *name, const gchar *value); + +/** + * Sets method to handle unknown request methods + * @param router + * @param handler + */ +void rspamd_http_router_set_unknown_handler(struct rspamd_http_connection_router *router, + rspamd_http_router_handler_t handler); + +/** + * Inserts router headers to the outbound message + * @param router + * @param msg + */ +void rspamd_http_router_insert_headers(struct rspamd_http_connection_router *router, + struct rspamd_http_message *msg); + +struct rspamd_regexp_s; + +/** + * Adds new pattern to router, regexp object is refcounted by this function + * @param router + * @param re + * @param handler + */ +void rspamd_http_router_add_regexp(struct rspamd_http_connection_router *router, + struct rspamd_regexp_s *re, rspamd_http_router_handler_t handler); + +/** + * Handle new accepted socket + * @param router router object + * @param fd server socket + * @param ud opaque userdata + */ +void rspamd_http_router_handle_socket( + struct rspamd_http_connection_router *router, + gint fd, + gpointer ud); + +/** + * Free router and all connections associated + * @param router 
+ */ +void rspamd_http_router_free(struct rspamd_http_connection_router *router); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/http/http_util.c b/src/libserver/http/http_util.c new file mode 100644 index 0000000..d5c4a57 --- /dev/null +++ b/src/libserver/http/http_util.c @@ -0,0 +1,295 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "libserver/http/http_util.h" +#include "libutil/printf.h" +#include "libutil/util.h" + +static const gchar *http_week[] = {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}; +static const gchar *http_month[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; + +/* + * Obtained from nginx + * Copyright (C) Igor Sysoev + */ +static guint mday[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; + +time_t +rspamd_http_parse_date(const gchar *header, gsize len) +{ + const gchar *p, *end; + gint month; + guint day, year, hour, min, sec; + guint64 time; + enum { + no = 0, + rfc822, /* Tue, 10 Nov 2002 23:50:13 */ + rfc850, /* Tuesday, 10-Dec-02 23:50:13 */ + isoc /* Tue Dec 10 23:50:13 2002 */ + } fmt; + + fmt = 0; + if (len > 0) { + end = header + len; + } + else { + end = header + strlen(header); + } + + day = 32; + year = 2038; + + for (p = header; p < end; p++) { + if (*p == ',') { + break; + } + + if (*p == ' ') { + fmt = isoc; + break; + } + } + + for (p++; p < end; p++) + if (*p != ' ') { + break; + } + + 
if (end - p < 18) { + return (time_t) -1; + } + + if (fmt != isoc) { + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9') { + return (time_t) -1; + } + + day = (*p - '0') * 10 + *(p + 1) - '0'; + p += 2; + + if (*p == ' ') { + if (end - p < 18) { + return (time_t) -1; + } + fmt = rfc822; + } + else if (*p == '-') { + fmt = rfc850; + } + else { + return (time_t) -1; + } + + p++; + } + + switch (*p) { + + case 'J': + month = *(p + 1) == 'a' ? 0 : *(p + 2) == 'n' ? 5 + : 6; + break; + + case 'F': + month = 1; + break; + + case 'M': + month = *(p + 2) == 'r' ? 2 : 4; + break; + + case 'A': + month = *(p + 1) == 'p' ? 3 : 7; + break; + + case 'S': + month = 8; + break; + + case 'O': + month = 9; + break; + + case 'N': + month = 10; + break; + + case 'D': + month = 11; + break; + + default: + return (time_t) -1; + } + + p += 3; + + if ((fmt == rfc822 && *p != ' ') || (fmt == rfc850 && *p != '-')) { + return (time_t) -1; + } + + p++; + + if (fmt == rfc822) { + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9' || *(p + 2) < '0' || *(p + 2) > '9' || *(p + 3) < '0' || *(p + 3) > '9') { + return (time_t) -1; + } + + year = (*p - '0') * 1000 + (*(p + 1) - '0') * 100 + (*(p + 2) - '0') * 10 + *(p + 3) - '0'; + p += 4; + } + else if (fmt == rfc850) { + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9') { + return (time_t) -1; + } + + year = (*p - '0') * 10 + *(p + 1) - '0'; + year += (year < 70) ? 
2000 : 1900; + p += 2; + } + + if (fmt == isoc) { + if (*p == ' ') { + p++; + } + + if (*p < '0' || *p > '9') { + return (time_t) -1; + } + + day = *p++ - '0'; + + if (*p != ' ') { + if (*p < '0' || *p > '9') { + return (time_t) -1; + } + + day = day * 10 + *p++ - '0'; + } + + if (end - p < 14) { + return (time_t) -1; + } + } + + if (*p++ != ' ') { + return (time_t) -1; + } + + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9') { + return (time_t) -1; + } + + hour = (*p - '0') * 10 + *(p + 1) - '0'; + p += 2; + + if (*p++ != ':') { + return (time_t) -1; + } + + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9') { + return (time_t) -1; + } + + min = (*p - '0') * 10 + *(p + 1) - '0'; + p += 2; + + if (*p++ != ':') { + return (time_t) -1; + } + + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9') { + return (time_t) -1; + } + + sec = (*p - '0') * 10 + *(p + 1) - '0'; + + if (fmt == isoc) { + p += 2; + + if (*p++ != ' ') { + return (time_t) -1; + } + + if (*p < '0' || *p > '9' || *(p + 1) < '0' || *(p + 1) > '9' || *(p + 2) < '0' || *(p + 2) > '9' || *(p + 3) < '0' || *(p + 3) > '9') { + return (time_t) -1; + } + + year = (*p - '0') * 1000 + (*(p + 1) - '0') * 100 + (*(p + 2) - '0') * 10 + *(p + 3) - '0'; + } + + if (hour > 23 || min > 59 || sec > 59) { + return (time_t) -1; + } + + if (day == 29 && month == 1) { + if ((year & 3) || ((year % 100 == 0) && (year % 400) != 0)) { + return (time_t) -1; + } + } + else if (day > mday[month]) { + return (time_t) -1; + } + + /* + * shift new year to March 1 and start months from 1 (not 0), + * it is needed for Gauss' formula + */ + + if (--month <= 0) { + month += 12; + year -= 1; + } + + /* Gauss' formula for Gregorian days since March 1, 1 BC */ + + time = (guint64) ( + /* days in years including leap years since March 1, 1 BC */ + + 365 * year + year / 4 - year / 100 + year / 400 + + /* days before the month */ + + + 367 * month / 12 - 30 + + /* days before the day */ + + + day - 1 + + /* + * 
719527 days were between March 1, 1 BC and March 1, 1970, + * 31 and 28 days were in January and February 1970 + */ + + - 719527 + 31 + 28) * + 86400 + + hour * 3600 + min * 60 + sec; + + return (time_t) time; +} + +glong rspamd_http_date_format(gchar *buf, gsize len, time_t time) +{ + struct tm tms; + + rspamd_gmtime(time, &tms); + + return rspamd_snprintf(buf, len, "%s, %02d %s %4d %02d:%02d:%02d GMT", + http_week[tms.tm_wday], tms.tm_mday, + http_month[tms.tm_mon], tms.tm_year + 1900, + tms.tm_hour, tms.tm_min, tms.tm_sec); +}
\ No newline at end of file diff --git a/src/libserver/http/http_util.h b/src/libserver/http/http_util.h new file mode 100644 index 0000000..ec57508 --- /dev/null +++ b/src/libserver/http/http_util.h @@ -0,0 +1,47 @@ +/*- + * Copyright 2019 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTTP_UTIL_H +#define RSPAMD_HTTP_UTIL_H + +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Parse HTTP date header and return it as time_t + * @param header HTTP date header + * @param len length of header + * @return time_t or (time_t)-1 in case of error + */ +time_t rspamd_http_parse_date(const gchar *header, gsize len); + +/** + * Prints HTTP date from `time` to `buf` using standard HTTP date format + * @param buf date buffer + * @param len length of buffer + * @param time time in unix seconds + * @return number of bytes written + */ +glong rspamd_http_date_format(gchar *buf, gsize len, time_t time); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx new file mode 100644 index 0000000..7d1ecf3 --- /dev/null +++ b/src/libserver/hyperscan_tools.cxx @@ -0,0 +1,627 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" + +#ifdef WITH_HYPERSCAN +#include <string> +#include <filesystem> +#include "contrib/ankerl/unordered_dense.h" +#include "contrib/ankerl/svector.h" +#include "fmt/core.h" +#include "libutil/cxx/file_util.hxx" +#include "libutil/cxx/error.hxx" +#include "hs.h" +#include "logger.h" +#include "worker_util.h" +#include "hyperscan_tools.h" + +#include <glob.h> /* for glob */ +#include <unistd.h> /* for unlink */ +#include <optional> +#include <cstdlib> /* for std::getenv */ +#include "unix-std.h" +#include "rspamd_control.h" + +#define HYPERSCAN_LOG_TAG "hsxxxx" + +// Hyperscan does not provide any API to check validity of it's databases +// However, it is required for us to perform migrations properly without +// failing at `hs_alloc_scratch` phase or even `hs_scan` which is **way too late** +// Hence, we have to check hyperscan internal guts to prevent that situation... + +#ifdef HS_MAJOR +#ifndef HS_VERSION_32BIT +#define HS_VERSION_32BIT ((HS_MAJOR << 24) | (HS_MINOR << 16) | (HS_PATCH << 8) | 0) +#endif +#endif// defined(HS_MAJOR) + +#if !defined(HS_DB_VERSION) && defined(HS_VERSION_32BIT) +#define HS_DB_VERSION HS_VERSION_32BIT +#endif + +#ifndef HS_DB_MAGIC +#define HS_DB_MAGIC (0xdbdbdbdbU) +#endif + +#define msg_info_hyperscan(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "hyperscan", HYPERSCAN_LOG_TAG, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_hyperscan_lambda(...) 
rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "hyperscan", HYPERSCAN_LOG_TAG, \ + log_func, \ + __VA_ARGS__) +#define msg_err_hyperscan(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "hyperscan", HYPERSCAN_LOG_TAG, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_hyperscan(...) rspamd_conditional_debug_fast(nullptr, nullptr, \ + rspamd_hyperscan_log_id, "hyperscan", HYPERSCAN_LOG_TAG, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_hyperscan_lambda(...) rspamd_conditional_debug_fast(nullptr, nullptr, \ + rspamd_hyperscan_log_id, "hyperscan", HYPERSCAN_LOG_TAG, \ + log_func, \ + __VA_ARGS__) + +INIT_LOG_MODULE_PUBLIC(hyperscan) + +namespace rspamd::util { + +/* + * A singleton class that is responsible for deletion of the outdated hyperscan files + * One issue is that it must know about HS files in all workers, which is a problem + * TODO: we need to export hyperscan caches from all workers to a single place where + * we can clean them up (probably, to the main process) + */ +class hs_known_files_cache { +private: + // These fields are filled when we add new known cache files + ankerl::svector<std::string, 4> cache_dirs; + ankerl::svector<std::string, 8> cache_extensions; + ankerl::unordered_dense::set<std::string> known_cached_files; + bool loaded = false; + +private: + hs_known_files_cache() = default; + + virtual ~hs_known_files_cache() + { + // Cleanup cache dir + cleanup_maybe(); + } + +public: + hs_known_files_cache(const hs_known_files_cache &) = delete; + hs_known_files_cache(hs_known_files_cache &&) = delete; + + static auto get() -> hs_known_files_cache & + { + static hs_known_files_cache *singleton = nullptr; + + if (singleton == nullptr) { + singleton = new hs_known_files_cache; + } + + return *singleton; + } + + void add_cached_file(const raii_file &file) + { + auto fpath = std::filesystem::path{file.get_name()}; + std::error_code ec; + + fpath = std::filesystem::canonical(fpath, ec); + + if (ec && ec.value() != 0) { + 
msg_err_hyperscan("invalid path: \"%s\", error message: %s", fpath.c_str(), ec.message().c_str()); + return; + } + + auto dir = fpath.parent_path(); + auto ext = fpath.extension(); + + if (std::find_if(cache_dirs.begin(), cache_dirs.end(), + [&](const auto &item) { return item == dir; }) == std::end(cache_dirs)) { + cache_dirs.emplace_back(std::string{dir}); + } + if (std::find_if(cache_extensions.begin(), cache_extensions.end(), + [&](const auto &item) { return item == ext; }) == std::end(cache_extensions)) { + cache_extensions.emplace_back(std::string{ext}); + } + + auto is_known = known_cached_files.insert(fpath.string()); + msg_debug_hyperscan("added %s hyperscan file: %s", + is_known.second ? "new" : "already known", + fpath.c_str()); + } + + void add_cached_file(const char *fname) + { + auto fpath = std::filesystem::path{fname}; + std::error_code ec; + + fpath = std::filesystem::canonical(fpath, ec); + + if (ec && ec.value() != 0) { + msg_err_hyperscan("invalid path: \"%s\", error message: %s", fname, ec.message().c_str()); + return; + } + + auto dir = fpath.parent_path(); + auto ext = fpath.extension(); + + if (std::find_if(cache_dirs.begin(), cache_dirs.end(), + [&](const auto &item) { return item == dir; }) == std::end(cache_dirs)) { + cache_dirs.emplace_back(dir.string()); + } + if (std::find_if(cache_extensions.begin(), cache_extensions.end(), + [&](const auto &item) { return item == ext; }) == std::end(cache_extensions)) { + cache_extensions.emplace_back(ext.string()); + } + + auto is_known = known_cached_files.insert(fpath.string()); + msg_debug_hyperscan("added %s hyperscan file: %s", + is_known.second ? 
"new" : "already known", + fpath.c_str()); + } + + void delete_cached_file(const char *fname) + { + auto fpath = std::filesystem::path{fname}; + std::error_code ec; + + fpath = std::filesystem::canonical(fpath, ec); + + if (ec && ec.value() != 0) { + msg_err_hyperscan("invalid path to remove: \"%s\", error message: %s", + fname, ec.message().c_str()); + return; + } + + if (fpath.empty()) { + msg_err_hyperscan("attempt to remove an empty hyperscan file!"); + return; + } + + if (unlink(fpath.c_str()) == -1) { + msg_err_hyperscan("cannot remove hyperscan file %s: %s", + fpath.c_str(), strerror(errno)); + } + else { + msg_debug_hyperscan("removed hyperscan file %s", fpath.c_str()); + } + + known_cached_files.erase(fpath.string()); + } + + auto cleanup_maybe() -> void + { + auto env_cleanup_disable = std::getenv("RSPAMD_NO_CLEANUP"); + /* We clean dir merely if we are running from the main process */ + if (rspamd_current_worker == nullptr && env_cleanup_disable == nullptr && loaded) { + const auto *log_func = RSPAMD_LOG_FUNC; + auto cleanup_dir = [&](std::string_view dir) -> void { + for (const auto &ext: cache_extensions) { + glob_t globbuf; + + auto glob_pattern = fmt::format("{}{}*{}", + dir, G_DIR_SEPARATOR_S, ext); + msg_debug_hyperscan_lambda("perform glob for pattern: %s", + glob_pattern.c_str()); + memset(&globbuf, 0, sizeof(globbuf)); + + if (glob(glob_pattern.c_str(), 0, nullptr, &globbuf) == 0) { + for (auto i = 0; i < globbuf.gl_pathc; i++) { + auto path = std::string{globbuf.gl_pathv[i]}; + std::size_t nsz; + struct stat st; + + rspamd_normalize_path_inplace(path.data(), path.size(), &nsz); + path.resize(nsz); + + if (stat(path.c_str(), &st) == -1) { + msg_debug_hyperscan_lambda("cannot stat file %s: %s", + path.c_str(), strerror(errno)); + continue; + } + + if (S_ISREG(st.st_mode)) { + if (!known_cached_files.contains(path)) { + msg_info_hyperscan_lambda("remove stale hyperscan file %s", path.c_str()); + unlink(path.c_str()); + } + else { + 
msg_debug_hyperscan_lambda("found known hyperscan file %s, size: %Hz", + path.c_str(), st.st_size); + } + } + } + } + + globfree(&globbuf); + } + }; + + for (const auto &dir: cache_dirs) { + msg_info_hyperscan("cleaning up directory %s", dir.c_str()); + cleanup_dir(dir); + } + + cache_dirs.clear(); + cache_extensions.clear(); + known_cached_files.clear(); + } + else if (rspamd_current_worker == nullptr && env_cleanup_disable != nullptr) { + msg_info_hyperscan("disable hyperscan cleanup: env variable RSPAMD_NO_CLEANUP is set"); + } + else if (!loaded) { + msg_info_hyperscan("disable hyperscan cleanup: not loaded"); + } + } + + auto notice_loaded() -> void + { + loaded = true; + } +}; + + +/** + * This is a higher level representation of the cached hyperscan file + */ +struct hs_shared_database { + hs_database_t *db = nullptr; /**< internal database (might be in a shared memory) */ + std::optional<raii_mmaped_file> maybe_map; + std::string cached_path; + + ~hs_shared_database() + { + if (!maybe_map) { + hs_free_database(db); + } + // Otherwise, handled by maybe_map dtor + } + + explicit hs_shared_database(raii_mmaped_file &&map, hs_database_t *db) + : db(db), maybe_map(std::move(map)) + { + cached_path = maybe_map.value().get_file().get_name(); + } + explicit hs_shared_database(hs_database_t *db, const char *fname) + : db(db), maybe_map(std::nullopt) + { + if (fname) { + cached_path = fname; + } + else { + /* Likely a test case */ + cached_path = ""; + } + } + hs_shared_database(const hs_shared_database &other) = delete; + hs_shared_database() = default; + hs_shared_database(hs_shared_database &&other) noexcept + { + *this = std::move(other); + } + hs_shared_database &operator=(hs_shared_database &&other) noexcept + { + std::swap(db, other.db); + std::swap(maybe_map, other.maybe_map); + return *this; + } +}; + +struct real_hs_db { + std::uint32_t magic; + std::uint32_t version; + std::uint32_t length; + std::uint64_t platform; + std::uint32_t crc32; +}; +static auto 
+hs_is_valid_database(void *raw, std::size_t len, std::string_view fname) -> tl::expected<bool, std::string> +{ + if (len < sizeof(real_hs_db)) { + return tl::make_unexpected(fmt::format("cannot load hyperscan database from {}: too short", fname)); + } + + static real_hs_db test; + + memcpy(&test, raw, sizeof(test)); + + if (test.magic != HS_DB_MAGIC) { + return tl::make_unexpected(fmt::format("cannot load hyperscan database from {}: invalid magic: {} ({} expected)", + fname, test.magic, HS_DB_MAGIC)); + } + +#ifdef HS_DB_VERSION + if (test.version != HS_DB_VERSION) { + return tl::make_unexpected(fmt::format("cannot load hyperscan database from {}: invalid version: {} ({} expected)", + fname, test.version, HS_DB_VERSION)); + } +#endif + + return true; +} + +static auto +hs_shared_from_unserialized(hs_known_files_cache &hs_cache, raii_mmaped_file &&map) -> tl::expected<hs_shared_database, error> +{ + auto ptr = map.get_map(); + auto db = (hs_database_t *) ptr; + + auto is_valid = hs_is_valid_database(map.get_map(), map.get_size(), map.get_file().get_name()); + if (!is_valid) { + return tl::make_unexpected(error{is_valid.error(), -1, error_category::IMPORTANT}); + } + + hs_cache.add_cached_file(map.get_file()); + return tl::expected<hs_shared_database, error>{tl::in_place, std::move(map), db}; +} + +static auto +hs_shared_from_serialized(hs_known_files_cache &hs_cache, raii_mmaped_file &&map, std::int64_t offset) -> tl::expected<hs_shared_database, error> +{ + hs_database_t *target = nullptr; + + if (auto ret = hs_deserialize_database((const char *) map.get_map() + offset, + map.get_size() - offset, &target); + ret != HS_SUCCESS) { + return tl::make_unexpected(error{"cannot deserialize database", ret}); + } + + hs_cache.add_cached_file(map.get_file()); + return tl::expected<hs_shared_database, error>{tl::in_place, target, map.get_file().get_name().data()}; +} + +auto load_cached_hs_file(const char *fname, std::int64_t offset = 0) -> tl::expected<hs_shared_database, 
error> +{ + auto &hs_cache = hs_known_files_cache::get(); + const auto *log_func = RSPAMD_LOG_FUNC; + + return raii_mmaped_file::mmap_shared(fname, O_RDONLY, PROT_READ, 0) + .and_then([&]<class T>(T &&cached_serialized) -> tl::expected<hs_shared_database, error> { + if (cached_serialized.get_size() <= offset) { + return tl::make_unexpected(error{"Invalid offset", EINVAL, error_category::CRITICAL}); + } +#if defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + auto unserialized_fname = fmt::format("{}.unser", fname); + auto unserialized_file = raii_locked_file::create(unserialized_fname.c_str(), O_CREAT | O_RDWR | O_EXCL, + 00644) + .and_then([&](auto &&new_file_locked) -> tl::expected<raii_file, error> { + auto tmpfile_pattern = fmt::format("{}{}hsmp-XXXXXXXXXXXXXXXXXX", + cached_serialized.get_file().get_dir(), G_DIR_SEPARATOR); + auto tmpfile = raii_locked_file::mkstemp(tmpfile_pattern.data(), O_CREAT | O_RDWR | O_EXCL, + 00644); + + if (!tmpfile) { + return tl::make_unexpected(tmpfile.error()); + } + else { + auto &tmpfile_checked = tmpfile.value(); + // Store owned string + auto tmpfile_name = std::string{tmpfile_checked.get_name()}; + std::size_t unserialized_size; + + if (auto ret = hs_serialized_database_size(((const char *) cached_serialized.get_map()) + offset, + cached_serialized.get_size() - offset, &unserialized_size); + ret != HS_SUCCESS) { + return tl::make_unexpected(error{ + fmt::format("cannot get unserialized database size: {}", ret), + EINVAL, + error_category::IMPORTANT}); + } + + msg_debug_hyperscan_lambda("multipattern: create new database in %s; %Hz size", + tmpfile_name.c_str(), unserialized_size); + void *buf; +#ifdef HAVE_GETPAGESIZE + auto page_size = getpagesize(); +#else + auto page_size = sysconf(_SC_PAGESIZE); +#endif + if (page_size == -1) { + page_size = 4096; + } + auto errcode = posix_memalign(&buf, page_size, unserialized_size); + if (errcode != 0 || buf == nullptr) { + return 
tl::make_unexpected(error{"Cannot allocate memory", + errno, error_category::CRITICAL}); + } + + if (auto ret = hs_deserialize_database_at(((const char *) cached_serialized.get_map()) + offset, + cached_serialized.get_size() - offset, (hs_database_t *) buf); + ret != HS_SUCCESS) { + return tl::make_unexpected(error{ + fmt::format("cannot deserialize hyperscan database: {}", ret), ret}); + } + else { + if (write(tmpfile_checked.get_fd(), buf, unserialized_size) == -1) { + free(buf); + return tl::make_unexpected(error{fmt::format("cannot write to {}: {}", + tmpfile_name, ::strerror(errno)), + errno, error_category::CRITICAL}); + } + else { + free(buf); + /* + * Unlink target file before renaming to avoid + * race condition. + * So what we have is that `new_file_locked` + * will have flock on that file, so it will be + * replaced after unlink safely, and also unlocked. + */ + (void) unlink(unserialized_fname.c_str()); + if (rename(tmpfile_name.c_str(), + unserialized_fname.c_str()) == -1) { + if (errno != EEXIST) { + msg_info_hyperscan_lambda("cannot rename %s -> %s: %s", + tmpfile_name.c_str(), + unserialized_fname.c_str(), + strerror(errno)); + } + } + else { + /* Unlock file but mark it as immortal first to avoid deletion */ + tmpfile_checked.make_immortal(); + (void) tmpfile_checked.unlock(); + } + } + } + /* Reopen in RO mode */ + return raii_file::open(unserialized_fname.c_str(), O_RDONLY); + }; + }) + .or_else([&](auto unused) -> tl::expected<raii_file, error> { + // Cannot create file, so try to open it in RO mode + return raii_file::open(unserialized_fname.c_str(), O_RDONLY); + }); + + tl::expected<hs_shared_database, error> ret; + + if (unserialized_file.has_value()) { + + auto &unserialized_checked = unserialized_file.value(); + + if (unserialized_checked.get_size() == 0) { + /* + * This is a case when we have a file that is currently + * being created by another process. + * We cannot use it! 
+ */ + ret = hs_shared_from_serialized(hs_cache, std::forward<T>(cached_serialized), offset); + } + else { + ret = raii_mmaped_file::mmap_shared(std::move(unserialized_checked), PROT_READ) + .and_then([&]<class U>(U &&mmapped_unserialized) -> auto { + return hs_shared_from_unserialized(hs_cache, std::forward<U>(mmapped_unserialized)); + }); + } + } + else { + ret = hs_shared_from_serialized(hs_cache, std::forward<T>(cached_serialized), offset); + } +#else // defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 + auto ret = hs_shared_from_serialized(hs_cache, std::forward<T>(cached_serialized), offset); +#endif// defined(HS_MAJOR) && defined(HS_MINOR) && HS_MAJOR >= 5 && HS_MINOR >= 4 \ + // Add serialized file to cache merely if we have successfully loaded the actual db + if (ret.has_value()) { + hs_cache.add_cached_file(cached_serialized.get_file()); + } + return ret; + }); +} +}// namespace rspamd::util + +/* C API */ + +#define CXX_DB_FROM_C(obj) (reinterpret_cast<rspamd::util::hs_shared_database *>(obj)) +#define C_DB_FROM_CXX(obj) (reinterpret_cast<rspamd_hyperscan_t *>(obj)) + +rspamd_hyperscan_t * +rspamd_hyperscan_maybe_load(const char *filename, goffset offset) +{ + auto maybe_db = rspamd::util::load_cached_hs_file(filename, offset); + + if (maybe_db.has_value()) { + auto *ndb = new rspamd::util::hs_shared_database; + *ndb = std::move(maybe_db.value()); + return C_DB_FROM_CXX(ndb); + } + else { + auto error = maybe_db.error(); + + switch (error.category) { + case rspamd::util::error_category::CRITICAL: + msg_err_hyperscan("critical error when trying to load cached hyperscan: %s", + error.error_message.data()); + break; + case rspamd::util::error_category::IMPORTANT: + msg_info_hyperscan("error when trying to load cached hyperscan: %s", + error.error_message.data()); + break; + default: + msg_debug_hyperscan("error when trying to load cached hyperscan: %s", + error.error_message.data()); + break; + } + } + + return nullptr; +} + 
+hs_database_t * +rspamd_hyperscan_get_database(rspamd_hyperscan_t *db) +{ + auto *real_db = CXX_DB_FROM_C(db); + return real_db->db; +} + +rspamd_hyperscan_t * +rspamd_hyperscan_from_raw_db(hs_database_t *db, const char *fname) +{ + auto *ndb = new rspamd::util::hs_shared_database{db, fname}; + + return C_DB_FROM_CXX(ndb); +} + +void rspamd_hyperscan_free(rspamd_hyperscan_t *db, bool invalid) +{ + auto *real_db = CXX_DB_FROM_C(db); + + if (invalid && !real_db->cached_path.empty()) { + rspamd::util::hs_known_files_cache::get().delete_cached_file(real_db->cached_path.c_str()); + } + delete real_db; +} + +void rspamd_hyperscan_notice_known(const char *fname) +{ + rspamd::util::hs_known_files_cache::get().add_cached_file(fname); + + if (rspamd_current_worker != nullptr) { + /* Also notify main process */ + struct rspamd_srv_command notice_cmd; + + if (strlen(fname) >= sizeof(notice_cmd.cmd.hyperscan_cache_file.path)) { + msg_err("internal error: length of the filename %d ('%s') is larger than control buffer path: %d", + (int) strlen(fname), fname, (int) sizeof(notice_cmd.cmd.hyperscan_cache_file.path)); + } + else { + notice_cmd.type = RSPAMD_SRV_NOTICE_HYPERSCAN_CACHE; + rspamd_strlcpy(notice_cmd.cmd.hyperscan_cache_file.path, fname, sizeof(notice_cmd.cmd.hyperscan_cache_file.path)); + rspamd_srv_send_command(rspamd_current_worker, + rspamd_current_worker->srv->event_loop, ¬ice_cmd, -1, + nullptr, + nullptr); + } + } +} + +void rspamd_hyperscan_cleanup_maybe(void) +{ + rspamd::util::hs_known_files_cache::get().cleanup_maybe(); +} + +void rspamd_hyperscan_notice_loaded(void) +{ + rspamd::util::hs_known_files_cache::get().notice_loaded(); +} + +#endif// WITH_HYPERSCAN
\ No newline at end of file diff --git a/src/libserver/hyperscan_tools.h b/src/libserver/hyperscan_tools.h new file mode 100644 index 0000000..624b7b0 --- /dev/null +++ b/src/libserver/hyperscan_tools.h @@ -0,0 +1,77 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" + +#ifndef RSPAMD_HYPERSCAN_TOOLS_H +#define RSPAMD_HYPERSCAN_TOOLS_H + +#ifdef WITH_HYPERSCAN + +#include "hs.h" + +G_BEGIN_DECLS + +/** + * Opaque structure that represents hyperscan (maybe shared/cached database) + */ +typedef struct rspamd_hyperscan_s rspamd_hyperscan_t; + +/** + * Maybe load or mmap shared a hyperscan from a file + * @param filename + * @return cached database if available + */ +rspamd_hyperscan_t *rspamd_hyperscan_maybe_load(const char *filename, goffset offset); + +/** + * Creates a wrapper for a raw hs db. Ownership is transferred to the enclosing object returned + * @param filename + * @return + */ +rspamd_hyperscan_t *rspamd_hyperscan_from_raw_db(hs_database_t *db, const char *fname); +/** + * Get the internal database + * @param db + * @return + */ +hs_database_t *rspamd_hyperscan_get_database(rspamd_hyperscan_t *db); +/** + * Free the database + * @param db + */ +void rspamd_hyperscan_free(rspamd_hyperscan_t *db, bool invalid); + +/** + * Notice a known hyperscan file (e.g. 
externally serialized) + * @param fname + */ +void rspamd_hyperscan_notice_known(const char *fname); + +/** + * Notice that hyperscan files are all loaded (e.g. in the main process), so we can cleanup old files on termination + */ +void rspamd_hyperscan_notice_loaded(void); + +/** + * Cleans up old files. This method should be called on config free (in the main process) + */ +void rspamd_hyperscan_cleanup_maybe(void); + +G_END_DECLS + +#endif + +#endif diff --git a/src/libserver/logger.h b/src/libserver/logger.h new file mode 100644 index 0000000..8d4e313 --- /dev/null +++ b/src/libserver/logger.h @@ -0,0 +1,403 @@ +#ifndef RSPAMD_LOGGER_H +#define RSPAMD_LOGGER_H + +#include "config.h" +#include "radix.h" +#include "util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef G_LOG_LEVEL_USER_SHIFT +#define G_LOG_LEVEL_USER_SHIFT 8 +#endif + +#define RSPAMD_LOG_ID_LEN 6 + +struct rspamd_config; + +enum rspamd_log_flags { + RSPAMD_LOG_FORCED = (1 << G_LOG_LEVEL_USER_SHIFT), + RSPAMD_LOG_ENCRYPTED = (1 << (G_LOG_LEVEL_USER_SHIFT + 1)), + RSPAMD_LOG_LEVEL_MASK = ~(RSPAMD_LOG_FORCED | RSPAMD_LOG_ENCRYPTED) +}; + +typedef struct rspamd_logger_s rspamd_logger_t; +typedef bool (*rspamd_log_func_t)(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *logger, + gpointer arg); +typedef void *(*rspamd_log_init_func)(rspamd_logger_t *logger, + struct rspamd_config *cfg, + uid_t uid, gid_t gid, + GError **err); +typedef bool (*rspamd_log_on_fork_func)(rspamd_logger_t *logger, + struct rspamd_config *cfg, + gpointer arg, + GError **err); +typedef void *(*rspamd_log_reload_func)(rspamd_logger_t *logger, + struct rspamd_config *cfg, + gpointer arg, + uid_t uid, gid_t gid, + GError **err); +typedef void (*rspamd_log_dtor_func)(rspamd_logger_t *logger, + gpointer arg); + +struct rspamd_logger_funcs { + rspamd_log_init_func init; + rspamd_log_reload_func reload; + rspamd_log_dtor_func dtor; + 
/**
 * Open specific (configured logging). The returned logger also becomes the
 * default logger.
 * @param pool memory pool used to allocate the logger; when NULL the logger is heap-allocated
 * @param config configuration providing log type, level and flags; when NULL console logging is used
 * @param ptype process type string stored in the logger
 * @param uid user id passed to the backend init function
 * @param gid group id passed to the backend init function
 * @return new logger object, or NULL if the backend could not be initialised
 */
rspamd_logger_t *rspamd_log_open_specific(rspamd_mempool_t *pool,
										  struct rspamd_config *config,
										  const gchar *ptype,
										  uid_t uid, gid_t gid);
/*
 * Macro to use for faster debug modules.
 * Defines a file-local `rspamd_<mname>_log_id` variable (initially -1) and a
 * RSPAMD_CONSTRUCTOR block that stores in it the ID returned by
 * rspamd_logger_add_debug_module("<mname>").
 */
#define INIT_LOG_MODULE(mname)                                            \
	static gint rspamd_##mname##_log_id = -1;                             \
	RSPAMD_CONSTRUCTOR(rspamd_##mname##_log_init)                         \
	{                                                                     \
		rspamd_##mname##_log_id = rspamd_logger_add_debug_module(#mname); \
	}


/*
 * Same as INIT_LOG_MODULE, but `rspamd_<mname>_log_id` is not static, so other
 * translation units can reach it through EXTERN_LOG_MODULE_DEF(mname).
 */
#define INIT_LOG_MODULE_PUBLIC(mname)                                     \
	gint rspamd_##mname##_log_id = -1;                                    \
	RSPAMD_CONSTRUCTOR(rspamd_##mname##_log_init)                         \
	{                                                                     \
		rspamd_##mname##_log_id = rspamd_logger_add_debug_module(#mname); \
	}

/*
 * Declares (without defining) the module ID variable of `mname`.
 * No trailing semicolon: the user of the macro supplies it.
 */
#define EXTERN_LOG_MODULE_DEF(mname) \
	extern gint rspamd_##mname##_log_id
/**
 * Varargs version of default log function
 * @param level_flags GLogLevelFlags level, optionally OR-ed with enum rspamd_log_flags
 * @param module module name attached to the message (may be NULL)
 * @param id identifier attached to the message (may be NULL)
 * @param function name of the calling function
 * @param fmt format string
 * @param args arguments consumed by `fmt`
 * @return the result of rspamd_common_logv() invoked with a NULL logger
 */
bool rspamd_default_logv(gint level_flags,
						 const gchar *module, const gchar *id,
						 const gchar *function,
						 const gchar *fmt,
						 va_list args);
/*
 * Debug message for code that has a `task` variable in scope.
 * Passes a NULL logger, the task source address, the precomputed
 * `rspamd_task_log_id`, the "task" module name and the uid of the task pool.
 * Expands to exactly the same call as msg_debug_task.
 */
#define debug_task(...) rspamd_conditional_debug_fast(NULL,                                       \
													  task->from_addr,                            \
													  rspamd_task_log_id, "task", task->task_pool->tag.uid, \
													  RSPAMD_LOG_FUNC,                            \
													  __VA_ARGS__)
/*
 * NULL-safe variants of msg_info_task / msg_notice_task: when `task` is NULL
 * the message is logged without a tag name and uid.
 *
 * The levels follow the unchecked macros: "info" maps to G_LOG_LEVEL_INFO and
 * "notice" maps to G_LOG_LEVEL_MESSAGE (they used to be swapped here).
 */
#define msg_info_task_check(...) rspamd_default_log_function(G_LOG_LEVEL_INFO,                                                      \
															 task ? task->task_pool->tag.tagname : NULL, task ? task->task_pool->tag.uid : NULL, \
															 RSPAMD_LOG_FUNC,                                                       \
															 __VA_ARGS__)
#define msg_notice_task_check(...) rspamd_default_log_function(G_LOG_LEVEL_MESSAGE,                                                   \
															   task ? task->task_pool->tag.tagname : NULL, task ? task->task_pool->tag.uid : NULL, \
															   RSPAMD_LOG_FUNC,                                                       \
															   __VA_ARGS__)
/*
 * Debug message for code that has a `pool` variable in scope; the pool tag
 * name is used as the module and the pool uid as the identifier.
 * Unlike the *_task debug macros this goes through rspamd_conditional_debug(),
 * which takes the module by name instead of a precomputed module ID.
 * `pool` must not be NULL here; use msg_debug_pool_check otherwise.
 */
#define msg_debug_pool(...) rspamd_conditional_debug(NULL, NULL,                       \
													 pool->tag.tagname, pool->tag.uid, \
													 RSPAMD_LOG_FUNC,                  \
													 __VA_ARGS__)
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "logger.h" +#include "rspamd.h" +#include "libserver/maps/map.h" +#include "libserver/maps/map_helpers.h" +#include "ottery.h" +#include "unix-std.h" +#include "logger_private.h" + + +static rspamd_logger_t *default_logger = NULL; +static rspamd_logger_t *emergency_logger = NULL; +static struct rspamd_log_modules *log_modules = NULL; + +static const gchar lf_chr = '\n'; + +guint rspamd_task_log_id = (guint) -1; +RSPAMD_CONSTRUCTOR(rspamd_task_log_init) +{ + rspamd_task_log_id = rspamd_logger_add_debug_module("task"); +} + +rspamd_logger_t * +rspamd_log_default_logger(void) +{ + return default_logger; +} + +rspamd_logger_t * +rspamd_log_emergency_logger(void) +{ + return emergency_logger; +} + +void rspamd_log_set_log_level(rspamd_logger_t *logger, gint level) +{ + if (logger == NULL) { + logger = default_logger; + } + + logger->log_level = level; +} + +gint rspamd_log_get_log_level(rspamd_logger_t *logger) +{ + if (logger == NULL) { + logger = default_logger; + } + + return logger->log_level; +} + +void rspamd_log_set_log_flags(rspamd_logger_t *logger, gint flags) +{ + g_assert(logger != NULL); + + logger->flags = flags; +} + +void rspamd_log_close(rspamd_logger_t *logger) +{ + g_assert(logger != NULL); + + if (logger->closed) { + return; + } + + logger->closed = TRUE; + + if (logger->debug_ip) { + rspamd_map_helper_destroy_radix(logger->debug_ip); + } + + if (logger->pk) { + rspamd_pubkey_unref(logger->pk); + } + + if (logger->keypair) { + rspamd_keypair_unref(logger->keypair); + } + + logger->ops.dtor(logger, 
logger->ops.specific); + + /* TODO: Do we really need that ? */ + if (logger == default_logger) { + default_logger = NULL; + } + + if (logger == emergency_logger) { + emergency_logger = NULL; + } + + if (!logger->pool) { + g_free(logger); + } +} + +bool rspamd_log_reopen(rspamd_logger_t *rspamd_log, struct rspamd_config *cfg, + uid_t uid, gid_t gid) +{ + void *nspec; + GError *err = NULL; + + g_assert(rspamd_log != NULL); + + nspec = rspamd_log->ops.reload(rspamd_log, cfg, rspamd_log->ops.specific, + uid, gid, &err); + + if (nspec != NULL) { + rspamd_log->ops.specific = nspec; + } + else { + } + + return nspec != NULL; +} + +static void +rspamd_emergency_logger_dtor(gpointer d) +{ + rspamd_logger_t *logger = (rspamd_logger_t *) d; + + rspamd_log_close(logger); +} + +rspamd_logger_t * +rspamd_log_open_emergency(rspamd_mempool_t *pool, gint flags) +{ + rspamd_logger_t *logger; + GError *err = NULL; + + g_assert(default_logger == NULL); + g_assert(emergency_logger == NULL); + + if (pool) { + logger = rspamd_mempool_alloc0(pool, sizeof(rspamd_logger_t)); + logger->mtx = rspamd_mempool_get_mutex(pool); + } + else { + logger = g_malloc0(sizeof(rspamd_logger_t)); + } + + logger->flags = flags; + logger->pool = pool; + logger->process_type = "main"; + logger->pid = getpid(); + + const struct rspamd_logger_funcs *funcs = &console_log_funcs; + memcpy(&logger->ops, funcs, sizeof(*funcs)); + + logger->ops.specific = logger->ops.init(logger, NULL, -1, -1, &err); + + if (logger->ops.specific == NULL) { + rspamd_fprintf(stderr, "fatal error: cannot init console logging: %e\n", + err); + g_error_free(err); + + exit(EXIT_FAILURE); + } + + default_logger = logger; + emergency_logger = logger; + + rspamd_mempool_add_destructor(pool, rspamd_emergency_logger_dtor, + emergency_logger); + + return logger; +} + +rspamd_logger_t * +rspamd_log_open_specific(rspamd_mempool_t *pool, + struct rspamd_config *cfg, + const gchar *ptype, + uid_t uid, gid_t gid) +{ + rspamd_logger_t *logger; + 
GError *err = NULL; + + if (pool) { + logger = rspamd_mempool_alloc0(pool, sizeof(rspamd_logger_t)); + logger->mtx = rspamd_mempool_get_mutex(pool); + } + else { + logger = g_malloc0(sizeof(rspamd_logger_t)); + } + + logger->pool = pool; + + if (cfg) { + if (cfg->log_error_elts > 0 && pool) { + logger->errlog = rspamd_mempool_alloc0_shared(pool, + sizeof(*logger->errlog)); + logger->errlog->pool = pool; + logger->errlog->max_elts = cfg->log_error_elts; + logger->errlog->elt_len = cfg->log_error_elt_maxlen; + logger->errlog->elts = rspamd_mempool_alloc0_shared(pool, + sizeof(struct rspamd_logger_error_elt) * cfg->log_error_elts + + cfg->log_error_elt_maxlen * cfg->log_error_elts); + } + + logger->log_level = cfg->log_level; + logger->flags = cfg->log_flags; + + if (!(logger->flags & RSPAMD_LOG_FLAG_ENFORCED)) { + logger->log_level = cfg->log_level; + } + } + + const struct rspamd_logger_funcs *funcs = NULL; + + if (cfg) { + switch (cfg->log_type) { + case RSPAMD_LOG_CONSOLE: + funcs = &console_log_funcs; + break; + case RSPAMD_LOG_SYSLOG: + funcs = &syslog_log_funcs; + break; + case RSPAMD_LOG_FILE: + funcs = &file_log_funcs; + break; + } + } + else { + funcs = &console_log_funcs; + } + + g_assert(funcs != NULL); + memcpy(&logger->ops, funcs, sizeof(*funcs)); + + logger->ops.specific = logger->ops.init(logger, cfg, uid, gid, &err); + + if (emergency_logger && logger->ops.specific == NULL) { + rspamd_common_log_function(emergency_logger, G_LOG_LEVEL_CRITICAL, + "logger", NULL, G_STRFUNC, + "cannot open specific logger: %e", err); + g_error_free(err); + + return NULL; + } + + logger->pid = getpid(); + logger->process_type = ptype; + logger->enabled = TRUE; + + /* Set up conditional logging */ + if (cfg) { + if (cfg->debug_ip_map != NULL) { + /* Try to add it as map first of all */ + if (logger->debug_ip) { + rspamd_map_helper_destroy_radix(logger->debug_ip); + } + + logger->debug_ip = NULL; + rspamd_config_radix_from_ucl(cfg, + cfg->debug_ip_map, + "IP addresses for 
which debug logs are enabled", + &logger->debug_ip, + NULL, + NULL, "debug ip"); + } + + if (cfg->log_encryption_key) { + logger->pk = rspamd_pubkey_ref(cfg->log_encryption_key); + logger->keypair = rspamd_keypair_new(RSPAMD_KEYPAIR_KEX, + RSPAMD_CRYPTOBOX_MODE_25519); + rspamd_pubkey_calculate_nm(logger->pk, logger->keypair); + } + } + + default_logger = logger; + + return logger; +} + + +/** + * Used after fork() for updating structure params + */ +void rspamd_log_on_fork(GQuark ptype, struct rspamd_config *cfg, + rspamd_logger_t *logger) +{ + logger->pid = getpid(); + logger->process_type = g_quark_to_string(ptype); + + if (logger->ops.on_fork) { + GError *err = NULL; + + bool ret = logger->ops.on_fork(logger, cfg, logger->ops.specific, &err); + + if (!ret && emergency_logger) { + rspamd_common_log_function(emergency_logger, G_LOG_LEVEL_CRITICAL, + "logger", NULL, G_STRFUNC, + "cannot update logging on fork: %e", err); + g_error_free(err); + } + } +} + +inline gboolean +rspamd_logger_need_log(rspamd_logger_t *rspamd_log, GLogLevelFlags log_level, + gint module_id) +{ + g_assert(rspamd_log != NULL); + + if ((log_level & RSPAMD_LOG_FORCED) || + (log_level & (RSPAMD_LOG_LEVEL_MASK & G_LOG_LEVEL_MASK)) <= rspamd_log->log_level) { + return TRUE; + } + + if (module_id != -1 && isset(log_modules->bitset, module_id)) { + return TRUE; + } + + return FALSE; +} + +static gchar * +rspamd_log_encrypt_message(const gchar *begin, const gchar *end, gsize *enc_len, + rspamd_logger_t *rspamd_log) +{ + guchar *out; + gchar *b64; + guchar *p, *nonce, *mac; + const guchar *comp; + guint len, inlen; + + g_assert(end > begin); + /* base64 (pubkey | nonce | message) */ + inlen = rspamd_cryptobox_nonce_bytes(RSPAMD_CRYPTOBOX_MODE_25519) + + rspamd_cryptobox_pk_bytes(RSPAMD_CRYPTOBOX_MODE_25519) + + rspamd_cryptobox_mac_bytes(RSPAMD_CRYPTOBOX_MODE_25519) + + (end - begin); + out = g_malloc(inlen); + + p = out; + comp = rspamd_pubkey_get_pk(rspamd_log->pk, &len); + memcpy(p, comp, len); + 
/**
 * Format a message and pass it to the backend of a logger
 * @param rspamd_log logger to use; NULL selects the default logger
 * @param level_flags GLogLevelFlags level, optionally OR-ed with enum rspamd_log_flags
 * @param module module name (also used to look up the debug module for debug messages)
 * @param id identifier attached to the message
 * @param function name of the calling function
 * @param fmt format string
 * @param args arguments consumed by `fmt`
 * @return result of the backend log function; false when nothing was passed to a backend
 */
bool rspamd_common_logv(rspamd_logger_t *rspamd_log, gint level_flags,
						const gchar *module, const gchar *id, const gchar *function,
						const gchar *fmt, va_list args)
{
	gchar *end;
	/* Level with the rspamd-specific and GLib flag bits stripped */
	gint level = level_flags & (RSPAMD_LOG_LEVEL_MASK & G_LOG_LEVEL_MASK), mod_id;
	bool ret = false;
	gchar logbuf[RSPAMD_LOGBUF_SIZE], *log_line;
	gsize nescaped;

	if (G_UNLIKELY(rspamd_log == NULL)) {
		rspamd_log = default_logger;
	}

	log_line = logbuf;

	if (G_UNLIKELY(rspamd_log == NULL)) {
		/* Just fprintf message to stderr */
		/*
		 * NOTE(review): GLib levels are bit flags where less severe levels
		 * have larger values, so this condition appears to select only INFO
		 * and DEBUG and to drop errors and warnings; confirm whether `<=`
		 * was intended.
		 */
		if (level >= G_LOG_LEVEL_INFO) {
			end = rspamd_vsnprintf(logbuf, sizeof(logbuf), fmt, args);
			rspamd_fprintf(stderr, "%*s\n", (gint) (end - log_line),
						   log_line);
		}
	}
	else {
		if (level == G_LOG_LEVEL_DEBUG) {
			/* Registers the module on first use, otherwise returns its ID */
			mod_id = rspamd_logger_add_debug_module(module);
		}
		else {
			mod_id = -1;
		}

		if (rspamd_logger_need_log(rspamd_log, level_flags, mod_id)) {
			end = rspamd_vsnprintf(logbuf, sizeof(logbuf), fmt, args);

			if (!(rspamd_log->flags & RSPAMD_LOG_FLAG_RSPAMADM)) {
				if ((nescaped = rspamd_log_line_need_escape(logbuf, end - logbuf)) != 0) {
					gsize unescaped_len = end - logbuf;
					/*
					 * Escaped copy lives on the stack; it is sized with 4
					 * extra bytes per escaped character.
					 */
					gchar *logbuf_escaped = g_alloca(unescaped_len + nescaped * 4);
					log_line = logbuf_escaped;

					end = rspamd_log_line_hex_escape(logbuf, unescaped_len,
													 logbuf_escaped, unescaped_len + nescaped * 4);
				}
			}

			if ((level_flags & RSPAMD_LOG_ENCRYPTED) && rspamd_log->pk) {
				gchar *encrypted;
				gsize enc_len;

				/* The backend receives the encrypted text, which is freed afterwards */
				encrypted = rspamd_log_encrypt_message(log_line, end, &enc_len,
													   rspamd_log);
				ret = rspamd_log->ops.log(module, id,
										  function,
										  level_flags,
										  encrypted,
										  enc_len,
										  rspamd_log,
										  rspamd_log->ops.specific);
				g_free(encrypted);
			}
			else {
				ret = rspamd_log->ops.log(module, id,
										  function,
										  level_flags,
										  log_line,
										  end - log_line,
										  rspamd_log,
										  rspamd_log->ops.specific);
			}

			/* Counters: 0 - errors, 1 - warnings, 2 - info, 3 - debug */
			switch (level) {
			case G_LOG_LEVEL_CRITICAL:
				rspamd_log->log_cnt[0]++;
				/*
				 * NOTE(review): the ring buffer receives `log_line`, i.e. the
				 * unencrypted text, even when the message was encrypted for
				 * the backend above; confirm that this is intended.
				 */
				rspamd_log_write_ringbuffer(rspamd_log, module, id, log_line,
											end - log_line);
				break;
			case G_LOG_LEVEL_WARNING:
				rspamd_log->log_cnt[1]++;
				break;
			case G_LOG_LEVEL_INFO:
				rspamd_log->log_cnt[2]++;
				break;
			case G_LOG_LEVEL_DEBUG:
				rspamd_log->log_cnt[3]++;
				break;
			default:
				/* Other levels (e.g. G_LOG_LEVEL_MESSAGE) are not counted */
				break;
			}
		}
	}

	return ret;
}
+{ + static gchar logbuf[LOGBUF_LEN]; + va_list vp; + gchar *end; + gint mod_id; + + if (rspamd_log == NULL) { + rspamd_log = default_logger; + } + + mod_id = rspamd_logger_add_debug_module(module); + + if (rspamd_logger_need_log(rspamd_log, G_LOG_LEVEL_DEBUG, mod_id) || + rspamd_log->is_debug) { + if (rspamd_log->debug_ip && addr != NULL) { + if (rspamd_match_radix_map_addr(rspamd_log->debug_ip, + addr) == NULL) { + return false; + } + } + + va_start(vp, fmt); + end = rspamd_vsnprintf(logbuf, sizeof(logbuf), fmt, vp); + *end = '\0'; + va_end(vp); + return rspamd_log->ops.log(module, id, + function, + G_LOG_LEVEL_DEBUG | RSPAMD_LOG_FORCED, + logbuf, + end - logbuf, + rspamd_log, + rspamd_log->ops.specific); + } + + return false; +} + +bool rspamd_conditional_debug_fast(rspamd_logger_t *rspamd_log, + rspamd_inet_addr_t *addr, + gint mod_id, const gchar *module, const gchar *id, + const gchar *function, const gchar *fmt, ...) +{ + static gchar logbuf[LOGBUF_LEN]; + va_list vp; + gchar *end; + + if (rspamd_log == NULL) { + rspamd_log = default_logger; + } + + if (rspamd_logger_need_log(rspamd_log, G_LOG_LEVEL_DEBUG, mod_id) || + rspamd_log->is_debug) { + if (rspamd_log->debug_ip && addr != NULL) { + if (rspamd_match_radix_map_addr(rspamd_log->debug_ip, addr) == NULL) { + return false; + } + } + + va_start(vp, fmt); + end = rspamd_vsnprintf(logbuf, sizeof(logbuf), fmt, vp); + *end = '\0'; + va_end(vp); + return rspamd_log->ops.log(module, id, + function, + G_LOG_LEVEL_DEBUG | RSPAMD_LOG_FORCED, + logbuf, + end - logbuf, + rspamd_log, + rspamd_log->ops.specific); + } + + return false; +} + +bool rspamd_conditional_debug_fast_num_id(rspamd_logger_t *rspamd_log, + rspamd_inet_addr_t *addr, + gint mod_id, const gchar *module, guint64 id, + const gchar *function, const gchar *fmt, ...) 
+{ + static gchar logbuf[LOGBUF_LEN], idbuf[64]; + va_list vp; + gchar *end; + + if (rspamd_log == NULL) { + rspamd_log = default_logger; + } + + if (rspamd_logger_need_log(rspamd_log, G_LOG_LEVEL_DEBUG, mod_id) || + rspamd_log->is_debug) { + if (rspamd_log->debug_ip && addr != NULL) { + if (rspamd_match_radix_map_addr(rspamd_log->debug_ip, addr) == NULL) { + return false; + } + } + + rspamd_snprintf(idbuf, sizeof(idbuf), "%XuL", id); + va_start(vp, fmt); + end = rspamd_vsnprintf(logbuf, sizeof(logbuf), fmt, vp); + *end = '\0'; + va_end(vp); + return rspamd_log->ops.log(module, idbuf, + function, + G_LOG_LEVEL_DEBUG | RSPAMD_LOG_FORCED, + logbuf, + end - logbuf, + rspamd_log, + rspamd_log->ops.specific); + } + + return false; +} + +/** + * Wrapper for glib logger + */ +void rspamd_glib_log_function(const gchar *log_domain, + GLogLevelFlags log_level, + const gchar *message, + gpointer arg) +{ + rspamd_logger_t *rspamd_log = (rspamd_logger_t *) arg; + + if (rspamd_log->enabled && + rspamd_logger_need_log(rspamd_log, log_level, -1)) { + rspamd_log->ops.log("glib", NULL, + NULL, + log_level, + message, + strlen(message), + rspamd_log, + rspamd_log->ops.specific); + } +} + +void rspamd_glib_printerr_function(const gchar *message) +{ + rspamd_common_log_function(NULL, G_LOG_LEVEL_CRITICAL, "glib", + NULL, G_STRFUNC, + "%s", message); +} + +/** + * Temporary turn on debugging + */ +void rspamd_log_debug(rspamd_logger_t *rspamd_log) +{ + rspamd_log->is_debug = TRUE; +} + +/** + * Turn off temporary debugging + */ +void rspamd_log_nodebug(rspamd_logger_t *rspamd_log) +{ + rspamd_log->is_debug = FALSE; +} + +const guint64 * +rspamd_log_counters(rspamd_logger_t *logger) +{ + if (logger) { + return logger->log_cnt; + } + + return NULL; +} + +static gint +rspamd_log_errlog_cmp(const ucl_object_t **o1, const ucl_object_t **o2) +{ + const ucl_object_t *ts1, *ts2; + + ts1 = ucl_object_lookup(*o1, "ts"); + ts2 = ucl_object_lookup(*o2, "ts"); + + if (ts1 && ts2) { + gdouble t1 = 
ucl_object_todouble(ts1), t2 = ucl_object_todouble(ts2); + + if (t1 > t2) { + return -1; + } + else if (t2 > t1) { + return 1; + } + } + + return 0; +} + +ucl_object_t * +rspamd_log_errorbuf_export(const rspamd_logger_t *logger) +{ + struct rspamd_logger_error_elt *cpy, *cur; + ucl_object_t *top = ucl_object_typed_new(UCL_ARRAY); + guint i; + + if (logger->errlog == NULL) { + return top; + } + + cpy = g_malloc0_n(logger->errlog->max_elts, + sizeof(*cpy) + logger->errlog->elt_len); + memcpy(cpy, logger->errlog->elts, logger->errlog->max_elts * (sizeof(*cpy) + logger->errlog->elt_len)); + + for (i = 0; i < logger->errlog->max_elts; i++) { + cur = (struct rspamd_logger_error_elt *) ((guchar *) cpy + + i * ((sizeof(*cpy) + logger->errlog->elt_len))); + if (cur->completed) { + ucl_object_t *obj = ucl_object_typed_new(UCL_OBJECT); + + ucl_object_insert_key(obj, ucl_object_fromdouble(cur->ts), + "ts", 0, false); + ucl_object_insert_key(obj, ucl_object_fromint(cur->pid), + "pid", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromstring(g_quark_to_string(cur->ptype)), + "type", 0, false); + ucl_object_insert_key(obj, ucl_object_fromstring(cur->id), + "id", 0, false); + ucl_object_insert_key(obj, ucl_object_fromstring(cur->module), + "module", 0, false); + ucl_object_insert_key(obj, ucl_object_fromstring(cur->message), + "message", 0, false); + + ucl_array_append(top, obj); + } + } + + ucl_object_array_sort(top, rspamd_log_errlog_cmp); + g_free(cpy); + + return top; +} + +static guint +rspamd_logger_allocate_mod_bit(void) +{ + if (log_modules->bitset_allocated * NBBY > log_modules->bitset_len + 1) { + log_modules->bitset_len++; + return log_modules->bitset_len - 1; + } + else { + /* Need to expand */ + log_modules->bitset_allocated *= 2; + log_modules->bitset = g_realloc(log_modules->bitset, + log_modules->bitset_allocated); + + return rspamd_logger_allocate_mod_bit(); + } +} + +RSPAMD_DESTRUCTOR(rspamd_debug_modules_dtor) +{ + if (log_modules) { + 
g_hash_table_unref(log_modules->modules); + g_free(log_modules->bitset); + g_free(log_modules); + } +} + +gint rspamd_logger_add_debug_module(const gchar *mname) +{ + struct rspamd_log_module *m; + + if (mname == NULL) { + return -1; + } + + if (log_modules == NULL) { + /* + * This is usually called from constructors, so we call init check + * each time to avoid dependency issues between ctors calls + */ + log_modules = g_malloc0(sizeof(*log_modules)); + log_modules->modules = g_hash_table_new_full(rspamd_strcase_hash, + rspamd_strcase_equal, g_free, g_free); + log_modules->bitset_allocated = 16; + log_modules->bitset_len = 0; + log_modules->bitset = g_malloc0(log_modules->bitset_allocated); + } + + if ((m = g_hash_table_lookup(log_modules->modules, mname)) == NULL) { + m = g_malloc0(sizeof(*m)); + m->mname = g_strdup(mname); + m->id = rspamd_logger_allocate_mod_bit(); + clrbit(log_modules->bitset, m->id); + g_hash_table_insert(log_modules->modules, m->mname, m); + } + + return m->id; +} + +void rspamd_logger_configure_modules(GHashTable *mods_enabled) +{ + GHashTableIter it; + gpointer k, v; + guint id; + + /* Clear all in bitset_allocated -> this are bytes not bits */ + memset(log_modules->bitset, 0, log_modules->bitset_allocated); + /* On first iteration, we go through all modules enabled and add missing ones */ + g_hash_table_iter_init(&it, mods_enabled); + + while (g_hash_table_iter_next(&it, &k, &v)) { + rspamd_logger_add_debug_module((const gchar *) k); + } + + g_hash_table_iter_init(&it, mods_enabled); + + while (g_hash_table_iter_next(&it, &k, &v)) { + id = rspamd_logger_add_debug_module((const gchar *) k); + + if (isclr(log_modules->bitset, id)) { + msg_info("enable debugging for module %s (%d)", (const gchar *) k, + id); + setbit(log_modules->bitset, id); + } + } +} + +struct rspamd_logger_funcs * +rspamd_logger_set_log_function(rspamd_logger_t *logger, + struct rspamd_logger_funcs *nfuncs) +{ + /* TODO: write this */ + + return NULL; +} + + +gchar * 
+rspamd_log_line_hex_escape(const guchar *src, gsize srclen, + gchar *dst, gsize dstlen) +{ + static const gchar hexdigests[16] = "0123456789ABCDEF"; + gchar *d = dst; + + static guint32 escape[] = { + 0xffffffff, /* 1111 1111 1111 1111 1111 1111 1111 1111 */ + + /* ?>=< ;:98 7654 3210 /.-, +*)( '&%$ #"! */ + 0x00000000, /* 0000 0000 0000 0000 0000 0000 0000 0100 */ + + /* _^]\ [ZYX WVUT SRQP ONML KJIH GFED CBA@ */ + 0x00000000, /* 0001 0000 0000 0000 0000 0000 0000 0000 */ + + /* ~}| {zyx wvut srqp onml kjih gfed cba` */ + 0x80000000, /* 1000 0000 0000 0000 0000 0000 0000 0000 */ + + /* Allow all 8bit characters (assuming they are valid utf8) */ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + }; + + while (srclen && dstlen) { + if (escape[*src >> 5] & (1U << (*src & 0x1f))) { + if (dstlen >= 4) { + *d++ = '\\'; + *d++ = 'x'; + *d++ = hexdigests[*src >> 4]; + *d++ = hexdigests[*src & 0xf]; + src++; + dstlen -= 4; + } + else { + /* Overflow */ + break; + } + } + else { + *d++ = *src++; + dstlen--; + } + + srclen--; + } + + return d; +} + +gsize rspamd_log_line_need_escape(const guchar *src, gsize srclen) +{ + static guint32 escape[] = { + 0xffffffff, /* 1111 1111 1111 1111 1111 1111 1111 1111 */ + + /* ?>=< ;:98 7654 3210 /.-, +*)( '&%$ #"! 
*/ + 0x00000000, /* 0000 0000 0000 0000 0000 0000 0000 0100 */ + + /* _^]\ [ZYX WVUT SRQP ONML KJIH GFED CBA@ */ + 0x00000000, /* 0001 0000 0000 0000 0000 0000 0000 0000 */ + + /* ~}| {zyx wvut srqp onml kjih gfed cba` */ + 0x80000000, /* 1000 0000 0000 0000 0000 0000 0000 0000 */ + + /* Allow all 8bit characters (assuming they are valid utf8) */ + 0x00000000, + 0x00000000, + 0x00000000, + 0x00000000, + }; + gsize n = 0; + + while (srclen) { + if (escape[*src >> 5] & (1U << (*src & 0x1f))) { + n++; + } + + src++; + srclen--; + } + + return n; +} + +const gchar * +rspamd_get_log_severity_string(gint level_flags) +{ + unsigned int bitnum; + static const char *level_strs[G_LOG_LEVEL_USER_SHIFT] = { + "", /* G_LOG_FLAG_RECURSION */ + "", /* G_LOG_FLAG_FATAL */ + "crit", + "error", + "warn", + "notice", + "info", + "debug"}; + level_flags &= ((1u << G_LOG_LEVEL_USER_SHIFT) - 1u) & ~(G_LOG_FLAG_RECURSION | G_LOG_FLAG_FATAL); +#ifdef __GNUC__ + /* We assume gcc >= 3 and clang >= 5 anyway */ + bitnum = __builtin_ffs(level_flags) - 1; +#else + bitnum = ffs(level_flags) - 1; +#endif + return level_strs[bitnum]; +} + +static inline void +log_time(gdouble now, rspamd_logger_t *rspamd_log, gchar *timebuf, + size_t len) +{ + time_t sec = (time_t) now; + gsize r; + struct tm tms; + + rspamd_localtime(sec, &tms); + r = strftime(timebuf, len, "%F %H:%M:%S", &tms); + + if (rspamd_log->flags & RSPAMD_LOG_FLAG_USEC) { + gchar usec_buf[16]; + + rspamd_snprintf(usec_buf, sizeof(usec_buf), "%.5f", + now - (gdouble) sec); + rspamd_snprintf(timebuf + r, len - r, + "%s", usec_buf + 1); + } +} + +void rspamd_log_fill_iov(struct rspamd_logger_iov_ctx *iov_ctx, + double ts, + const gchar *module, + const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *logger) +{ + bool log_color = (logger->flags & RSPAMD_LOG_FLAG_COLOR); + bool log_severity = (logger->flags & RSPAMD_LOG_FLAG_SEVERITY); + bool log_rspamadm = (logger->flags & 
RSPAMD_LOG_FLAG_RSPAMADM); + bool log_systemd = (logger->flags & RSPAMD_LOG_FLAG_SYSTEMD); + bool log_json = (logger->flags & RSPAMD_LOG_FLAG_JSON); + + if (log_json) { + /* Some sanity to avoid too many branches */ + log_color = false; + log_severity = true; + log_systemd = false; + } + + glong r; + static gchar timebuf[64], modulebuf[64]; + static gchar tmpbuf[256]; + + if (!log_json && !log_systemd) { + log_time(ts, logger, timebuf, sizeof(timebuf)); + } + + if (G_UNLIKELY(log_json)) { + /* Perform JSON logging */ + guint slen = id ? strlen(id) : strlen("(NULL)"); + slen = MIN(RSPAMD_LOG_ID_LEN, slen); + r = rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "{\"ts\": %f, " + "\"pid\": %P, " + "\"severity\": \"%s\", " + "\"worker_type\": \"%s\", " + "\"id\": \"%*.s\", " + "\"module\": \"%s\", " + "\"function\": \"%s\", " + "\"message\": \"", + ts, + logger->pid, + rspamd_get_log_severity_string(level_flags), + logger->process_type, + slen, id, + module, + function); + iov_ctx->iov[0].iov_base = tmpbuf; + iov_ctx->iov[0].iov_len = r; + /* TODO: is it possible to have other 'bad' symbols here? 
*/ + if (rspamd_memcspn(message, "\"\\\r\n\b\t\v", mlen) == mlen) { + iov_ctx->iov[1].iov_base = (void *) message; + iov_ctx->iov[1].iov_len = mlen; + } + else { + /* We need to do JSON escaping of the quotes */ + const char *p, *end = message + mlen; + long escaped_len; + + for (p = message, escaped_len = 0; p < end; p++, escaped_len++) { + switch (*p) { + case '\v': + case '\0': + escaped_len += 5; + break; + case '\\': + case '"': + case '\n': + case '\r': + case '\b': + case '\t': + escaped_len++; + break; + default: + break; + } + } + + + struct rspamd_logger_iov_thrash_stack *thrash_stack_elt = g_malloc( + sizeof(struct rspamd_logger_iov_thrash_stack) + + escaped_len); + + char *dst = ((char *) thrash_stack_elt) + sizeof(struct rspamd_logger_iov_thrash_stack); + char *d; + + thrash_stack_elt->prev = iov_ctx->thrash_stack; + iov_ctx->thrash_stack = thrash_stack_elt; + + for (p = message, d = dst; p < end; p++, d++) { + switch (*p) { + case '\n': + *d++ = '\\'; + *d = 'n'; + break; + case '\r': + *d++ = '\\'; + *d = 'r'; + break; + case '\b': + *d++ = '\\'; + *d = 'b'; + break; + case '\t': + *d++ = '\\'; + *d = 't'; + break; + case '\f': + *d++ = '\\'; + *d = 'f'; + break; + case '\0': + *d++ = '\\'; + *d++ = 'u'; + *d++ = '0'; + *d++ = '0'; + *d++ = '0'; + *d = '0'; + break; + case '\v': + *d++ = '\\'; + *d++ = 'u'; + *d++ = '0'; + *d++ = '0'; + *d++ = '0'; + *d = 'B'; + break; + case '\\': + *d++ = '\\'; + *d = '\\'; + break; + case '"': + *d++ = '\\'; + *d = '"'; + break; + default: + *d = *p; + break; + } + } + + iov_ctx->iov[1].iov_base = dst; + iov_ctx->iov[1].iov_len = d - dst; + } + iov_ctx->iov[2].iov_base = (void *) "\"}\n"; + iov_ctx->iov[2].iov_len = sizeof("\"}\n") - 1; + + iov_ctx->niov = 3; + } + else if (G_LIKELY(!log_rspamadm)) { + if (!log_systemd) { + if (log_color) { + if (level_flags & (G_LOG_LEVEL_INFO | G_LOG_LEVEL_MESSAGE)) { + /* White */ + r = rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "\033[0;37m"); + } + else if (level_flags & 
G_LOG_LEVEL_WARNING) { + /* Magenta */ + r = rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "\033[0;32m"); + } + else if (level_flags & G_LOG_LEVEL_CRITICAL) { + /* Red */ + r = rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "\033[1;31m"); + } + else { + r = 0; + } + } + else { + r = 0; + } + + if (log_severity) { + r += rspamd_snprintf(tmpbuf + r, + sizeof(tmpbuf) - r, + "%s [%s] #%P(%s) ", + timebuf, + rspamd_get_log_severity_string(level_flags), + logger->pid, + logger->process_type); + } + else { + r += rspamd_snprintf(tmpbuf + r, + sizeof(tmpbuf) - r, + "%s #%P(%s) ", + timebuf, + logger->pid, + logger->process_type); + } + } + else { + r = 0; + r += rspamd_snprintf(tmpbuf + r, + sizeof(tmpbuf) - r, + "(%s) ", + logger->process_type); + } + + glong mremain, mr; + char *m; + + modulebuf[0] = '\0'; + mremain = sizeof(modulebuf); + m = modulebuf; + + if (id != NULL) { + guint slen = strlen(id); + slen = MIN(RSPAMD_LOG_ID_LEN, slen); + mr = rspamd_snprintf(m, mremain, "<%*.s>; ", slen, + id); + m += mr; + mremain -= mr; + } + if (module != NULL) { + mr = rspamd_snprintf(m, mremain, "%s; ", module); + m += mr; + mremain -= mr; + } + if (function != NULL) { + mr = rspamd_snprintf(m, mremain, "%s: ", function); + m += mr; + mremain -= mr; + } + else { + mr = rspamd_snprintf(m, mremain, ": "); + m += mr; + mremain -= mr; + } + + /* Ensure that we have a space at the end */ + if (m > modulebuf && *(m - 1) != ' ') { + *(m - 1) = ' '; + } + + /* Construct IOV for log line */ + iov_ctx->iov[0].iov_base = tmpbuf; + iov_ctx->iov[0].iov_len = r; + iov_ctx->iov[1].iov_base = modulebuf; + iov_ctx->iov[1].iov_len = m - modulebuf; + iov_ctx->iov[2].iov_base = (void *) message; + iov_ctx->iov[2].iov_len = mlen; + iov_ctx->iov[3].iov_base = (void *) &lf_chr; + iov_ctx->iov[3].iov_len = 1; + + iov_ctx->niov = 4; + + if (log_color) { + iov_ctx->iov[4].iov_base = "\033[0m"; + iov_ctx->iov[4].iov_len = sizeof("\033[0m") - 1; + + iov_ctx->niov = 5; + } + } + else { + /* Rspamadm case */ + int niov = 
0; + + if (logger->log_level == G_LOG_LEVEL_DEBUG) { + iov_ctx->iov[niov].iov_base = (void *) timebuf; + iov_ctx->iov[niov++].iov_len = strlen(timebuf); + iov_ctx->iov[niov].iov_base = (void *) " "; + iov_ctx->iov[niov++].iov_len = 1; + } + + iov_ctx->iov[niov].iov_base = (void *) message; + iov_ctx->iov[niov++].iov_len = mlen; + iov_ctx->iov[niov].iov_base = (void *) &lf_chr; + iov_ctx->iov[niov++].iov_len = 1; + + iov_ctx->niov = niov; + } + + // this is kind of "after-the-fact" check, but it's mostly for debugging-only + g_assert(iov_ctx->niov <= G_N_ELEMENTS(iov_ctx->iov)); +} + +void rspamd_log_iov_free(struct rspamd_logger_iov_ctx *iov_ctx) +{ + struct rspamd_logger_iov_thrash_stack *st = iov_ctx->thrash_stack; + + while (st) { + struct rspamd_logger_iov_thrash_stack *nst = st->prev; + g_free(st); + st = nst; + } +}
\ No newline at end of file diff --git a/src/libserver/logger/logger_console.c b/src/libserver/logger/logger_console.c new file mode 100644 index 0000000..7f3c770 --- /dev/null +++ b/src/libserver/logger/logger_console.c @@ -0,0 +1,211 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "logger.h" +#include "libserver/cfg_file.h" +#include "libcryptobox/cryptobox.h" +#include "unix-std.h" + +#include "logger_private.h" + +#define CONSOLE_LOG_QUARK g_quark_from_static_string("console_logger") + +static const gchar lf_chr = '\n'; +struct rspamd_console_logger_priv { + gint fd; + gint crit_fd; +}; + +/* Copy & paste :( */ +static inline void +log_time(gdouble now, rspamd_logger_t *rspamd_log, gchar *timebuf, + size_t len) +{ + time_t sec = (time_t) now; + gsize r; + struct tm tms; + + rspamd_localtime(sec, &tms); + r = strftime(timebuf, len, "%F %H:%M:%S", &tms); + + if (rspamd_log->flags & RSPAMD_LOG_FLAG_USEC) { + gchar usec_buf[16]; + + rspamd_snprintf(usec_buf, sizeof(usec_buf), "%.5f", + now - (gdouble) sec); + rspamd_snprintf(timebuf + r, len - r, + "%s", usec_buf + 1); + } +} + +void * +rspamd_log_console_init(rspamd_logger_t *logger, struct rspamd_config *cfg, + uid_t uid, gid_t gid, GError **err) +{ + struct rspamd_console_logger_priv *priv; + + priv = g_malloc0(sizeof(*priv)); + + if (logger->flags & RSPAMD_LOG_FLAG_RSPAMADM) { + priv->fd = dup(STDOUT_FILENO); + 
priv->crit_fd = dup(STDERR_FILENO); + } + else { + priv->fd = dup(STDERR_FILENO); + priv->crit_fd = priv->fd; + } + + if (priv->fd == -1) { + g_set_error(err, CONSOLE_LOG_QUARK, errno, + "open_log: cannot dup console fd: %s\n", + strerror(errno)); + rspamd_log_console_dtor(logger, priv); + + return NULL; + } + + if (!isatty(priv->fd)) { + if (logger->flags & RSPAMD_LOG_FLAG_COLOR) { + /* Disable colors for not a tty */ + logger->flags &= ~RSPAMD_LOG_FLAG_COLOR; + } + } + + return priv; +} + +void * +rspamd_log_console_reload(rspamd_logger_t *logger, struct rspamd_config *cfg, + gpointer arg, uid_t uid, gid_t gid, GError **err) +{ + struct rspamd_console_logger_priv *npriv; + + npriv = rspamd_log_console_init(logger, cfg, uid, gid, err); + + if (npriv) { + /* Close old */ + rspamd_log_console_dtor(logger, arg); + } + + return npriv; +} + +void rspamd_log_console_dtor(rspamd_logger_t *logger, gpointer arg) +{ + struct rspamd_console_logger_priv *priv = (struct rspamd_console_logger_priv *) arg; + + if (priv->fd != -1) { + if (priv->fd != priv->crit_fd) { + /* Two different FD case */ + if (close(priv->crit_fd) == -1) { + rspamd_fprintf(stderr, "cannot close log crit_fd %d: %s\n", + priv->crit_fd, strerror(errno)); + } + } + + if (close(priv->fd) == -1) { + rspamd_fprintf(stderr, "cannot close log fd %d: %s\n", + priv->fd, strerror(errno)); + } + + /* Avoid the next if to be executed as crit_fd is equal to fd */ + priv->crit_fd = -1; + } + + if (priv->crit_fd != -1) { + if (close(priv->crit_fd) == -1) { + rspamd_fprintf(stderr, "cannot close log crit_fd %d: %s\n", + priv->crit_fd, strerror(errno)); + } + } + + g_free(priv); +} + +bool rspamd_log_console_log(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *rspamd_log, + gpointer arg) +{ + struct rspamd_console_logger_priv *priv = (struct rspamd_console_logger_priv *) arg; + gint fd, r; + double now; + + if (level_flags & 
G_LOG_LEVEL_CRITICAL) { + fd = priv->crit_fd; + } + else { + /* Use stderr if we are in rspamadm mode and severity is more than WARNING */ + if ((rspamd_log->flags & RSPAMD_LOG_FLAG_RSPAMADM) && (level_flags & G_LOG_LEVEL_WARNING)) { + fd = priv->crit_fd; + } + else { + fd = priv->fd; + } + } + +#ifndef DISABLE_PTHREAD_MUTEX + if (rspamd_log->mtx) { + rspamd_mempool_lock_mutex(rspamd_log->mtx); + } + else { + rspamd_file_lock(fd, FALSE); + } +#else + rspamd_file_lock(fd, FALSE); +#endif + + now = rspamd_get_calendar_ticks(); + + struct rspamd_logger_iov_ctx iov_ctx; + memset(&iov_ctx, 0, sizeof(iov_ctx)); + rspamd_log_fill_iov(&iov_ctx, now, module, id, + function, level_flags, message, + mlen, rspamd_log); + +again: + r = writev(fd, iov_ctx.iov, iov_ctx.niov); + + if (r == -1) { + if (errno == EAGAIN || errno == EINTR) { + goto again; + } + + if (rspamd_log->mtx) { + rspamd_mempool_unlock_mutex(rspamd_log->mtx); + } + else { + rspamd_file_unlock(fd, FALSE); + } + + rspamd_log_iov_free(&iov_ctx); + return false; + } + + if (rspamd_log->mtx) { + rspamd_mempool_unlock_mutex(rspamd_log->mtx); + } + else { + rspamd_file_unlock(fd, FALSE); + } + + rspamd_log_iov_free(&iov_ctx); + return true; +}
\ No newline at end of file diff --git a/src/libserver/logger/logger_file.c b/src/libserver/logger/logger_file.c new file mode 100644 index 0000000..20b04b8 --- /dev/null +++ b/src/libserver/logger/logger_file.c @@ -0,0 +1,510 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "logger.h" +#include "libserver/cfg_file.h" +#include "libcryptobox/cryptobox.h" +#include "unix-std.h" + +#include "logger_private.h" + +#define FILE_LOG_QUARK g_quark_from_static_string("file_logger") + +struct rspamd_file_logger_priv { + gint fd; + struct { + guint32 size; + guint32 used; + u_char *buf; + } io_buf; + gboolean throttling; + gchar *log_file; + gboolean is_buffered; + gboolean log_severity; + time_t throttling_time; + guint32 repeats; + guint64 last_line_cksum; + gchar *saved_message; + gsize saved_mlen; + gchar *saved_function; + gchar *saved_module; + gchar *saved_id; + guint saved_loglevel; +}; + +/** + * Calculate checksum for log line (used for repeating logic) + */ +static inline guint64 +rspamd_log_calculate_cksum(const gchar *message, size_t mlen) +{ + return rspamd_cryptobox_fast_hash(message, mlen, rspamd_hash_seed()); +} + +/* + * Write a line to log file (unbuffered) + */ +static bool +direct_write_log_line(rspamd_logger_t *rspamd_log, + struct rspamd_file_logger_priv *priv, + void *data, + gsize count, + gboolean is_iov, + gint level_flags) +{ + struct iovec *iov; + const gchar 
*line; + glong r; + gint fd; + gboolean locked = FALSE; + + iov = (struct iovec *) data; + fd = priv->fd; + + if (!rspamd_log->no_lock) { + gsize tlen; + + if (is_iov) { + tlen = 0; + + for (guint i = 0; i < count; i++) { + tlen += iov[i].iov_len; + } + } + else { + tlen = count; + } + + if (tlen > PIPE_BUF) { + locked = TRUE; + +#ifndef DISABLE_PTHREAD_MUTEX + if (rspamd_log->mtx) { + rspamd_mempool_lock_mutex(rspamd_log->mtx); + } + else { + rspamd_file_lock(fd, FALSE); + } +#else + rspamd_file_lock(fd, FALSE); +#endif + } + } + + if (is_iov) { + r = writev(fd, iov, count); + } + else { + line = (const gchar *) data; + r = write(fd, line, count); + } + + if (locked) { +#ifndef DISABLE_PTHREAD_MUTEX + if (rspamd_log->mtx) { + rspamd_mempool_unlock_mutex(rspamd_log->mtx); + } + else { + rspamd_file_unlock(fd, FALSE); + } +#else + rspamd_file_unlock(fd, FALSE); +#endif + } + + if (r == -1) { + /* We cannot write message to file, so we need to detect error and make decision */ + if (errno == EINTR) { + /* Try again */ + return direct_write_log_line(rspamd_log, priv, data, count, is_iov, level_flags); + } + + if (errno == EFAULT || errno == EINVAL || errno == EFBIG || + errno == ENOSPC) { + /* Rare case */ + priv->throttling = TRUE; + priv->throttling_time = time(NULL); + } + else if (errno == EPIPE || errno == EBADF) { + /* We write to some pipe and it disappears, disable logging or we has opened bad file descriptor */ + rspamd_log->enabled = FALSE; + } + + return false; + } + else if (priv->throttling) { + priv->throttling = FALSE; + } + + return true; +} + +/** + * Fill buffer with message (limits must be checked BEFORE this call) + */ +static void +fill_buffer(rspamd_logger_t *rspamd_log, + struct rspamd_file_logger_priv *priv, + const struct iovec *iov, gint iovcnt) +{ + gint i; + + for (i = 0; i < iovcnt; i++) { + memcpy(priv->io_buf.buf + priv->io_buf.used, + iov[i].iov_base, + iov[i].iov_len); + priv->io_buf.used += iov[i].iov_len; + } +} + +static void 
+rspamd_log_flush(rspamd_logger_t *rspamd_log, struct rspamd_file_logger_priv *priv) +{ + if (priv->is_buffered) { + direct_write_log_line(rspamd_log, + priv, + priv->io_buf.buf, + priv->io_buf.used, + FALSE, + rspamd_log->log_level); + priv->io_buf.used = 0; + } +} + +/* + * Write message to buffer or to file (using direct_write_log_line function) + */ +static bool +file_log_helper(rspamd_logger_t *rspamd_log, + struct rspamd_file_logger_priv *priv, + const struct iovec *iov, + guint iovcnt, + gint level_flags) +{ + size_t len = 0; + guint i; + + if (!priv->is_buffered) { + /* Write string directly */ + return direct_write_log_line(rspamd_log, priv, (void *) iov, iovcnt, + TRUE, level_flags); + } + else { + /* Calculate total length */ + for (i = 0; i < iovcnt; i++) { + len += iov[i].iov_len; + } + /* Fill buffer */ + if (priv->io_buf.size < len) { + /* Buffer is too small to hold this string, so write it directly */ + rspamd_log_flush(rspamd_log, priv); + return direct_write_log_line(rspamd_log, priv, (void *) iov, iovcnt, + TRUE, level_flags); + } + else if (priv->io_buf.used + len >= priv->io_buf.size) { + /* Buffer is full, try to write it directly */ + rspamd_log_flush(rspamd_log, priv); + fill_buffer(rspamd_log, priv, iov, iovcnt); + } + else { + /* Copy incoming string to buffer */ + fill_buffer(rspamd_log, priv, iov, iovcnt); + } + } + + return true; +} + +static void +rspamd_log_reset_repeated(rspamd_logger_t *rspamd_log, + struct rspamd_file_logger_priv *priv) +{ + gchar tmpbuf[256]; + gssize r; + + if (priv->repeats > REPEATS_MIN) { + r = rspamd_snprintf(tmpbuf, + sizeof(tmpbuf), + "Last message repeated %ud times", + priv->repeats - REPEATS_MIN); + priv->repeats = 0; + + if (priv->saved_message) { + rspamd_log_file_log(priv->saved_module, + priv->saved_id, + priv->saved_function, + priv->saved_loglevel | RSPAMD_LOG_FORCED, + priv->saved_message, + priv->saved_mlen, + rspamd_log, + priv); + + g_free(priv->saved_message); + g_free(priv->saved_function); 
+ g_free(priv->saved_module); + g_free(priv->saved_id); + priv->saved_message = NULL; + priv->saved_function = NULL; + priv->saved_module = NULL; + priv->saved_id = NULL; + } + + /* It is safe to use temporary buffer here as it is not static */ + rspamd_log_file_log(NULL, NULL, + G_STRFUNC, + priv->saved_loglevel | RSPAMD_LOG_FORCED, + tmpbuf, + r, + rspamd_log, + priv); + rspamd_log_flush(rspamd_log, priv); + } +} + +static gint +rspamd_try_open_log_fd(rspamd_logger_t *rspamd_log, + struct rspamd_file_logger_priv *priv, + uid_t uid, gid_t gid, + GError **err) +{ + gint fd; + + fd = open(priv->log_file, + O_CREAT | O_WRONLY | O_APPEND, + S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH); + if (fd == -1) { + g_set_error(err, FILE_LOG_QUARK, errno, + "open_log: cannot open desired log file: %s, %s\n", + priv->log_file, strerror(errno)); + return -1; + } + + if (uid != -1 || gid != -1) { + if (fchown(fd, uid, gid) == -1) { + g_set_error(err, FILE_LOG_QUARK, errno, + "open_log: cannot chown desired log file: %s, %s\n", + priv->log_file, strerror(errno)); + close(fd); + + return -1; + } + } + + return fd; +} + +void * +rspamd_log_file_init(rspamd_logger_t *logger, struct rspamd_config *cfg, + uid_t uid, gid_t gid, GError **err) +{ + struct rspamd_file_logger_priv *priv; + + if (!cfg || !cfg->cfg_name) { + g_set_error(err, FILE_LOG_QUARK, EINVAL, + "no log file specified"); + return NULL; + } + + priv = g_malloc0(sizeof(*priv)); + + if (cfg->log_buffered) { + if (cfg->log_buf_size != 0) { + priv->io_buf.size = cfg->log_buf_size; + } + else { + priv->io_buf.size = LOGBUF_LEN; + } + priv->is_buffered = TRUE; + priv->io_buf.buf = g_malloc(priv->io_buf.size); + } + + if (cfg->log_file) { + priv->log_file = g_strdup(cfg->log_file); + } + + priv->log_severity = (logger->flags & RSPAMD_LOG_FLAG_SEVERITY); + priv->fd = rspamd_try_open_log_fd(logger, priv, uid, gid, err); + + if (priv->fd == -1) { + rspamd_log_file_dtor(logger, priv); + + return NULL; + } + + return priv; +} + +void 
rspamd_log_file_dtor(rspamd_logger_t *logger, gpointer arg) +{ + struct rspamd_file_logger_priv *priv = (struct rspamd_file_logger_priv *) arg; + + rspamd_log_reset_repeated(logger, priv); + rspamd_log_flush(logger, priv); + + if (priv->fd != -1) { + if (close(priv->fd) == -1) { + rspamd_fprintf(stderr, "cannot close log fd %d: %s; log file = %s\n", + priv->fd, strerror(errno), priv->log_file); + } + } + + g_free(priv->log_file); + g_free(priv); +} + +bool rspamd_log_file_log(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *rspamd_log, + gpointer arg) +{ + struct rspamd_file_logger_priv *priv = (struct rspamd_file_logger_priv *) arg; + gdouble now; + guint64 cksum; + gboolean got_time = FALSE; + + + if (!(level_flags & RSPAMD_LOG_FORCED) && !rspamd_log->enabled) { + return false; + } + + /* Check throttling due to write errors */ + if (!(level_flags & RSPAMD_LOG_FORCED) && priv->throttling) { + now = rspamd_get_calendar_ticks(); + + if (priv->throttling_time != now) { + priv->throttling_time = now; + got_time = TRUE; + } + else { + /* Do not try to write to file too often while throttling */ + return false; + } + } + + /* Check repeats */ + cksum = rspamd_log_calculate_cksum(message, mlen); + + if (cksum == priv->last_line_cksum) { + priv->repeats++; + + if (priv->repeats > REPEATS_MIN && priv->repeats < + REPEATS_MAX) { + /* Do not log anything but save message for future */ + if (priv->saved_message == NULL) { + priv->saved_function = g_strdup(function); + priv->saved_mlen = mlen; + priv->saved_message = g_malloc(mlen); + memcpy(priv->saved_message, message, mlen); + + if (module) { + priv->saved_module = g_strdup(module); + } + + if (id) { + priv->saved_id = g_strdup(id); + } + + priv->saved_loglevel = level_flags; + } + + return true; + } + else if (priv->repeats > REPEATS_MAX) { + rspamd_log_reset_repeated(rspamd_log, priv); + + bool ret = rspamd_log_file_log(module, id, 
+ function, + level_flags, + message, + mlen, + rspamd_log, + priv); + + /* Probably we have more repeats in future */ + priv->repeats = REPEATS_MIN + 1; + + return ret; + } + } + else { + /* Reset counter if new message differs from saved message */ + priv->last_line_cksum = cksum; + + if (priv->repeats > REPEATS_MIN) { + rspamd_log_reset_repeated(rspamd_log, priv); + return rspamd_log_file_log(module, id, + function, + level_flags, + message, + mlen, + rspamd_log, + arg); + } + else { + priv->repeats = 0; + } + } + if (!got_time) { + now = rspamd_get_calendar_ticks(); + } + + struct rspamd_logger_iov_ctx iov_ctx; + memset(&iov_ctx, 0, sizeof(iov_ctx)); + rspamd_log_fill_iov(&iov_ctx, now, module, id, function, level_flags, message, + mlen, rspamd_log); + + bool ret = file_log_helper(rspamd_log, priv, iov_ctx.iov, iov_ctx.niov, level_flags); + rspamd_log_iov_free(&iov_ctx); + + return ret; +} + +void * +rspamd_log_file_reload(rspamd_logger_t *logger, struct rspamd_config *cfg, + gpointer arg, uid_t uid, gid_t gid, GError **err) +{ + struct rspamd_file_logger_priv *npriv; + + if (!cfg->cfg_name) { + g_set_error(err, FILE_LOG_QUARK, EINVAL, + "no log file specified"); + return NULL; + } + + npriv = rspamd_log_file_init(logger, cfg, uid, gid, err); + + if (npriv) { + /* Close old */ + rspamd_log_file_dtor(logger, arg); + } + + return npriv; +} + +bool rspamd_log_file_on_fork(rspamd_logger_t *logger, struct rspamd_config *cfg, + gpointer arg, GError **err) +{ + struct rspamd_file_logger_priv *priv = (struct rspamd_file_logger_priv *) arg; + + rspamd_log_reset_repeated(logger, priv); + rspamd_log_flush(logger, priv); + + return true; +}
\ No newline at end of file diff --git a/src/libserver/logger/logger_private.h b/src/libserver/logger/logger_private.h new file mode 100644 index 0000000..234a207 --- /dev/null +++ b/src/libserver/logger/logger_private.h @@ -0,0 +1,218 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_LOGGER_PRIVATE_H +#define RSPAMD_LOGGER_PRIVATE_H + +#include "logger.h" + +/* How much message should be repeated before it is count to be repeated one */ +#define REPEATS_MIN 3 +#define REPEATS_MAX 300 +#define LOGBUF_LEN 8192 + +struct rspamd_log_module { + gchar *mname; + guint id; +}; + +struct rspamd_log_modules { + guchar *bitset; + guint bitset_len; /* Number of BITS used in bitset */ + guint bitset_allocated; /* Size of bitset allocated in BYTES */ + GHashTable *modules; +}; + +struct rspamd_logger_error_elt { + gint completed; + GQuark ptype; + pid_t pid; + gdouble ts; + gchar id[RSPAMD_LOG_ID_LEN + 1]; + gchar module[9]; + gchar message[]; +}; + +struct rspamd_logger_error_log { + struct rspamd_logger_error_elt *elts; + rspamd_mempool_t *pool; + guint32 max_elts; + guint32 elt_len; + /* Avoid false cache sharing */ + guchar __padding[64 - sizeof(gpointer) * 2 - sizeof(guint64)]; + guint cur_row; +}; + +/** + * Static structure that store logging parameters + * It is NOT shared between processes and is created by main process + */ +struct rspamd_logger_s { + struct rspamd_logger_funcs ops; + gint 
/*
 * Node of a singly linked list of heap buffers owned by an iov context.
 * Each node is allocated together with its payload: the payload bytes
 * start right after the header.
 */
struct rspamd_logger_iov_thrash_stack {
	struct rspamd_logger_iov_thrash_stack *prev; /* previously pushed node or NULL */
	char data[];                                 /* payload (C99 flexible array member) */
};
*rspamd_log); + +/** + * Frees IOV context + * @param iov_ctx + */ +void rspamd_log_iov_free(struct rspamd_logger_iov_ctx *iov_ctx); +/** + * Escape log line by replacing unprintable characters to hex escapes like \xNN + * @param src + * @param srclen + * @param dst + * @param dstlen + * @return end of the escaped buffer + */ +gchar *rspamd_log_line_hex_escape(const guchar *src, gsize srclen, + gchar *dst, gsize dstlen); +/** + * Returns number of characters to be escaped, e.g. a caller can allocate a new buffer + * the desired number of characters + * @param src + * @param srclen + * @return number of characters to be escaped + */ +gsize rspamd_log_line_need_escape(const guchar *src, gsize srclen); + +static const struct rspamd_logger_funcs file_log_funcs = { + .init = rspamd_log_file_init, + .dtor = rspamd_log_file_dtor, + .reload = rspamd_log_file_reload, + .log = rspamd_log_file_log, + .on_fork = rspamd_log_file_on_fork, +}; + +/* + * Syslog logging + */ +void *rspamd_log_syslog_init(rspamd_logger_t *logger, struct rspamd_config *cfg, + uid_t uid, gid_t gid, GError **err); +void *rspamd_log_syslog_reload(rspamd_logger_t *logger, struct rspamd_config *cfg, + gpointer arg, uid_t uid, gid_t gid, GError **err); +void rspamd_log_syslog_dtor(rspamd_logger_t *logger, gpointer arg); +bool rspamd_log_syslog_log(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *rspamd_log, + gpointer arg); + +static const struct rspamd_logger_funcs syslog_log_funcs = { + .init = rspamd_log_syslog_init, + .dtor = rspamd_log_syslog_dtor, + .reload = rspamd_log_syslog_reload, + .log = rspamd_log_syslog_log, + .on_fork = NULL, +}; + +/* + * Console logging + */ +void *rspamd_log_console_init(rspamd_logger_t *logger, struct rspamd_config *cfg, + uid_t uid, gid_t gid, GError **err); +void *rspamd_log_console_reload(rspamd_logger_t *logger, struct rspamd_config *cfg, + gpointer arg, uid_t uid, gid_t gid, 
GError **err); +void rspamd_log_console_dtor(rspamd_logger_t *logger, gpointer arg); +bool rspamd_log_console_log(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *rspamd_log, + gpointer arg); + +static const struct rspamd_logger_funcs console_log_funcs = { + .init = rspamd_log_console_init, + .dtor = rspamd_log_console_dtor, + .reload = rspamd_log_console_reload, + .log = rspamd_log_console_log, + .on_fork = NULL, +}; + +#endif diff --git a/src/libserver/logger/logger_syslog.c b/src/libserver/logger/logger_syslog.c new file mode 100644 index 0000000..3c4f7f7 --- /dev/null +++ b/src/libserver/logger/logger_syslog.c @@ -0,0 +1,143 @@ +/*- + * Copyright 2020 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config.h" +#include "logger.h" +#include "libserver/cfg_file.h" +#include "logger_private.h" + +#define SYSLOG_LOG_QUARK g_quark_from_static_string("syslog_logger") + +struct rspamd_syslog_logger_priv { + gint log_facility; +}; + +#ifdef HAVE_SYSLOG_H +#include <syslog.h> + +void * +rspamd_log_syslog_init(rspamd_logger_t *logger, struct rspamd_config *cfg, + uid_t uid, gid_t gid, GError **err) +{ + struct rspamd_syslog_logger_priv *priv; + + if (!cfg) { + g_set_error(err, SYSLOG_LOG_QUARK, EINVAL, + "no log config specified"); + return NULL; + } + + priv = g_malloc0(sizeof(*priv)); + + priv->log_facility = cfg->log_facility; + openlog("rspamd", LOG_NDELAY | LOG_PID, priv->log_facility); + + return priv; +} + +void rspamd_log_syslog_dtor(rspamd_logger_t *logger, gpointer arg) +{ + struct rspamd_syslog_logger_priv *priv = (struct rspamd_syslog_logger_priv *) arg; + + closelog(); + g_free(priv); +} +bool rspamd_log_syslog_log(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *rspamd_log, + gpointer arg) +{ + static const struct { + GLogLevelFlags glib_level; + gint syslog_level; + } levels_match[] = { + {G_LOG_LEVEL_DEBUG, LOG_DEBUG}, + {G_LOG_LEVEL_INFO, LOG_INFO}, + {G_LOG_LEVEL_WARNING, LOG_WARNING}, + {G_LOG_LEVEL_CRITICAL, LOG_ERR}}; + unsigned i; + gint syslog_level; + + if (!(level_flags & RSPAMD_LOG_FORCED) && !rspamd_log->enabled) { + return false; + } + + /* Detect level */ + syslog_level = LOG_DEBUG; + + for (i = 0; i < G_N_ELEMENTS(levels_match); i++) { + if (level_flags & levels_match[i].glib_level) { + syslog_level = levels_match[i].syslog_level; + break; + } + } + + syslog(syslog_level, "<%.*s>; %s; %s: %.*s", + RSPAMD_LOG_ID_LEN, id != NULL ? id : "", + module != NULL ? module : "", + function != NULL ? 
function : "", + (gint) mlen, message); + + return true; +} + +#else + +void * +rspamd_log_syslog_init(rspamd_logger_t *logger, struct rspamd_config *cfg, + uid_t uid, gid_t gid, GError **err) +{ + g_set_error(err, SYSLOG_LOG_QUARK, EINVAL, "syslog support is not compiled in"); + + return NULL; +} + +bool rspamd_log_syslog_log(const gchar *module, const gchar *id, + const gchar *function, + gint level_flags, + const gchar *message, + gsize mlen, + rspamd_logger_t *rspamd_log, + gpointer arg) +{ + return false; +} + +void rspamd_log_syslog_dtor(rspamd_logger_t *logger, gpointer arg) +{ + /* Left blank intentionally */ +} + +#endif + +void * +rspamd_log_syslog_reload(rspamd_logger_t *logger, struct rspamd_config *cfg, + gpointer arg, uid_t uid, gid_t gid, GError **err) +{ + struct rspamd_syslog_logger_priv *npriv; + + npriv = rspamd_log_syslog_init(logger, cfg, uid, gid, err); + + if (npriv) { + /* Close old */ + rspamd_log_syslog_dtor(logger, arg); + } + + return npriv; +} diff --git a/src/libserver/maps/map.c b/src/libserver/maps/map.c new file mode 100644 index 0000000..7f6a48f --- /dev/null +++ b/src/libserver/maps/map.c @@ -0,0 +1,3195 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* + * Implementation of map files handling + */ + +#include "config.h" +#include "map.h" +#include "map_private.h" +#include "libserver/http/http_connection.h" +#include "libserver/http/http_private.h" +#include "rspamd.h" +#include "contrib/libev/ev.h" +#include "contrib/uthash/utlist.h" + +#ifdef SYS_ZSTD +#include "zstd.h" +#else +#include "contrib/zstd/zstd.h" +#endif + +#undef MAP_DEBUG_REFS +#ifdef MAP_DEBUG_REFS +#define MAP_RETAIN(x, t) \ + do { \ + msg_err(G_GNUC_PRETTY_FUNCTION ": " t ": retain ref %p, refcount: %d -> %d", (x), (x)->ref.refcount, (x)->ref.refcount + 1); \ + REF_RETAIN(x); \ + } while (0) + +#define MAP_RELEASE(x, t) \ + do { \ + msg_err(G_GNUC_PRETTY_FUNCTION ": " t ": release ref %p, refcount: %d -> %d", (x), (x)->ref.refcount, (x)->ref.refcount - 1); \ + REF_RELEASE(x); \ + } while (0) +#else +#define MAP_RETAIN(x, t) REF_RETAIN(x) +#define MAP_RELEASE(x, t) REF_RELEASE(x) +#endif + +enum rspamd_map_periodic_opts { + RSPAMD_MAP_SCHEDULE_NORMAL = 0, + RSPAMD_MAP_SCHEDULE_ERROR = (1u << 0u), + RSPAMD_MAP_SCHEDULE_LOCKED = (1u << 1u), + RSPAMD_MAP_SCHEDULE_INIT = (1u << 2u), +}; + +static void free_http_cbdata_common(struct http_callback_data *cbd, + gboolean plan_new); +static void free_http_cbdata_dtor(gpointer p); +static void free_http_cbdata(struct http_callback_data *cbd); +static void rspamd_map_process_periodic(struct map_periodic_cbdata *cbd); +static void rspamd_map_schedule_periodic(struct rspamd_map *map, int how); +static gboolean read_map_file_chunks(struct rspamd_map *map, + struct map_cb_data *cbdata, + const gchar *fname, + gsize len, + goffset off); +static gboolean rspamd_map_save_http_cached_file(struct rspamd_map *map, + struct rspamd_map_backend *bk, + struct http_map_data *htdata, + const guchar *data, + gsize len); +static gboolean rspamd_map_update_http_cached_file(struct rspamd_map *map, + struct rspamd_map_backend *bk, + struct http_map_data *htdata); + +guint rspamd_map_log_id = (guint) -1; 
+RSPAMD_CONSTRUCTOR(rspamd_map_log_init) +{ + rspamd_map_log_id = rspamd_logger_add_debug_module("map"); +} + +/** + * Write HTTP request + */ +static void +write_http_request(struct http_callback_data *cbd) +{ + gchar datebuf[128]; + struct rspamd_http_message *msg; + + msg = rspamd_http_new_message(HTTP_REQUEST); + if (cbd->check) { + msg->method = HTTP_HEAD; + } + + msg->url = rspamd_fstring_append(msg->url, + cbd->data->path, strlen(cbd->data->path)); + + if (cbd->check) { + if (cbd->data->last_modified != 0) { + rspamd_http_date_format(datebuf, sizeof(datebuf), + cbd->data->last_modified); + rspamd_http_message_add_header(msg, "If-Modified-Since", + datebuf); + } + if (cbd->data->etag) { + rspamd_http_message_add_header_len(msg, "If-None-Match", + cbd->data->etag->str, cbd->data->etag->len); + } + } + + msg->url = rspamd_fstring_append(msg->url, cbd->data->rest, + strlen(cbd->data->rest)); + + if (cbd->data->userinfo) { + rspamd_http_message_add_header(msg, "Authorization", + cbd->data->userinfo); + } + + MAP_RETAIN(cbd, "http_callback_data"); + rspamd_http_connection_write_message(cbd->conn, + msg, + cbd->data->host, + NULL, + cbd, + cbd->timeout); +} + +/** + * Callback for destroying HTTP callback data + */ +static void +free_http_cbdata_common(struct http_callback_data *cbd, gboolean plan_new) +{ + struct map_periodic_cbdata *periodic = cbd->periodic; + + if (cbd->shmem_data) { + rspamd_http_message_shmem_unref(cbd->shmem_data); + } + + if (cbd->pk) { + rspamd_pubkey_unref(cbd->pk); + } + + if (cbd->conn) { + rspamd_http_connection_unref(cbd->conn); + cbd->conn = NULL; + } + + if (cbd->addrs) { + rspamd_inet_addr_t *addr; + guint i; + + PTR_ARRAY_FOREACH(cbd->addrs, i, addr) + { + rspamd_inet_address_free(addr); + } + + g_ptr_array_free(cbd->addrs, TRUE); + } + + + MAP_RELEASE(cbd->bk, "rspamd_map_backend"); + + if (periodic) { + /* Detached in case of HTTP error */ + MAP_RELEASE(periodic, "periodic"); + } + + g_free(cbd); +} + +static void 
+free_http_cbdata(struct http_callback_data *cbd) +{ + cbd->map->tmp_dtor = NULL; + cbd->map->tmp_dtor_data = NULL; + + free_http_cbdata_common(cbd, TRUE); +} + +static void +free_http_cbdata_dtor(gpointer p) +{ + struct http_callback_data *cbd = p; + struct rspamd_map *map; + + map = cbd->map; + if (cbd->stage == http_map_http_conn) { + REF_RELEASE(cbd); + } + else { + /* We cannot terminate DNS requests sent */ + cbd->stage = http_map_terminated; + } + + msg_warn_map("%s: " + "connection with http server is terminated: worker is stopping", + map->name); +} + +/* + * HTTP callbacks + */ +static void +http_map_error(struct rspamd_http_connection *conn, + GError *err) +{ + struct http_callback_data *cbd = conn->ud; + struct rspamd_map *map; + + map = cbd->map; + + if (cbd->periodic) { + cbd->periodic->errored = TRUE; + msg_err_map("error reading %s(%s): " + "connection with http server terminated incorrectly: %e", + cbd->bk->uri, + cbd->addr ? rspamd_inet_address_to_string_pretty(cbd->addr) : "", + err); + + rspamd_map_process_periodic(cbd->periodic); + } + + MAP_RELEASE(cbd, "http_callback_data"); +} + +static void +rspamd_map_cache_cb(struct ev_loop *loop, ev_timer *w, int revents) +{ + struct rspamd_http_map_cached_cbdata *cache_cbd = (struct rspamd_http_map_cached_cbdata *) + w->data; + struct rspamd_map *map; + struct http_map_data *data; + + map = cache_cbd->map; + data = cache_cbd->data; + + if (cache_cbd->gen != cache_cbd->data->gen) { + /* We have another update, so this cache element is obviously expired */ + /* + * Important!: we do not set cache availability to zero here, as there + * might be fresh cache + */ + msg_info_map("cached data is now expired (gen mismatch %L != %L) for %s; shm name=%s; refcount=%d", + cache_cbd->gen, cache_cbd->data->gen, map->name, cache_cbd->shm->shm_name, + cache_cbd->shm->ref.refcount); + MAP_RELEASE(cache_cbd->shm, "rspamd_http_map_cached_cbdata"); + ev_timer_stop(loop, &cache_cbd->timeout); + g_free(cache_cbd); + } + 
else if (cache_cbd->data->last_checked >= cache_cbd->last_checked) { + /* + * We checked map but we have not found anything more recent, + * reschedule cache check + */ + if (cache_cbd->map->poll_timeout > + rspamd_get_calendar_ticks() - cache_cbd->data->last_checked) { + w->repeat = cache_cbd->map->poll_timeout - + (rspamd_get_calendar_ticks() - cache_cbd->data->last_checked); + } + else { + w->repeat = cache_cbd->map->poll_timeout; + } + + if (w->repeat < 0) { + msg_info_map("cached data for %s has skewed check time: %d last checked, " + "%d poll timeout, %.2f diff; shm name=%s; refcount=%d", + map->name, (int) cache_cbd->data->last_checked, + (int) cache_cbd->map->poll_timeout, + (rspamd_get_calendar_ticks() - cache_cbd->data->last_checked), + cache_cbd->shm->shm_name, + cache_cbd->shm->ref.refcount); + w->repeat = 0.0; + } + + cache_cbd->last_checked = cache_cbd->data->last_checked; + msg_debug_map("cached data is up to date for %s", map->name); + ev_timer_again(loop, &cache_cbd->timeout); + } + else { + data->cur_cache_cbd = NULL; + g_atomic_int_set(&data->cache->available, 0); + msg_info_map("cached data is now expired for %s; shm name=%s; refcount=%d", + map->name, + cache_cbd->shm->shm_name, + cache_cbd->shm->ref.refcount); + MAP_RELEASE(cache_cbd->shm, "rspamd_http_map_cached_cbdata"); + ev_timer_stop(loop, &cache_cbd->timeout); + g_free(cache_cbd); + } +} + +static int +http_map_finish(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg) +{ + struct http_callback_data *cbd = conn->ud; + struct rspamd_map *map; + struct rspamd_map_backend *bk; + struct http_map_data *data; + struct rspamd_http_map_cached_cbdata *cache_cbd; + const rspamd_ftok_t *expires_hdr, *etag_hdr; + char next_check_date[128]; + guchar *in = NULL; + gsize dlen = 0; + + map = cbd->map; + bk = cbd->bk; + data = bk->data.hd; + + if (msg->code == 200) { + + if (cbd->check) { + msg_info_map("need to reread map from %s", cbd->bk->uri); + cbd->periodic->need_modify = TRUE; 
+ /* Reset the whole chain */ + cbd->periodic->cur_backend = 0; + /* Reset cache, old cached data will be cleaned on timeout */ + g_atomic_int_set(&data->cache->available, 0); + data->cur_cache_cbd = NULL; + + rspamd_map_process_periodic(cbd->periodic); + MAP_RELEASE(cbd, "http_callback_data"); + + return 0; + } + + cbd->data->last_checked = msg->date; + + if (msg->last_modified) { + cbd->data->last_modified = msg->last_modified; + } + else { + cbd->data->last_modified = msg->date; + } + + + /* Unsigned version - just open file */ + cbd->shmem_data = rspamd_http_message_shmem_ref(msg); + cbd->data_len = msg->body_buf.len; + + if (cbd->data_len == 0) { + msg_err_map("cannot read empty map"); + goto err; + } + + g_assert(cbd->shmem_data != NULL); + + in = rspamd_shmem_xmap(cbd->shmem_data->shm_name, PROT_READ, &dlen); + + if (in == NULL) { + msg_err_map("cannot read tempfile %s: %s", + cbd->shmem_data->shm_name, + strerror(errno)); + goto err; + } + + /* Check for expires */ + double cached_timeout = map->poll_timeout * 2; + + expires_hdr = rspamd_http_message_find_header(msg, "Expires"); + + if (expires_hdr) { + time_t hdate; + + hdate = rspamd_http_parse_date(expires_hdr->begin, expires_hdr->len); + + if (hdate != (time_t) -1 && hdate > msg->date) { + cached_timeout = map->next_check - msg->date + + map->poll_timeout * 2; + + map->next_check = hdate; + } + else { + msg_info_map("invalid expires header: %T, ignore it", expires_hdr); + map->next_check = 0; + } + } + + /* Check for etag */ + etag_hdr = rspamd_http_message_find_header(msg, "ETag"); + + if (etag_hdr) { + if (cbd->data->etag) { + /* Remove old etag */ + rspamd_fstring_free(cbd->data->etag); + } + + cbd->data->etag = rspamd_fstring_new_init(etag_hdr->begin, + etag_hdr->len); + } + else { + if (cbd->data->etag) { + /* Remove and clear old etag */ + rspamd_fstring_free(cbd->data->etag); + cbd->data->etag = NULL; + } + } + + MAP_RETAIN(cbd->shmem_data, "shmem_data"); + cbd->data->gen++; + /* + * We know that 
a map is in the locked state + */ + g_atomic_int_set(&data->cache->available, 1); + /* Store cached data */ + rspamd_strlcpy(data->cache->shmem_name, cbd->shmem_data->shm_name, + sizeof(data->cache->shmem_name)); + data->cache->len = cbd->data_len; + data->cache->last_modified = cbd->data->last_modified; + cache_cbd = g_malloc0(sizeof(*cache_cbd)); + cache_cbd->shm = cbd->shmem_data; + cache_cbd->event_loop = cbd->event_loop; + cache_cbd->map = map; + cache_cbd->data = cbd->data; + cache_cbd->last_checked = cbd->data->last_checked; + cache_cbd->gen = cbd->data->gen; + MAP_RETAIN(cache_cbd->shm, "shmem_data"); + msg_info_map("stored map data in a shared memory cache: %s", + cache_cbd->shm->shm_name); + + ev_timer_init(&cache_cbd->timeout, rspamd_map_cache_cb, cached_timeout, + 0.0); + ev_timer_start(cbd->event_loop, &cache_cbd->timeout); + cache_cbd->timeout.data = cache_cbd; + data->cur_cache_cbd = cache_cbd; + + if (map->next_check) { + rspamd_http_date_format(next_check_date, sizeof(next_check_date), + map->next_check); + } + else { + rspamd_http_date_format(next_check_date, sizeof(next_check_date), + rspamd_get_calendar_ticks() + map->poll_timeout); + } + + + if (cbd->bk->is_compressed) { + ZSTD_DStream *zstream; + ZSTD_inBuffer zin; + ZSTD_outBuffer zout; + guchar *out; + gsize outlen, r; + + zstream = ZSTD_createDStream(); + ZSTD_initDStream(zstream); + + zin.pos = 0; + zin.src = in; + zin.size = dlen; + + if ((outlen = ZSTD_getDecompressedSize(zin.src, zin.size)) == 0) { + outlen = ZSTD_DStreamOutSize(); + } + + out = g_malloc(outlen); + + zout.dst = out; + zout.pos = 0; + zout.size = outlen; + + while (zin.pos < zin.size) { + r = ZSTD_decompressStream(zstream, &zout, &zin); + + if (ZSTD_isError(r)) { + msg_err_map("%s(%s): cannot decompress data: %s", + cbd->bk->uri, + rspamd_inet_address_to_string_pretty(cbd->addr), + ZSTD_getErrorName(r)); + ZSTD_freeDStream(zstream); + g_free(out); + MAP_RELEASE(cbd->shmem_data, "shmem_data"); + goto err; + } + + if 
(zout.pos == zout.size) { + /* We need to extend output buffer */ + zout.size = zout.size * 2 + 1; + out = g_realloc(zout.dst, zout.size); + zout.dst = out; + } + } + + ZSTD_freeDStream(zstream); + msg_info_map("%s(%s): read map data %z bytes compressed, " + "%z uncompressed, next check at %s", + cbd->bk->uri, + rspamd_inet_address_to_string_pretty(cbd->addr), + dlen, zout.pos, next_check_date); + map->read_callback(out, zout.pos, &cbd->periodic->cbdata, TRUE); + rspamd_map_save_http_cached_file(map, bk, cbd->data, out, zout.pos); + g_free(out); + } + else { + msg_info_map("%s(%s): read map data %z bytes, next check at %s", + cbd->bk->uri, + rspamd_inet_address_to_string_pretty(cbd->addr), + dlen, next_check_date); + rspamd_map_save_http_cached_file(map, bk, cbd->data, in, cbd->data_len); + map->read_callback(in, cbd->data_len, &cbd->periodic->cbdata, TRUE); + } + + MAP_RELEASE(cbd->shmem_data, "shmem_data"); + + cbd->periodic->cur_backend++; + munmap(in, dlen); + rspamd_map_process_periodic(cbd->periodic); + } + else if (msg->code == 304 && cbd->check) { + /* Not modified: only refresh the check/expiry metadata */ + cbd->data->last_checked = msg->date; + + if (msg->last_modified) { + cbd->data->last_modified = msg->last_modified; + } + else { + cbd->data->last_modified = msg->date; + } + + expires_hdr = rspamd_http_message_find_header(msg, "Expires"); + + if (expires_hdr) { + time_t hdate; + + hdate = rspamd_http_parse_date(expires_hdr->begin, expires_hdr->len); + if (hdate != (time_t) -1 && hdate > msg->date) { + map->next_check = hdate; + } + else { + msg_info_map("invalid expires header: %T, ignore it", expires_hdr); + map->next_check = 0; + } + } + + etag_hdr = rspamd_http_message_find_header(msg, "ETag"); + + if (etag_hdr) { + /* + * Store the etag sent with the reply even if none was known before; + * a reply without an etag keeps the stored one untouched + */ + if (cbd->data->etag) { + /* Remove old etag */ + rspamd_fstring_free(cbd->data->etag); + } + + cbd->data->etag = rspamd_fstring_new_init(etag_hdr->begin, + etag_hdr->len); + } + + if (map->next_check) { + rspamd_http_date_format(next_check_date, sizeof(next_check_date), + map->next_check); + 
msg_info_map("data is not modified for server %s, next check at %s " + "(http cache based: %T)", + cbd->data->host, next_check_date, expires_hdr); + } + else { + rspamd_http_date_format(next_check_date, sizeof(next_check_date), + rspamd_get_calendar_ticks() + map->poll_timeout); + msg_info_map("data is not modified for server %s, next check at %s " + "(timer based)", + cbd->data->host, next_check_date); + } + + rspamd_map_update_http_cached_file(map, bk, cbd->data); + cbd->periodic->cur_backend++; + rspamd_map_process_periodic(cbd->periodic); + } + else { + msg_info_map("cannot load map %s from %s: HTTP error %d", + bk->uri, cbd->data->host, msg->code); + goto err; + } + + MAP_RELEASE(cbd, "http_callback_data"); + return 0; + +err: + /* + * Error paths taken after the body has been mapped (e.g. a failed + * decompression) arrive here with the mapping still in place + */ + if (in != NULL) { + munmap(in, dlen); + } + + cbd->periodic->errored = 1; + rspamd_map_process_periodic(cbd->periodic); + MAP_RELEASE(cbd, "http_callback_data"); + + return 0; +} + +static gboolean +read_map_file_chunks(struct rspamd_map *map, struct map_cb_data *cbdata, + const gchar *fname, gsize len, goffset off) +{ + gint fd; + gssize r, avail; + gsize buflen = 1024 * 1024; + gchar *pos, *bytes; + + fd = rspamd_file_xopen(fname, O_RDONLY, 0, TRUE); + + if (fd == -1) { + msg_err_map("can't open map for buffered reading %s: %s", + fname, strerror(errno)); + return FALSE; + } + + if (lseek(fd, off, SEEK_SET) == -1) { + msg_err_map("can't seek in map to pos %d for buffered reading %s: %s", + (gint) off, fname, strerror(errno)); + close(fd); + + return FALSE; + } + + buflen = MIN(len, buflen); + bytes = g_malloc(buflen); + avail = buflen; + pos = bytes; + + while ((r = read(fd, pos, avail)) > 0) { + gchar *end = bytes + (pos - bytes) + r; + msg_debug_map("%s: read map chunk, %z bytes", fname, + r); + pos = map->read_callback(bytes, end - bytes, cbdata, r == len); + + if (pos && pos > bytes && pos < end) { + guint remain = end - pos; + + memmove(bytes, pos, remain); + pos = bytes + remain; + /* Need to preserve the remain */ + avail = ((gssize) buflen) - remain; + + if (avail <= 0) 
{ + /* Try realloc, too large element */ + g_assert(buflen >= remain); + bytes = g_realloc(bytes, buflen * 2); + + pos = bytes + remain; /* Adjust */ + avail += buflen; + buflen *= 2; + } + } + else { + avail = buflen; + pos = bytes; + } + + len -= r; + } + + if (r == -1) { + msg_err_map("can't read from map %s: %s", fname, strerror(errno)); + close(fd); + g_free(bytes); + + return FALSE; + } + + close(fd); + g_free(bytes); + + return TRUE; +} + +static gboolean +rspamd_map_check_sig_pk_mem(const guchar *sig, + gsize siglen, + struct rspamd_map *map, + const guchar *input, + gsize inlen, + struct rspamd_cryptobox_pubkey *pk) +{ + GString *b32_key; + gboolean ret = TRUE; + + if (siglen != rspamd_cryptobox_signature_bytes(RSPAMD_CRYPTOBOX_MODE_25519)) { + msg_err_map("can't open signature for %s: invalid size: %z", map->name, siglen); + + ret = FALSE; + } + + if (ret && !rspamd_cryptobox_verify(sig, siglen, input, inlen, + rspamd_pubkey_get_pk(pk, NULL), RSPAMD_CRYPTOBOX_MODE_25519)) { + msg_err_map("can't verify signature for %s: incorrect signature", map->name); + + ret = FALSE; + } + + if (ret) { + b32_key = rspamd_pubkey_print(pk, + RSPAMD_KEYPAIR_BASE32 | RSPAMD_KEYPAIR_PUBKEY); + msg_info_map("verified signature for %s using trusted key %v", + map->name, b32_key); + g_string_free(b32_key, TRUE); + } + + return ret; +} + +static gboolean +rspamd_map_check_file_sig(const char *fname, + struct rspamd_map *map, + struct rspamd_map_backend *bk, + const guchar *input, + gsize inlen) +{ + guchar *data; + struct rspamd_cryptobox_pubkey *pk = NULL; + GString *b32_key; + gboolean ret = TRUE; + gsize len = 0; + gchar fpath[PATH_MAX]; + + if (bk->trusted_pubkey == NULL) { + /* Try to load and check pubkey */ + rspamd_snprintf(fpath, sizeof(fpath), "%s.pub", fname); + data = rspamd_file_xmap(fpath, PROT_READ, &len, TRUE); + + if (data == NULL) { + msg_err_map("can't open pubkey %s: %s", fpath, strerror(errno)); + return FALSE; + } + + pk = rspamd_pubkey_from_base32(data, 
len, RSPAMD_KEYPAIR_SIGN, + RSPAMD_CRYPTOBOX_MODE_25519); + munmap(data, len); + + if (pk == NULL) { + msg_err_map("can't load pubkey %s", fpath); + return FALSE; + } + + /* We just check pk against the trusted db of keys */ + b32_key = rspamd_pubkey_print(pk, + RSPAMD_KEYPAIR_BASE32 | RSPAMD_KEYPAIR_PUBKEY); + g_assert(b32_key != NULL); + + if (g_hash_table_lookup(map->cfg->trusted_keys, b32_key->str) == NULL) { + msg_err_map("pubkey loaded from %s is untrusted: %v", fpath, + b32_key); + g_string_free(b32_key, TRUE); + rspamd_pubkey_unref(pk); + + return FALSE; + } + + g_string_free(b32_key, TRUE); + } + else { + pk = rspamd_pubkey_ref(bk->trusted_pubkey); + } + + /* + * The detached signature is a regular file stored next to the map + * (<fname>.sig), so map it as a file, like the map and the pubkey + */ + rspamd_snprintf(fpath, sizeof(fpath), "%s.sig", fname); + data = rspamd_file_xmap(fpath, PROT_READ, &len, TRUE); + + if (data == NULL) { + msg_err_map("can't open signature %s: %s", fpath, strerror(errno)); + ret = FALSE; + } + + if (ret) { + ret = rspamd_map_check_sig_pk_mem(data, len, map, input, inlen, pk); + munmap(data, len); + } + + rspamd_pubkey_unref(pk); + + return ret; +} + +/** + * Callback for reading data from file + */ +static gboolean +read_map_file(struct rspamd_map *map, struct file_map_data *data, + struct rspamd_map_backend *bk, struct map_periodic_cbdata *periodic) +{ + gchar *bytes; + gsize len; + struct stat st; + + if (map->read_callback == NULL || map->fin_callback == NULL) { + msg_err_map("%s: bad callback for reading map file", + data->filename); + return FALSE; + } + + if (stat(data->filename, &st) == -1) { + /* File does not exist, skipping */ + if (errno != ENOENT) { + msg_err_map("%s: map file is unavailable for reading: %s", + data->filename, strerror(errno)); + + return FALSE; + } + else { + msg_info_map("%s: map file is not found; " + "it will be read automatically if created", + data->filename); + return TRUE; + } + } + + ev_stat_stat(map->event_loop, &data->st_ev); + len = st.st_size; + + if (bk->is_signed) { + bytes = rspamd_file_xmap(data->filename, PROT_READ, &len, TRUE); 
+ + if (bytes == NULL) { + msg_err_map("can't open map %s: %s", data->filename, strerror(errno)); + return FALSE; + } + + if (!rspamd_map_check_file_sig(data->filename, map, bk, bytes, len)) { + munmap(bytes, len); + + return FALSE; + } + + munmap(bytes, len); + } + + if (len > 0) { + if (map->no_file_read) { + /* We just call read callback with backend name */ + map->read_callback(data->filename, strlen(data->filename), + &periodic->cbdata, TRUE); + } + else { + if (bk->is_compressed) { + bytes = rspamd_file_xmap(data->filename, PROT_READ, &len, TRUE); + + if (bytes == NULL) { + msg_err_map("can't open map %s: %s", data->filename, strerror(errno)); + return FALSE; + } + + ZSTD_DStream *zstream; + ZSTD_inBuffer zin; + ZSTD_outBuffer zout; + guchar *out; + gsize outlen, r; + + zstream = ZSTD_createDStream(); + ZSTD_initDStream(zstream); + + zin.pos = 0; + zin.src = bytes; + zin.size = len; + + if ((outlen = ZSTD_getDecompressedSize(zin.src, zin.size)) == 0) { + outlen = ZSTD_DStreamOutSize(); + } + + out = g_malloc(outlen); + + zout.dst = out; + zout.pos = 0; + zout.size = outlen; + + while (zin.pos < zin.size) { + r = ZSTD_decompressStream(zstream, &zout, &zin); + + if (ZSTD_isError(r)) { + msg_err_map("%s: cannot decompress data: %s", + data->filename, + ZSTD_getErrorName(r)); + ZSTD_freeDStream(zstream); + g_free(out); + munmap(bytes, len); + return FALSE; + } + + if (zout.pos == zout.size) { + /* We need to extend output buffer */ + zout.size = zout.size * 2 + 1; + out = g_realloc(zout.dst, zout.size); + zout.dst = out; + } + } + + ZSTD_freeDStream(zstream); + msg_info_map("%s: read map data, %z bytes compressed, " + "%z uncompressed)", + data->filename, + len, zout.pos); + map->read_callback(out, zout.pos, &periodic->cbdata, TRUE); + g_free(out); + + munmap(bytes, len); + } + else { + /* Perform buffered read: fail-safe */ + if (!read_map_file_chunks(map, &periodic->cbdata, data->filename, + len, 0)) { + return FALSE; + } + } + } + } + else { + /* Empty map */ 
+ map->read_callback(NULL, 0, &periodic->cbdata, TRUE); + } + + return TRUE; +} + +static gboolean +read_map_static(struct rspamd_map *map, struct static_map_data *data, + struct rspamd_map_backend *bk, struct map_periodic_cbdata *periodic) +{ + guchar *bytes; + gsize len; + + if (map->read_callback == NULL || map->fin_callback == NULL) { + msg_err_map("%s: bad callback for reading map file", map->name); + data->processed = TRUE; + return FALSE; + } + + bytes = data->data; + len = data->len; + + if (len > 0) { + if (bk->is_compressed) { + ZSTD_DStream *zstream; + ZSTD_inBuffer zin; + ZSTD_outBuffer zout; + guchar *out; + gsize outlen, r; + + zstream = ZSTD_createDStream(); + ZSTD_initDStream(zstream); + + zin.pos = 0; + zin.src = bytes; + zin.size = len; + + if ((outlen = ZSTD_getDecompressedSize(zin.src, zin.size)) == 0) { + outlen = ZSTD_DStreamOutSize(); + } + + out = g_malloc(outlen); + + zout.dst = out; + zout.pos = 0; + zout.size = outlen; + + while (zin.pos < zin.size) { + r = ZSTD_decompressStream(zstream, &zout, &zin); + + if (ZSTD_isError(r)) { + msg_err_map("%s: cannot decompress data: %s", + map->name, + ZSTD_getErrorName(r)); + ZSTD_freeDStream(zstream); + g_free(out); + + return FALSE; + } + + if (zout.pos == zout.size) { + /* We need to extend output buffer */ + zout.size = zout.size * 2 + 1; + out = g_realloc(zout.dst, zout.size); + zout.dst = out; + } + } + + ZSTD_freeDStream(zstream); + msg_info_map("%s: read map data, %z bytes compressed, " + "%z uncompressed)", + map->name, + len, zout.pos); + map->read_callback(out, zout.pos, &periodic->cbdata, TRUE); + g_free(out); + } + else { + msg_info_map("%s: read map data, %z bytes", + map->name, len); + map->read_callback(bytes, len, &periodic->cbdata, TRUE); + } + } + else { + map->read_callback(NULL, 0, &periodic->cbdata, TRUE); + } + + data->processed = TRUE; + + return TRUE; +} + +static void +rspamd_map_periodic_dtor(struct map_periodic_cbdata *periodic) +{ + struct rspamd_map *map; + + map = 
periodic->map; + msg_debug_map("periodic dtor %p", periodic); + + if (periodic->need_modify || periodic->cbdata.errored) { + /* Need to notify the real data structure */ + periodic->map->fin_callback(&periodic->cbdata, periodic->map->user_data); + + if (map->on_load_function) { + map->on_load_function(map, map->on_load_ud); + } + } + else { + /* Not modified */ + } + + if (periodic->locked) { + g_atomic_int_set(periodic->map->locked, 0); + msg_debug_map("unlocked map %s", periodic->map->name); + + if (periodic->map->wrk->state == rspamd_worker_state_running) { + /* `how` is a mask of rspamd_map_periodic_opts, so use that enum */ + rspamd_map_schedule_periodic(periodic->map, + RSPAMD_MAP_SCHEDULE_NORMAL); + } + else { + msg_debug_map("stop scheduling periodics for %s; terminating state", + periodic->map->name); + } + } + + g_free(periodic); +} + +/* Called on timer execution */ +static void +rspamd_map_periodic_callback(struct ev_loop *loop, ev_timer *w, int revents) +{ + struct map_periodic_cbdata *cbd = (struct map_periodic_cbdata *) w->data; + + MAP_RETAIN(cbd, "periodic"); + ev_timer_stop(loop, w); + rspamd_map_process_periodic(cbd); + MAP_RELEASE(cbd, "periodic"); +} + +static void +rspamd_map_schedule_periodic(struct rspamd_map *map, int how) +{ + const gdouble error_mult = 20.0, lock_mult = 0.1; + static const gdouble min_timer_interval = 2.0; + const gchar *reason = "unknown reason"; + gdouble jittered_sec; + gdouble timeout; + struct map_periodic_cbdata *cbd; + + if (map->scheduled_check || (map->wrk && + map->wrk->state != rspamd_worker_state_running)) { + /* + * Do not schedule check if some check is already scheduled or + * if worker is going to die + */ + return; + } + + if (!(how & RSPAMD_MAP_SCHEDULE_INIT) && map->static_only) { + /* No need to schedule anything for static maps */ + return; + } + + if (map->non_trivial && map->next_check != 0) { + timeout = map->next_check - rspamd_get_calendar_ticks(); + map->next_check = 0; + + if (timeout > 0 && timeout < map->poll_timeout) { + /* Early check case, jitter */ + 
gdouble poll_timeout = map->poll_timeout; + + if (how & RSPAMD_MAP_SCHEDULE_ERROR) { + poll_timeout = map->poll_timeout * error_mult; + reason = "early active non-trivial check (after error)"; + } + else if (how & RSPAMD_MAP_SCHEDULE_LOCKED) { + poll_timeout = map->poll_timeout * lock_mult; + reason = "early active non-trivial check (after being locked)"; + } + else { + reason = "early active non-trivial check"; + } + + jittered_sec = MIN(timeout, poll_timeout); + } + else if (timeout <= 0) { + /* Data is already expired, need to check */ + if (how & RSPAMD_MAP_SCHEDULE_ERROR) { + /* In case of error we still need to increase delay */ + jittered_sec = map->poll_timeout * error_mult; + reason = "expired non-trivial data (after error)"; + } + else { + jittered_sec = 0.0; + reason = "expired non-trivial data"; + } + } + else { + /* No need to check now, wait till next_check */ + jittered_sec = timeout; + reason = "valid non-trivial data"; + } + } + else { + /* No valid information when to check a map, plan a timer based check */ + timeout = map->poll_timeout; + + if (how & RSPAMD_MAP_SCHEDULE_INIT) { + if (map->active_http) { + /* Spill maps load to get better chances to hit ssl cache */ + timeout = rspamd_time_jitter(0.0, 2.0); + } + else { + timeout = 0.0; + } + + reason = "init scheduled check"; + } + else { + if (how & RSPAMD_MAP_SCHEDULE_ERROR) { + timeout = map->poll_timeout * error_mult; + reason = "errored scheduled check"; + } + else if (how & RSPAMD_MAP_SCHEDULE_LOCKED) { + timeout = map->poll_timeout * lock_mult; + reason = "locked scheduled check"; + } + else { + reason = "normal scheduled check"; + } + } + + jittered_sec = rspamd_time_jitter(timeout, 0); + } + + /* Now, we do some sanity checks for jittered seconds */ + if (!(how & RSPAMD_MAP_SCHEDULE_INIT)) { + /* Never allow too low interval between timer checks, it is expensive */ + if (jittered_sec < min_timer_interval) { + jittered_sec = rspamd_time_jitter(min_timer_interval, 0); + } + + if 
(map->non_trivial) { + /* + * Even if we are reported that we need to reload cache often, we + * still want to be sane in terms of events... + */ + if (jittered_sec < min_timer_interval * 2.0) { + if (map->nelts > 0) { + jittered_sec = min_timer_interval * 3.0; + } + } + } + } + + cbd = g_malloc0(sizeof(*cbd)); + cbd->cbdata.prev_data = *map->user_data; + cbd->cbdata.cur_data = NULL; + cbd->cbdata.map = map; + cbd->map = map; + map->scheduled_check = cbd; + REF_INIT_RETAIN(cbd, rspamd_map_periodic_dtor); + + cbd->ev.data = cbd; + ev_timer_init(&cbd->ev, rspamd_map_periodic_callback, jittered_sec, 0.0); + ev_timer_start(map->event_loop, &cbd->ev); + + msg_debug_map("schedule new periodic event %p in %.3f seconds for %s; reason: %s", + cbd, jittered_sec, map->name, reason); +} + +static gint +rspamd_map_af_to_weight(const rspamd_inet_addr_t *addr) +{ + int ret; + + switch (rspamd_inet_address_get_af(addr)) { + case AF_UNIX: + ret = 2; + break; + case AF_INET: + ret = 1; + break; + default: + ret = 0; + break; + } + + return ret; +} + +static gint +rspamd_map_dns_address_sort_func(gconstpointer a, gconstpointer b) +{ + const rspamd_inet_addr_t *ip1 = *(const rspamd_inet_addr_t **) a, + *ip2 = *(const rspamd_inet_addr_t **) b; + gint w1, w2; + + w1 = rspamd_map_af_to_weight(ip1); + w2 = rspamd_map_af_to_weight(ip2); + + /* Inverse order */ + return w2 - w1; +} + +static void +rspamd_map_dns_callback(struct rdns_reply *reply, void *arg) +{ + struct http_callback_data *cbd = arg; + struct rdns_reply_entry *cur_rep; + struct rspamd_map *map; + guint flags = RSPAMD_HTTP_CLIENT_SIMPLE | RSPAMD_HTTP_CLIENT_SHARED; + + map = cbd->map; + + msg_debug_map("got dns reply with code %s on stage %d", + rdns_strerror(reply->code), cbd->stage); + + if (cbd->stage == http_map_terminated) { + MAP_RELEASE(cbd, "http_callback_data"); + return; + } + + if (reply->code == RDNS_RC_NOERROR) { + DL_FOREACH(reply->entries, cur_rep) + { + rspamd_inet_addr_t *addr; + addr = 
rspamd_inet_address_from_rnds(cur_rep); + + if (addr != NULL) { + rspamd_inet_address_set_port(addr, cbd->data->port); + g_ptr_array_add(cbd->addrs, (void *) addr); + } + } + + if (cbd->stage == http_map_resolve_host2) { + /* We have still one request pending */ + cbd->stage = http_map_resolve_host1; + } + else if (cbd->stage == http_map_resolve_host1) { + cbd->stage = http_map_http_conn; + } + } + else if (cbd->stage < http_map_http_conn) { + if (cbd->stage == http_map_resolve_host2) { + /* We have still one request pending */ + cbd->stage = http_map_resolve_host1; + } + else if (cbd->addrs->len == 0) { + /* We could not resolve host, so cowardly fail here */ + msg_err_map("cannot resolve %s: %s", cbd->data->host, + rdns_strerror(reply->code)); + cbd->periodic->errored = 1; + rspamd_map_process_periodic(cbd->periodic); + } + else { + /* We have at least one address, so we can continue... */ + cbd->stage = http_map_http_conn; + } + } + + if (cbd->stage == http_map_http_conn && cbd->addrs->len > 0) { + rspamd_ptr_array_shuffle(cbd->addrs); + gint idx = 0; + /* + * For the existing addr we can just select any address as we have + * data available + */ + if (cbd->map->nelts > 0 && rspamd_random_double_fast() > 0.5) { + /* Already shuffled, use whatever is the first */ + cbd->addr = (rspamd_inet_addr_t *) g_ptr_array_index(cbd->addrs, idx); + } + else { + /* Always prefer IPv4 as IPv6 is almost all the time broken */ + g_ptr_array_sort(cbd->addrs, rspamd_map_dns_address_sort_func); + cbd->addr = (rspamd_inet_addr_t *) g_ptr_array_index(cbd->addrs, idx); + } + + retry: + msg_debug_map("try open http connection to %s", + rspamd_inet_address_to_string_pretty(cbd->addr)); + if (cbd->bk->protocol == MAP_PROTO_HTTPS) { + flags |= RSPAMD_HTTP_CLIENT_SSL; + } + cbd->conn = rspamd_http_connection_new_client(NULL, + NULL, + http_map_error, + http_map_finish, + flags, + cbd->addr); + + if (cbd->conn != NULL) { + write_http_request(cbd); + } + else { + if (idx < cbd->addrs->len - 
1) { + /* We can retry */ + idx++; + rspamd_inet_addr_t *prev_addr = cbd->addr; + cbd->addr = (rspamd_inet_addr_t *) g_ptr_array_index(cbd->addrs, idx); + msg_info_map("cannot connect to %s to get data for %s: %s, retry with %s (%d of %d)", + rspamd_inet_address_to_string_pretty(prev_addr), + cbd->bk->uri, + strerror(errno), + rspamd_inet_address_to_string_pretty(cbd->addr), + idx + 1, cbd->addrs->len); + goto retry; + } + else { + /* Nothing else left */ + cbd->periodic->errored = TRUE; + msg_err_map("error reading %s(%s): " + "connection with http server terminated incorrectly: %s", + cbd->bk->uri, + cbd->addr ? rspamd_inet_address_to_string_pretty(cbd->addr) : "", + strerror(errno)); + + rspamd_map_process_periodic(cbd->periodic); + } + } + } + + MAP_RELEASE(cbd, "http_callback_data"); +} + +static gboolean +rspamd_map_read_cached(struct rspamd_map *map, struct rspamd_map_backend *bk, + struct map_periodic_cbdata *periodic, const gchar *host) +{ + gsize mmap_len, len; + gpointer in; + struct http_map_data *data; + + data = bk->data.hd; + + in = rspamd_shmem_xmap(data->cache->shmem_name, PROT_READ, &mmap_len); + + if (in == NULL) { + msg_err("cannot map cache from %s: %s", data->cache->shmem_name, + strerror(errno)); + return FALSE; + } + + if (mmap_len < data->cache->len) { + msg_err("cannot map cache from %s: truncated length %z, %z expected", + data->cache->shmem_name, + mmap_len, data->cache->len); + munmap(in, mmap_len); + + return FALSE; + } + + /* + * Len is taken from the shmem file size that can be larger than the + * actual data length, as we use shared memory as a growing buffer for the + * HTTP input. + * Hence, we need to use len from the saved cache data, counting that it is + * at least not more than the cached file length (this is checked above). 
+ */ + len = data->cache->len; + + if (bk->is_compressed) { + ZSTD_DStream *zstream; + ZSTD_inBuffer zin; + ZSTD_outBuffer zout; + guchar *out; + gsize outlen, r; + + zstream = ZSTD_createDStream(); + ZSTD_initDStream(zstream); + + zin.pos = 0; + zin.src = in; + zin.size = len; + + if ((outlen = ZSTD_getDecompressedSize(zin.src, zin.size)) == 0) { + outlen = ZSTD_DStreamOutSize(); + } + + out = g_malloc(outlen); + + zout.dst = out; + zout.pos = 0; + zout.size = outlen; + + while (zin.pos < zin.size) { + r = ZSTD_decompressStream(zstream, &zout, &zin); + + if (ZSTD_isError(r)) { + msg_err_map("%s: cannot decompress data: %s", + bk->uri, + ZSTD_getErrorName(r)); + ZSTD_freeDStream(zstream); + g_free(out); + munmap(in, mmap_len); + return FALSE; + } + + if (zout.pos == zout.size) { + /* We need to extend output buffer */ + zout.size = zout.size * 2 + 1; + out = g_realloc(zout.dst, zout.size); + zout.dst = out; + } + } + + ZSTD_freeDStream(zstream); + msg_info_map("%s: read map data cached %z bytes compressed, " + "%z uncompressed", + bk->uri, + len, zout.pos); + map->read_callback(out, zout.pos, &periodic->cbdata, TRUE); + g_free(out); + } + else { + msg_info_map("%s: read map data cached %z bytes", bk->uri, len); + map->read_callback(in, len, &periodic->cbdata, TRUE); + } + + munmap(in, mmap_len); + + return TRUE; +} + +static gboolean +rspamd_map_has_http_cached_file(struct rspamd_map *map, + struct rspamd_map_backend *bk) +{ + gchar path[PATH_MAX]; + guchar digest[rspamd_cryptobox_HASHBYTES]; + struct rspamd_config *cfg = map->cfg; + struct stat st; + + if (cfg->maps_cache_dir == NULL || cfg->maps_cache_dir[0] == '\0') { + return FALSE; + } + + rspamd_cryptobox_hash(digest, bk->uri, strlen(bk->uri), NULL, 0); + rspamd_snprintf(path, sizeof(path), "%s%c%*xs.map", cfg->maps_cache_dir, + G_DIR_SEPARATOR, 20, digest); + + if (stat(path, &st) != -1 && st.st_size > + sizeof(struct rspamd_http_file_data)) { + return TRUE; + } + + return FALSE; +} + +static gboolean 
+rspamd_map_save_http_cached_file(struct rspamd_map *map, + struct rspamd_map_backend *bk, + struct http_map_data *htdata, + const guchar *data, + gsize len) +{ + gchar path[PATH_MAX]; + guchar digest[rspamd_cryptobox_HASHBYTES]; + struct rspamd_config *cfg = map->cfg; + gint fd; + struct rspamd_http_file_data header; + + if (cfg->maps_cache_dir == NULL || cfg->maps_cache_dir[0] == '\0') { + return FALSE; + } + + rspamd_cryptobox_hash(digest, bk->uri, strlen(bk->uri), NULL, 0); + rspamd_snprintf(path, sizeof(path), "%s%c%*xs.map", cfg->maps_cache_dir, + G_DIR_SEPARATOR, 20, digest); + + fd = rspamd_file_xopen(path, O_WRONLY | O_TRUNC | O_CREAT, + 00600, FALSE); + + if (fd == -1) { + return FALSE; + } + + if (!rspamd_file_lock(fd, FALSE)) { + msg_err_map("cannot lock file %s: %s", path, strerror(errno)); + close(fd); + + return FALSE; + } + + memcpy(header.magic, rspamd_http_file_magic, sizeof(rspamd_http_file_magic)); + header.mtime = htdata->last_modified; + header.next_check = map->next_check; + header.data_off = sizeof(header); + + if (htdata->etag) { + header.data_off += RSPAMD_FSTRING_LEN(htdata->etag); + header.etag_len = RSPAMD_FSTRING_LEN(htdata->etag); + } + else { + header.etag_len = 0; + } + + if (write(fd, &header, sizeof(header)) != sizeof(header)) { + msg_err_map("cannot write file %s (header stage): %s", path, strerror(errno)); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + + if (header.etag_len > 0) { + if (write(fd, RSPAMD_FSTRING_DATA(htdata->etag), header.etag_len) != + header.etag_len) { + msg_err_map("cannot write file %s (etag stage): %s", path, strerror(errno)); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + } + + /* Now write the rest */ + if (write(fd, data, len) != len) { + msg_err_map("cannot write file %s (data stage): %s", path, strerror(errno)); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + + rspamd_file_unlock(fd, FALSE); + close(fd); + + msg_info_map("saved data 
from %s in %s, %uz bytes", bk->uri, path, len + sizeof(header) + header.etag_len); + + return TRUE; +} + +static gboolean +rspamd_map_update_http_cached_file(struct rspamd_map *map, + struct rspamd_map_backend *bk, + struct http_map_data *htdata) +{ + gchar path[PATH_MAX]; + guchar digest[rspamd_cryptobox_HASHBYTES]; + struct rspamd_config *cfg = map->cfg; + gint fd; + struct rspamd_http_file_data header; + + if (!rspamd_map_has_http_cached_file(map, bk)) { + return FALSE; + } + + rspamd_cryptobox_hash(digest, bk->uri, strlen(bk->uri), NULL, 0); + rspamd_snprintf(path, sizeof(path), "%s%c%*xs.map", cfg->maps_cache_dir, + G_DIR_SEPARATOR, 20, digest); + + fd = rspamd_file_xopen(path, O_WRONLY, + 00600, FALSE); + + if (fd == -1) { + return FALSE; + } + + if (!rspamd_file_lock(fd, FALSE)) { + msg_err_map("cannot lock file %s: %s", path, strerror(errno)); + close(fd); + + return FALSE; + } + + memcpy(header.magic, rspamd_http_file_magic, sizeof(rspamd_http_file_magic)); + header.mtime = htdata->last_modified; + header.next_check = map->next_check; + header.data_off = sizeof(header); + + if (htdata->etag) { + header.data_off += RSPAMD_FSTRING_LEN(htdata->etag); + header.etag_len = RSPAMD_FSTRING_LEN(htdata->etag); + } + else { + header.etag_len = 0; + } + + if (write(fd, &header, sizeof(header)) != sizeof(header)) { + msg_err_map("cannot update file %s (header stage): %s", path, strerror(errno)); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + + if (header.etag_len > 0) { + if (write(fd, RSPAMD_FSTRING_DATA(htdata->etag), header.etag_len) != + header.etag_len) { + msg_err_map("cannot update file %s (etag stage): %s", path, strerror(errno)); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + } + + rspamd_file_unlock(fd, FALSE); + close(fd); + + return TRUE; +} + + +static gboolean +rspamd_map_read_http_cached_file(struct rspamd_map *map, + struct rspamd_map_backend *bk, + struct http_map_data *htdata, + struct map_cb_data *cbdata) 
+{ + gchar path[PATH_MAX]; + guchar digest[rspamd_cryptobox_HASHBYTES]; + struct rspamd_config *cfg = map->cfg; + gint fd; + struct stat st; + struct rspamd_http_file_data header; + + if (cfg->maps_cache_dir == NULL || cfg->maps_cache_dir[0] == '\0') { + return FALSE; + } + + rspamd_cryptobox_hash(digest, bk->uri, strlen(bk->uri), NULL, 0); + rspamd_snprintf(path, sizeof(path), "%s%c%*xs.map", cfg->maps_cache_dir, + G_DIR_SEPARATOR, 20, digest); + + fd = rspamd_file_xopen(path, O_RDONLY, 00600, FALSE); + + if (fd == -1) { + return FALSE; + } + + if (!rspamd_file_lock(fd, FALSE)) { + msg_err_map("cannot lock file %s: %s", path, strerror(errno)); + close(fd); + + return FALSE; + } + + (void) fstat(fd, &st); + + if (read(fd, &header, sizeof(header)) != sizeof(header)) { + msg_err_map("cannot read file %s (header stage): %s", path, strerror(errno)); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + + if (memcmp(header.magic, rspamd_http_file_magic, + sizeof(rspamd_http_file_magic)) != 0) { + msg_warn_map("invalid or old version magic in file %s; ignore it", path); + rspamd_file_unlock(fd, FALSE); + close(fd); + + return FALSE; + } + + double now = rspamd_get_calendar_ticks(); + + if (header.next_check > now) { + map->next_check = header.next_check; + } + else { + map->next_check = now; + } + + htdata->last_modified = header.mtime; + + if (header.etag_len > 0) { + rspamd_fstring_t *etag = rspamd_fstring_sized_new(header.etag_len); + + if (read(fd, RSPAMD_FSTRING_DATA(etag), header.etag_len) != header.etag_len) { + msg_err_map("cannot read file %s (etag stage): %s", path, + strerror(errno)); + rspamd_file_unlock(fd, FALSE); + rspamd_fstring_free(etag); + close(fd); + + return FALSE; + } + + etag->len = header.etag_len; + + if (htdata->etag) { + /* FIXME: should be dealt somehow better */ + msg_warn_map("etag is already defined as %V; cached is %V; ignore cached", + htdata->etag, etag); + rspamd_fstring_free(etag); + } + else { + htdata->etag = etag; + 
} + } + + rspamd_file_unlock(fd, FALSE); + close(fd); + + /* Now read file data */ + /* Perform buffered read: fail-safe */ + if (!read_map_file_chunks(map, cbdata, path, + st.st_size - header.data_off, header.data_off)) { + return FALSE; + } + + struct tm tm; + gchar ncheck_buf[32], lm_buf[32]; + + rspamd_localtime(map->next_check, &tm); + strftime(ncheck_buf, sizeof(ncheck_buf) - 1, "%Y-%m-%d %H:%M:%S", &tm); + rspamd_localtime(htdata->last_modified, &tm); + strftime(lm_buf, sizeof(lm_buf) - 1, "%Y-%m-%d %H:%M:%S", &tm); + + msg_info_map("read cached data for %s from %s, %uz bytes; next check at: %s;" + " last modified on: %s; etag: %V", + bk->uri, + path, + (size_t) (st.st_size - header.data_off), + ncheck_buf, + lm_buf, + htdata->etag); + + return TRUE; +} + +/** + * Async HTTP callback + */ +static void +rspamd_map_common_http_callback(struct rspamd_map *map, + struct rspamd_map_backend *bk, + struct map_periodic_cbdata *periodic, + gboolean check) +{ + struct http_map_data *data; + struct http_callback_data *cbd; + guint flags = RSPAMD_HTTP_CLIENT_SIMPLE | RSPAMD_HTTP_CLIENT_SHARED; + + data = bk->data.hd; + + if (g_atomic_int_get(&data->cache->available) == 1) { + /* Read cached data */ + if (check) { + if (data->last_modified < data->cache->last_modified) { + msg_info_map("need to reread cached map triggered by %s " + "(%d our modify time, %d cached modify time)", + bk->uri, + (int) data->last_modified, + (int) data->cache->last_modified); + periodic->need_modify = TRUE; + /* Reset the whole chain */ + periodic->cur_backend = 0; + rspamd_map_process_periodic(periodic); + } + else { + if (map->active_http) { + /* Check even if there is a cached version */ + goto check; + } + else { + /* Switch to the next backend */ + periodic->cur_backend++; + rspamd_map_process_periodic(periodic); + } + } + + return; + } + else { + if (map->active_http && + data->last_modified > data->cache->last_modified) { + goto check; + } + else if (rspamd_map_read_cached(map, bk, 
periodic, data->host)) { + /* Switch to the next backend */ + periodic->cur_backend++; + data->last_modified = data->cache->last_modified; + rspamd_map_process_periodic(periodic); + + return; + } + } + } + else if (!map->active_http) { + /* Switch to the next backend */ + periodic->cur_backend++; + rspamd_map_process_periodic(periodic); + + return; + } + +check: + cbd = g_malloc0(sizeof(struct http_callback_data)); + + cbd->event_loop = map->event_loop; + cbd->addrs = g_ptr_array_sized_new(4); + cbd->map = map; + cbd->data = data; + cbd->check = check; + cbd->periodic = periodic; + MAP_RETAIN(periodic, "periodic"); + cbd->bk = bk; + MAP_RETAIN(bk, "rspamd_map_backend"); + cbd->stage = http_map_terminated; + REF_INIT_RETAIN(cbd, free_http_cbdata); + + msg_debug_map("%s map data from %s", check ? "checking" : "reading", + data->host); + + /* Try address */ + rspamd_inet_addr_t *addr = NULL; + + if (rspamd_parse_inet_address(&addr, data->host, + strlen(data->host), RSPAMD_INET_ADDRESS_PARSE_DEFAULT)) { + rspamd_inet_address_set_port(addr, cbd->data->port); + g_ptr_array_add(cbd->addrs, (void *) addr); + + if (bk->protocol == MAP_PROTO_HTTPS) { + flags |= RSPAMD_HTTP_CLIENT_SSL; + } + + cbd->conn = rspamd_http_connection_new_client( + NULL, + NULL, + http_map_error, + http_map_finish, + flags, + addr); + + if (cbd->conn != NULL) { + cbd->stage = http_map_http_conn; + write_http_request(cbd); + cbd->addr = addr; + MAP_RELEASE(cbd, "http_callback_data"); + } + else { + msg_warn_map("cannot load map: cannot connect to %s: %s", + data->host, strerror(errno)); + MAP_RELEASE(cbd, "http_callback_data"); + } + + return; + } + else if (map->r->r) { + /* Send both A and AAAA requests */ + guint nreq = 0; + + if (rdns_make_request_full(map->r->r, rspamd_map_dns_callback, cbd, + map->cfg->dns_timeout, map->cfg->dns_retransmits, 1, + data->host, RDNS_REQUEST_A)) { + MAP_RETAIN(cbd, "http_callback_data"); + nreq++; + } + if (rdns_make_request_full(map->r->r, rspamd_map_dns_callback, 
cbd, + map->cfg->dns_timeout, map->cfg->dns_retransmits, 1, + data->host, RDNS_REQUEST_AAAA)) { + MAP_RETAIN(cbd, "http_callback_data"); + nreq++; + } + + if (nreq == 2) { + cbd->stage = http_map_resolve_host2; + } + else if (nreq == 1) { + cbd->stage = http_map_resolve_host1; + } + + map->tmp_dtor = free_http_cbdata_dtor; + map->tmp_dtor_data = cbd; + } + else { + msg_warn_map("cannot load map: DNS resolver is not initialized"); + cbd->periodic->errored = TRUE; + } + + MAP_RELEASE(cbd, "http_callback_data"); +} + +static void +rspamd_map_http_check_callback(struct map_periodic_cbdata *cbd) +{ + struct rspamd_map *map; + struct rspamd_map_backend *bk; + + map = cbd->map; + bk = g_ptr_array_index(cbd->map->backends, cbd->cur_backend); + + rspamd_map_common_http_callback(map, bk, cbd, TRUE); +} + +static void +rspamd_map_http_read_callback(struct map_periodic_cbdata *cbd) +{ + struct rspamd_map *map; + struct rspamd_map_backend *bk; + + map = cbd->map; + bk = g_ptr_array_index(cbd->map->backends, cbd->cur_backend); + rspamd_map_common_http_callback(map, bk, cbd, FALSE); +} + +static void +rspamd_map_file_check_callback(struct map_periodic_cbdata *periodic) +{ + struct rspamd_map *map; + struct file_map_data *data; + struct rspamd_map_backend *bk; + + map = periodic->map; + bk = g_ptr_array_index(map->backends, periodic->cur_backend); + data = bk->data.fd; + + if (data->need_modify) { + periodic->need_modify = TRUE; + periodic->cur_backend = 0; + data->need_modify = FALSE; + + rspamd_map_process_periodic(periodic); + + return; + } + + map = periodic->map; + /* Switch to the next backend as the rest is handled by ev_stat */ + periodic->cur_backend++; + rspamd_map_process_periodic(periodic); +} + +static void +rspamd_map_static_check_callback(struct map_periodic_cbdata *periodic) +{ + struct rspamd_map *map; + struct static_map_data *data; + struct rspamd_map_backend *bk; + + map = periodic->map; + bk = g_ptr_array_index(map->backends, periodic->cur_backend); + data = 
bk->data.sd; + + if (!data->processed) { + periodic->need_modify = TRUE; + periodic->cur_backend = 0; + + rspamd_map_process_periodic(periodic); + + return; + } + + /* Switch to the next backend */ + periodic->cur_backend++; + rspamd_map_process_periodic(periodic); +} + +static void +rspamd_map_file_read_callback(struct map_periodic_cbdata *periodic) +{ + struct rspamd_map *map; + struct file_map_data *data; + struct rspamd_map_backend *bk; + + map = periodic->map; + + bk = g_ptr_array_index(map->backends, periodic->cur_backend); + data = bk->data.fd; + + msg_info_map("rereading map file %s", data->filename); + + if (!read_map_file(map, data, bk, periodic)) { + periodic->errored = TRUE; + } + + /* Switch to the next backend */ + periodic->cur_backend++; + rspamd_map_process_periodic(periodic); +} + +static void +rspamd_map_static_read_callback(struct map_periodic_cbdata *periodic) +{ + struct rspamd_map *map; + struct static_map_data *data; + struct rspamd_map_backend *bk; + + map = periodic->map; + + bk = g_ptr_array_index(map->backends, periodic->cur_backend); + data = bk->data.sd; + + msg_info_map("rereading static map"); + + if (!read_map_static(map, data, bk, periodic)) { + periodic->errored = TRUE; + } + + /* Switch to the next backend */ + periodic->cur_backend++; + rspamd_map_process_periodic(periodic); +} + +static void +rspamd_map_process_periodic(struct map_periodic_cbdata *cbd) +{ + struct rspamd_map_backend *bk; + struct rspamd_map *map; + + map = cbd->map; + map->scheduled_check = NULL; + + if (!map->file_only && !cbd->locked) { + if (!g_atomic_int_compare_and_exchange(cbd->map->locked, + 0, 1)) { + msg_debug_map( + "don't try to reread map %s as it is locked by other process, " + "will reread it later", + cbd->map->name); + rspamd_map_schedule_periodic(map, RSPAMD_MAP_SCHEDULE_LOCKED); + MAP_RELEASE(cbd, "periodic"); + + return; + } + else { + msg_debug_map("locked map %s", cbd->map->name); + cbd->locked = TRUE; + } + } + + if (cbd->errored) { + /* 
We should not check other backends if some backend has failed*/ + rspamd_map_schedule_periodic(cbd->map, RSPAMD_MAP_SCHEDULE_ERROR); + + if (cbd->locked) { + g_atomic_int_set(cbd->map->locked, 0); + cbd->locked = FALSE; + } + + /* Also set error flag for the map consumer */ + cbd->cbdata.errored = true; + + msg_debug_map("unlocked map %s, refcount=%d", cbd->map->name, + cbd->ref.refcount); + MAP_RELEASE(cbd, "periodic"); + + return; + } + + /* For each backend we need to check for modifications */ + if (cbd->cur_backend >= cbd->map->backends->len) { + /* Last backend */ + msg_debug_map("finished map: %d of %d", cbd->cur_backend, + cbd->map->backends->len); + MAP_RELEASE(cbd, "periodic"); + + return; + } + + if (cbd->map->wrk && cbd->map->wrk->state == rspamd_worker_state_running) { + bk = g_ptr_array_index(cbd->map->backends, cbd->cur_backend); + g_assert(bk != NULL); + + if (cbd->need_modify) { + /* Load data from the next backend */ + switch (bk->protocol) { + case MAP_PROTO_HTTP: + case MAP_PROTO_HTTPS: + rspamd_map_http_read_callback(cbd); + break; + case MAP_PROTO_FILE: + rspamd_map_file_read_callback(cbd); + break; + case MAP_PROTO_STATIC: + rspamd_map_static_read_callback(cbd); + break; + } + } + else { + /* Check the next backend */ + switch (bk->protocol) { + case MAP_PROTO_HTTP: + case MAP_PROTO_HTTPS: + rspamd_map_http_check_callback(cbd); + break; + case MAP_PROTO_FILE: + rspamd_map_file_check_callback(cbd); + break; + case MAP_PROTO_STATIC: + rspamd_map_static_check_callback(cbd); + break; + } + } + } +} + +static void +rspamd_map_on_stat(struct ev_loop *loop, ev_stat *w, int revents) +{ + struct rspamd_map *map = (struct rspamd_map *) w->data; + + if (w->attr.st_nlink > 0) { + msg_info_map("old mtime is %t (size = %Hz), " + "new mtime is %t (size = %Hz) for map file %s", + w->prev.st_mtime, (gsize) w->prev.st_size, + w->attr.st_mtime, (gsize) w->attr.st_size, + w->path); + + /* Fire need modify flag */ + struct rspamd_map_backend *bk; + guint i; + + 
PTR_ARRAY_FOREACH(map->backends, i, bk) + { + if (bk->protocol == MAP_PROTO_FILE) { + bk->data.fd->need_modify = TRUE; + } + } + + map->next_check = 0; + + if (map->scheduled_check) { + ev_timer_stop(map->event_loop, &map->scheduled_check->ev); + MAP_RELEASE(map->scheduled_check, "rspamd_map_on_stat"); + map->scheduled_check = NULL; + } + + rspamd_map_schedule_periodic(map, RSPAMD_MAP_SCHEDULE_INIT); + } +} + +/* Start watching event for all maps */ +void rspamd_map_watch(struct rspamd_config *cfg, + struct ev_loop *event_loop, + struct rspamd_dns_resolver *resolver, + struct rspamd_worker *worker, + enum rspamd_map_watch_type how) +{ + GList *cur = cfg->maps; + struct rspamd_map *map; + struct rspamd_map_backend *bk; + guint i; + + g_assert(how > RSPAMD_MAP_WATCH_MIN && how < RSPAMD_MAP_WATCH_MAX); + + /* First of all do synced read of data */ + while (cur) { + map = cur->data; + map->event_loop = event_loop; + map->r = resolver; + + if (map->wrk == NULL && how != RSPAMD_MAP_WATCH_WORKER) { + /* Generic scanner map */ + map->wrk = worker; + + if (how == RSPAMD_MAP_WATCH_PRIMARY_CONTROLLER) { + map->active_http = TRUE; + } + else { + map->active_http = FALSE; + } + } + else if (map->wrk != NULL && map->wrk == worker) { + /* Map is bound to a specific worker */ + map->active_http = TRUE; + } + else { + /* Skip map for this worker as irrelevant */ + cur = g_list_next(cur); + continue; + } + + if (!map->active_http) { + /* Check cached version more frequently as it is cheap */ + + if (map->poll_timeout >= cfg->map_timeout && + cfg->map_file_watch_multiplier < 1.0) { + map->poll_timeout = + map->poll_timeout * cfg->map_file_watch_multiplier; + } + } + + map->file_only = TRUE; + map->static_only = TRUE; + + PTR_ARRAY_FOREACH(map->backends, i, bk) + { + bk->event_loop = event_loop; + + if (bk->protocol == MAP_PROTO_FILE) { + struct file_map_data *data; + + data = bk->data.fd; + + if (map->user_data == NULL || *map->user_data == NULL) { + /* Map has not been read, init 
it's reading if possible */ + struct stat st; + + if (stat(data->filename, &st) != -1) { + data->need_modify = TRUE; + } + } + + ev_stat_init(&data->st_ev, rspamd_map_on_stat, + data->filename, map->poll_timeout * cfg->map_file_watch_multiplier); + data->st_ev.data = map; + ev_stat_start(event_loop, &data->st_ev); + map->static_only = FALSE; + } + else if ((bk->protocol == MAP_PROTO_HTTP || + bk->protocol == MAP_PROTO_HTTPS)) { + if (map->active_http) { + map->non_trivial = TRUE; + } + + map->static_only = FALSE; + map->file_only = FALSE; + } + } + + rspamd_map_schedule_periodic(map, RSPAMD_MAP_SCHEDULE_INIT); + + cur = g_list_next(cur); + } +} + +void rspamd_map_preload(struct rspamd_config *cfg) +{ + GList *cur = cfg->maps; + struct rspamd_map *map; + struct rspamd_map_backend *bk; + guint i; + gboolean map_ok; + + /* First of all do synced read of data */ + while (cur) { + map = cur->data; + map_ok = TRUE; + + PTR_ARRAY_FOREACH(map->backends, i, bk) + { + if (!(bk->protocol == MAP_PROTO_FILE || + bk->protocol == MAP_PROTO_STATIC)) { + + if (bk->protocol == MAP_PROTO_HTTP || + bk->protocol == MAP_PROTO_HTTPS) { + if (!rspamd_map_has_http_cached_file(map, bk)) { + + if (!map->fallback_backend) { + map_ok = FALSE; + } + break; + } + else { + continue; /* We are yet fine */ + } + } + map_ok = FALSE; + break; + } + } + + if (map_ok) { + struct map_periodic_cbdata fake_cbd; + gboolean succeed = TRUE; + + memset(&fake_cbd, 0, sizeof(fake_cbd)); + fake_cbd.cbdata.state = 0; + fake_cbd.cbdata.prev_data = *map->user_data; + fake_cbd.cbdata.cur_data = NULL; + fake_cbd.cbdata.map = map; + fake_cbd.map = map; + + PTR_ARRAY_FOREACH(map->backends, i, bk) + { + fake_cbd.cur_backend = i; + + if (bk->protocol == MAP_PROTO_FILE) { + if (!read_map_file(map, bk->data.fd, bk, &fake_cbd)) { + succeed = FALSE; + break; + } + } + else if (bk->protocol == MAP_PROTO_STATIC) { + if (!read_map_static(map, bk->data.sd, bk, &fake_cbd)) { + succeed = FALSE; + break; + } + } + else if 
(bk->protocol == MAP_PROTO_HTTP || + bk->protocol == MAP_PROTO_HTTPS) { + if (!rspamd_map_read_http_cached_file(map, bk, bk->data.hd, + &fake_cbd.cbdata)) { + + if (map->fallback_backend) { + /* Try fallback */ + g_assert(map->fallback_backend->protocol == + MAP_PROTO_FILE); + if (!read_map_file(map, + map->fallback_backend->data.fd, + map->fallback_backend, &fake_cbd)) { + succeed = FALSE; + break; + } + } + else { + succeed = FALSE; + break; + } + } + } + else { + g_assert_not_reached(); + } + } + + if (succeed) { + map->fin_callback(&fake_cbd.cbdata, map->user_data); + + if (map->on_load_function) { + map->on_load_function(map, map->on_load_ud); + } + } + else { + msg_info_map("preload of %s failed", map->name); + } + } + + cur = g_list_next(cur); + } +} + +void rspamd_map_remove_all(struct rspamd_config *cfg) +{ + struct rspamd_map *map; + GList *cur; + struct rspamd_map_backend *bk; + struct map_cb_data cbdata; + guint i; + + for (cur = cfg->maps; cur != NULL; cur = g_list_next(cur)) { + map = cur->data; + + if (map->tmp_dtor) { + map->tmp_dtor(map->tmp_dtor_data); + } + + if (map->dtor) { + cbdata.prev_data = NULL; + cbdata.map = map; + cbdata.cur_data = *map->user_data; + + map->dtor(&cbdata); + *map->user_data = NULL; + } + + if (map->on_load_ud_dtor) { + map->on_load_ud_dtor(map->on_load_ud); + } + + for (i = 0; i < map->backends->len; i++) { + bk = g_ptr_array_index(map->backends, i); + + MAP_RELEASE(bk, "rspamd_map_backend"); + } + + if (map->fallback_backend) { + MAP_RELEASE(map->fallback_backend, "rspamd_map_backend"); + } + } + + g_list_free(cfg->maps); + cfg->maps = NULL; +} + +static const gchar * +rspamd_map_check_proto(struct rspamd_config *cfg, + const gchar *map_line, struct rspamd_map_backend *bk) +{ + const gchar *pos = map_line, *end, *end_key; + + g_assert(bk != NULL); + g_assert(pos != NULL); + + end = pos + strlen(pos); + + /* Static check */ + if (g_ascii_strcasecmp(pos, "static") == 0) { + bk->protocol = MAP_PROTO_STATIC; + bk->uri = 
g_strdup(pos); + + return pos; + } + else if (g_ascii_strcasecmp(pos, "zst+static") == 0) { + bk->protocol = MAP_PROTO_STATIC; + bk->uri = g_strdup(pos + 4); + bk->is_compressed = TRUE; + + return pos + 4; + } + + for (;;) { + if (g_ascii_strncasecmp(pos, "sign+", sizeof("sign+") - 1) == 0) { + bk->is_signed = TRUE; + pos += sizeof("sign+") - 1; + } + else if (g_ascii_strncasecmp(pos, "fallback+", sizeof("fallback+") - 1) == 0) { + bk->is_fallback = TRUE; + pos += sizeof("fallback+") - 1; + } + else if (g_ascii_strncasecmp(pos, "key=", sizeof("key=") - 1) == 0) { + pos += sizeof("key=") - 1; + end_key = memchr(pos, '+', end - pos); + + if (end_key != NULL) { + bk->trusted_pubkey = rspamd_pubkey_from_base32(pos, end_key - pos, + RSPAMD_KEYPAIR_SIGN, RSPAMD_CRYPTOBOX_MODE_25519); + + if (bk->trusted_pubkey == NULL) { + msg_err_config("cannot read pubkey from map: %s", + map_line); + return NULL; + } + pos = end_key + 1; + } + else if (end - pos > 64) { + /* Try hex encoding */ + bk->trusted_pubkey = rspamd_pubkey_from_hex(pos, 64, + RSPAMD_KEYPAIR_SIGN, RSPAMD_CRYPTOBOX_MODE_25519); + + if (bk->trusted_pubkey == NULL) { + msg_err_config("cannot read pubkey from map: %s", + map_line); + return NULL; + } + pos += 64; + } + else { + msg_err_config("cannot read pubkey from map: %s", + map_line); + return NULL; + } + + if (*pos == '+' || *pos == ':') { + pos++; + } + } + else { + /* No known flags */ + break; + } + } + + bk->protocol = MAP_PROTO_FILE; + + if (g_ascii_strncasecmp(pos, "http://", sizeof("http://") - 1) == 0) { + bk->protocol = MAP_PROTO_HTTP; + /* Include http:// */ + bk->uri = g_strdup(pos); + pos += sizeof("http://") - 1; + } + else if (g_ascii_strncasecmp(pos, "https://", sizeof("https://") - 1) == 0) { + bk->protocol = MAP_PROTO_HTTPS; + /* Include https:// */ + bk->uri = g_strdup(pos); + pos += sizeof("https://") - 1; + } + else if (g_ascii_strncasecmp(pos, "file://", sizeof("file://") - 1) == 0) { + pos += sizeof("file://") - 1; + /* Exclude file:// 
*/ + bk->uri = g_strdup(pos); + } + else if (*pos == '/') { + /* Trivial file case */ + bk->uri = g_strdup(pos); + } + else { + msg_err_config("invalid map fetching protocol: %s", map_line); + + return NULL; + } + + if (bk->protocol != MAP_PROTO_FILE && bk->is_signed) { + msg_err_config("signed maps are no longer supported for HTTP(s): %s", map_line); + } + + return pos; +} + +gboolean +rspamd_map_is_map(const gchar *map_line) +{ + gboolean ret = FALSE; + + g_assert(map_line != NULL); + + if (map_line[0] == '/') { + ret = TRUE; + } + else if (g_ascii_strncasecmp(map_line, "sign+", sizeof("sign+") - 1) == 0) { + ret = TRUE; + } + else if (g_ascii_strncasecmp(map_line, "fallback+", sizeof("fallback+") - 1) == 0) { + ret = TRUE; + } + else if (g_ascii_strncasecmp(map_line, "file://", sizeof("file://") - 1) == 0) { + ret = TRUE; + } + else if (g_ascii_strncasecmp(map_line, "http://", sizeof("http://") - 1) == 0) { + ret = TRUE; + } + else if (g_ascii_strncasecmp(map_line, "https://", sizeof("https://") - 1) == 0) { + ret = TRUE; + } + + return ret; +} + +static void +rspamd_map_backend_dtor(struct rspamd_map_backend *bk) +{ + switch (bk->protocol) { + case MAP_PROTO_FILE: + if (bk->data.fd) { + ev_stat_stop(bk->event_loop, &bk->data.fd->st_ev); + g_free(bk->data.fd->filename); + g_free(bk->data.fd); + } + break; + case MAP_PROTO_STATIC: + if (bk->data.sd) { + if (bk->data.sd->data) { + g_free(bk->data.sd->data); + } + + g_free(bk->data.sd); + } + break; + case MAP_PROTO_HTTP: + case MAP_PROTO_HTTPS: + if (bk->data.hd) { + struct http_map_data *data = bk->data.hd; + + g_free(data->host); + g_free(data->path); + g_free(data->rest); + + if (data->userinfo) { + g_free(data->userinfo); + } + + if (data->etag) { + rspamd_fstring_free(data->etag); + } + + /* + * Clear cached file, but check if a worker is an active http worker + * as cur_cache_cbd is meaningful merely for active worker, who actually + * owns the cache + */ + if (bk->map && bk->map->active_http) { + if 
/**
 * Parse a single map definition line into a newly allocated backend.
 *
 * The protocol is detected by rspamd_map_check_proto() (which presumably also
 * fills `bk->uri` and `bk->is_fallback` - confirm there); afterwards the
 * protocol specific structure (file, HTTP(S) or static) is allocated and
 * attached to `bk->data`.
 *
 * @param cfg configuration object; its `options.http_auth` section is consulted
 *            for HTTP credentials and its pool holds the shared HTTP cache state
 * @param map_line map definition line as written in the configuration
 * @return new backend initialised with REF_INIT_RETAIN, or NULL on error
 */
static struct rspamd_map_backend *
rspamd_map_parse_backend(struct rspamd_config *cfg, const gchar *map_line)
{
    struct rspamd_map_backend *bk;
    struct file_map_data *fdata = NULL;
    struct http_map_data *hdata = NULL;
    struct static_map_data *sdata = NULL;
    struct http_parser_url up;
    const gchar *end, *p;
    rspamd_ftok_t tok;

    bk = g_malloc0(sizeof(*bk));
    REF_INIT_RETAIN(bk, rspamd_map_backend_dtor);

    if (!rspamd_map_check_proto(cfg, map_line, bk)) {
        goto err;
    }

    /* A fallback backend is only accepted for the file protocol */
    if (bk->is_fallback && bk->protocol != MAP_PROTO_FILE) {
        msg_err_config("fallback backend must be file for %s", bk->uri);

        goto err;
    }

    /* A trailing `.zstd` or `.zst` (case-insensitive) marks the map as compressed */
    end = map_line + strlen(map_line);
    if (end - map_line > 5) {
        p = end - 5;
        if (g_ascii_strcasecmp(p, ".zstd") == 0) {
            bk->is_compressed = TRUE;
        }
        p = end - 4;
        if (g_ascii_strcasecmp(p, ".zst") == 0) {
            bk->is_compressed = TRUE;
        }
    }

    /* Now check for each proto separately */
    if (bk->protocol == MAP_PROTO_FILE) {
        fdata = g_malloc0(sizeof(struct file_map_data));

        if (access(bk->uri, R_OK) == -1) {
            /* Only a missing file is tolerated; any other access error is fatal */
            if (errno != ENOENT) {
                msg_err_config("cannot open file '%s': %s", bk->uri, strerror(errno));
                goto err;
            }

            msg_info_config(
                "map '%s' is not found, but it can be loaded automatically later",
                bk->uri);
        }

        fdata->filename = g_strdup(bk->uri);
        bk->data.fd = fdata;
    }
    else if (bk->protocol == MAP_PROTO_HTTP || bk->protocol == MAP_PROTO_HTTPS) {
        hdata = g_malloc0(sizeof(struct http_map_data));

        memset(&up, 0, sizeof(up));
        if (http_parser_parse_url(bk->uri, strlen(bk->uri), FALSE,
                                  &up) != 0) {
            msg_err_config("cannot parse HTTP url: %s", bk->uri);
            goto err;
        }
        else {
            /* A host component is mandatory */
            if (!(up.field_set & 1u << UF_HOST)) {
                msg_err_config("cannot parse HTTP url: %s: no host", bk->uri);
                goto err;
            }

            tok.begin = bk->uri + up.field_data[UF_HOST].off;
            tok.len = up.field_data[UF_HOST].len;
            hdata->host = rspamd_ftokdup(&tok);

            /* Explicit port wins; otherwise use the protocol default (80 or 443) */
            if (up.field_set & (1u << UF_PORT)) {
                hdata->port = up.port;
            }
            else {
                if (bk->protocol == MAP_PROTO_HTTP) {
                    hdata->port = 80;
                }
                else {
                    hdata->port = 443;
                }
            }

            /* Note: `path` and `rest` are left NULL (from g_malloc0) when the URL has no path */
            if (up.field_set & (1u << UF_PATH)) {
                tok.begin = bk->uri + up.field_data[UF_PATH].off;
                tok.len = up.field_data[UF_PATH].len;

                hdata->path = rspamd_ftokdup(&tok);

                /* We also need to check query + fragment */
                if (up.field_set & ((1u << UF_QUERY) | (1u << UF_FRAGMENT))) {
                    /* Everything that follows the path is kept verbatim in `rest` */
                    tok.begin = bk->uri + up.field_data[UF_PATH].off +
                                up.field_data[UF_PATH].len;
                    tok.len = strlen(tok.begin);
                    hdata->rest = rspamd_ftokdup(&tok);
                }
                else {
                    hdata->rest = g_strdup("");
                }
            }

            if (up.field_set & (1u << UF_USERINFO)) {
                /* Create authorisation header for basic auth */
                /* Buffer: "Basic " prefix (NUL included) + 8/5 of the userinfo length + 4 spare bytes */
                guint len = sizeof("Basic ") +
                            up.field_data[UF_USERINFO].len * 8 / 5 + 4;
                hdata->userinfo = g_malloc(len);
                /* `%*Bs` presumably base64-encodes the given number of bytes - confirm in rspamd_snprintf */
                rspamd_snprintf(hdata->userinfo, len, "Basic %*Bs",
                                (int) up.field_data[UF_USERINFO].len,
                                bk->uri + up.field_data[UF_USERINFO].off);

                msg_debug("added userinfo for the map from the URL: %s", hdata->host);
            }
            else {
                /* Try to obtain authentication data from options in the configuration */
                const ucl_object_t *auth_obj, *opts_obj;

                opts_obj = ucl_object_lookup(cfg->cfg_ucl_obj, "options");
                if (opts_obj != NULL) {
                    auth_obj = ucl_object_lookup(opts_obj, "http_auth");
                    if (auth_obj != NULL && ucl_object_type(auth_obj) == UCL_OBJECT) {
                        const ucl_object_t *host_obj;

                        /*
                         * Search first by the full URL and then by the host part
                         */
                        host_obj = ucl_object_lookup(auth_obj, map_line);

                        if (host_obj == NULL) {
                            host_obj = ucl_object_lookup(auth_obj, hdata->host);
                        }

                        if (host_obj != NULL && ucl_object_type(host_obj) == UCL_OBJECT) {
                            const ucl_object_t *user_obj, *password_obj;

                            user_obj = ucl_object_lookup(host_obj, "user");
                            password_obj = ucl_object_lookup(host_obj, "password");

                            /* Both `user` and `password` must be present and be strings */
                            if (user_obj != NULL && password_obj != NULL &&
                                ucl_object_type(user_obj) == UCL_STRING &&
                                ucl_object_type(password_obj) == UCL_STRING) {

                                gchar *tmpbuf;
                                unsigned tlen;

                                /* User + password + ':' */
                                tlen = strlen(ucl_object_tostring(user_obj)) +
                                       strlen(ucl_object_tostring(password_obj)) + 1;
                                tmpbuf = g_malloc(tlen + 1);
                                rspamd_snprintf(tmpbuf, tlen + 1, "%s:%s",
                                                ucl_object_tostring(user_obj),
                                                ucl_object_tostring(password_obj));
                                /* Base64 encoding is not so greedy, but we add some space for simplicity */
                                tlen *= 2;
                                tlen += sizeof("Basic ") - 1;
                                hdata->userinfo = g_malloc(tlen + 1);
                                rspamd_snprintf(hdata->userinfo, tlen + 1, "Basic %Bs", tmpbuf);
                                g_free(tmpbuf);
                                msg_debug("added userinfo for the map from the configuration: %s", map_line);
                            }
                        }
                    }
                }
            }
        }

        /* Cache state is allocated from the configuration pool with the `_shared` allocator */
        hdata->cache = rspamd_mempool_alloc0_shared(cfg->cfg_pool,
                                                    sizeof(*hdata->cache));

        bk->data.hd = hdata;
    }
    else if (bk->protocol == MAP_PROTO_STATIC) {
        /* Static data itself is filled in later by the caller */
        sdata = g_malloc0(sizeof(*sdata));
        bk->data.sd = sdata;
    }

    /* Backend id is a hash of its URI with a fixed seed */
    bk->id = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_T1HA,
                                                 bk->uri, strlen(bk->uri),
                                                 0xdeadbabe);

    return bk;

err:
    MAP_RELEASE(bk, "rspamd_map_backend");

    /*
     * Every `goto err` above happens before the protocol structure is attached
     * to `bk->data`, so it has to be freed here separately.
     */
    if (hdata) {
        g_free(hdata);
    }

    if (fdata) {
        g_free(fdata);
    }

    if (sdata) {
        g_free(sdata);
    }

    return NULL;
}
*cksum_encoded, cksum[rspamd_cryptobox_HASHBYTES]; + + rspamd_cryptobox_hash_init(&st, NULL, 0); + + for (i = 0; i < map->backends->len; i++) { + bk = g_ptr_array_index(map->backends, i); + rspamd_cryptobox_hash_update(&st, bk->uri, strlen(bk->uri)); + } + + rspamd_cryptobox_hash_final(&st, cksum); + cksum_encoded = rspamd_encode_base32(cksum, sizeof(cksum), RSPAMD_BASE32_DEFAULT); + rspamd_strlcpy(map->tag, cksum_encoded, sizeof(map->tag)); + g_free(cksum_encoded); +} + +static gboolean +rspamd_map_add_static_string(struct rspamd_config *cfg, + const ucl_object_t *elt, + GString *target) +{ + gsize sz; + const gchar *dline; + + if (ucl_object_type(elt) != UCL_STRING) { + msg_err_config("map has static backend but `data` is " + "not string like: %s", + ucl_object_type_to_string(elt->type)); + return FALSE; + } + + /* Otherwise, we copy data to the backend */ + dline = ucl_object_tolstring(elt, &sz); + + if (sz == 0) { + msg_err_config("map has static backend but empty no data"); + return FALSE; + } + + g_string_append_len(target, dline, sz); + g_string_append_c(target, '\n'); + + return TRUE; +} + +struct rspamd_map * +rspamd_map_add(struct rspamd_config *cfg, + const gchar *map_line, + const gchar *description, + map_cb_t read_callback, + map_fin_cb_t fin_callback, + map_dtor_t dtor, + void **user_data, + struct rspamd_worker *worker, + int flags) +{ + struct rspamd_map *map; + struct rspamd_map_backend *bk; + + bk = rspamd_map_parse_backend(cfg, map_line); + if (bk == NULL) { + return NULL; + } + + if (bk->is_fallback) { + msg_err_config("cannot add map with fallback only backend: %s", bk->uri); + REF_RELEASE(bk); + + return NULL; + } + + map = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(struct rspamd_map)); + map->read_callback = read_callback; + map->fin_callback = fin_callback; + map->dtor = dtor; + map->user_data = user_data; + map->cfg = cfg; + map->id = rspamd_random_uint64_fast(); + map->locked = + rspamd_mempool_alloc0_shared(cfg->cfg_pool, sizeof(gint)); 
+ map->backends = g_ptr_array_sized_new(1); + map->wrk = worker; + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, + map->backends); + g_ptr_array_add(map->backends, bk); + map->name = rspamd_mempool_strdup(cfg->cfg_pool, map_line); + map->no_file_read = (flags & RSPAMD_MAP_FILE_NO_READ); + + if (bk->protocol == MAP_PROTO_FILE) { + map->poll_timeout = (cfg->map_timeout * cfg->map_file_watch_multiplier); + } + else { + map->poll_timeout = cfg->map_timeout; + } + + if (description != NULL) { + map->description = rspamd_mempool_strdup(cfg->cfg_pool, description); + } + + rspamd_map_calculate_hash(map); + msg_info_map("added map %s", bk->uri); + bk->map = map; + + cfg->maps = g_list_prepend(cfg->maps, map); + + return map; +} + +struct rspamd_map * +rspamd_map_add_fake(struct rspamd_config *cfg, + const gchar *description, + const gchar *name) +{ + struct rspamd_map *map; + + map = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(struct rspamd_map)); + map->cfg = cfg; + map->id = rspamd_random_uint64_fast(); + map->name = rspamd_mempool_strdup(cfg->cfg_pool, name); + map->user_data = (void **) ↦ /* to prevent null pointer dereferencing */ + + if (description != NULL) { + map->description = rspamd_mempool_strdup(cfg->cfg_pool, description); + } + + return map; +} + +static inline void +rspamd_map_add_backend(struct rspamd_map *map, struct rspamd_map_backend *bk) +{ + if (bk->is_fallback) { + if (map->fallback_backend) { + msg_warn_map("redefining fallback backend from %s to %s", + map->fallback_backend->uri, bk->uri); + } + + map->fallback_backend = bk; + } + else { + g_ptr_array_add(map->backends, bk); + } + + bk->map = map; +} + +struct rspamd_map * +rspamd_map_add_from_ucl(struct rspamd_config *cfg, + const ucl_object_t *obj, + const gchar *description, + map_cb_t read_callback, + map_fin_cb_t fin_callback, + map_dtor_t dtor, + void **user_data, + struct rspamd_worker *worker, + gint flags) +{ + ucl_object_iter_t it = NULL; + const ucl_object_t 
*cur, *elt; + struct rspamd_map *map; + struct rspamd_map_backend *bk; + guint i; + + g_assert(obj != NULL); + + if (ucl_object_type(obj) == UCL_STRING) { + /* Just a plain string */ + return rspamd_map_add(cfg, ucl_object_tostring(obj), description, + read_callback, fin_callback, dtor, user_data, worker, flags); + } + + map = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(struct rspamd_map)); + map->read_callback = read_callback; + map->fin_callback = fin_callback; + map->dtor = dtor; + map->user_data = user_data; + map->cfg = cfg; + map->id = rspamd_random_uint64_fast(); + map->locked = + rspamd_mempool_alloc0_shared(cfg->cfg_pool, sizeof(gint)); + map->backends = g_ptr_array_new(); + map->wrk = worker; + map->no_file_read = (flags & RSPAMD_MAP_FILE_NO_READ); + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, + map->backends); + map->poll_timeout = cfg->map_timeout; + + if (description) { + map->description = rspamd_mempool_strdup(cfg->cfg_pool, description); + } + + if (ucl_object_type(obj) == UCL_ARRAY) { + /* Add array of maps as multiple backends */ + while ((cur = ucl_object_iterate(obj, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_STRING) { + bk = rspamd_map_parse_backend(cfg, ucl_object_tostring(cur)); + + if (bk != NULL) { + rspamd_map_add_backend(map, bk); + + if (!map->name) { + map->name = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(cur)); + } + } + } + else { + msg_err_config("bad map element type: %s", + ucl_object_type_to_string(ucl_object_type(cur))); + } + } + + if (map->backends->len == 0) { + msg_err_config("map has no urls to be loaded: empty list"); + goto err; + } + } + else if (ucl_object_type(obj) == UCL_OBJECT) { + elt = ucl_object_lookup(obj, "name"); + if (elt && ucl_object_type(elt) == UCL_STRING) { + map->name = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(elt)); + } + + elt = ucl_object_lookup(obj, "description"); + if (elt && ucl_object_type(elt) == UCL_STRING) { + 
map->description = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(elt)); + } + + elt = ucl_object_lookup_any(obj, "timeout", "poll", "poll_time", + "watch_interval", NULL); + if (elt) { + map->poll_timeout = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup_any(obj, "upstreams", "url", "urls", NULL); + if (elt == NULL) { + msg_err_config("map has no urls to be loaded: no elt"); + goto err; + } + + if (ucl_object_type(elt) == UCL_ARRAY) { + /* Add array of maps as multiple backends */ + it = ucl_object_iterate_new(elt); + + while ((cur = ucl_object_iterate_safe(it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_STRING) { + bk = rspamd_map_parse_backend(cfg, ucl_object_tostring(cur)); + + if (bk != NULL) { + rspamd_map_add_backend(map, bk); + + if (!map->name) { + map->name = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(cur)); + } + } + } + else { + msg_err_config("bad map element type: %s", + ucl_object_type_to_string(ucl_object_type(cur))); + ucl_object_iterate_free(it); + goto err; + } + } + + ucl_object_iterate_free(it); + + if (map->backends->len == 0) { + msg_err_config("map has no urls to be loaded: empty object list"); + goto err; + } + } + else if (ucl_object_type(elt) == UCL_STRING) { + bk = rspamd_map_parse_backend(cfg, ucl_object_tostring(elt)); + + if (bk != NULL) { + rspamd_map_add_backend(map, bk); + + if (!map->name) { + map->name = rspamd_mempool_strdup(cfg->cfg_pool, + ucl_object_tostring(elt)); + } + } + } + + if (!map->backends || map->backends->len == 0) { + msg_err_config("map has no urls to be loaded: no valid backends"); + goto err; + } + } + else { + msg_err_config("map has invalid type for value: %s", + ucl_object_type_to_string(ucl_object_type(obj))); + goto err; + } + + gboolean all_local = TRUE; + + PTR_ARRAY_FOREACH(map->backends, i, bk) + { + if (bk->protocol == MAP_PROTO_STATIC) { + GString *map_data; + /* We need data field in ucl */ + elt = ucl_object_lookup(obj, "data"); + + if (elt == NULL) { 
+ msg_err_config("map has static backend but no `data` field"); + goto err; + } + + + if (ucl_object_type(elt) == UCL_STRING) { + map_data = g_string_sized_new(32); + + if (rspamd_map_add_static_string(cfg, elt, map_data)) { + bk->data.sd->data = map_data->str; + bk->data.sd->len = map_data->len; + g_string_free(map_data, FALSE); + } + else { + g_string_free(map_data, TRUE); + msg_err_config("map has static backend with invalid `data` field"); + goto err; + } + } + else if (ucl_object_type(elt) == UCL_ARRAY) { + map_data = g_string_sized_new(32); + it = ucl_object_iterate_new(elt); + + while ((cur = ucl_object_iterate_safe(it, true))) { + if (!rspamd_map_add_static_string(cfg, cur, map_data)) { + g_string_free(map_data, TRUE); + msg_err_config("map has static backend with invalid " + "`data` field"); + ucl_object_iterate_free(it); + goto err; + } + } + + ucl_object_iterate_free(it); + bk->data.sd->data = map_data->str; + bk->data.sd->len = map_data->len; + g_string_free(map_data, FALSE); + } + } + else if (bk->protocol != MAP_PROTO_FILE) { + all_local = FALSE; + } + } + + if (all_local) { + map->poll_timeout = (map->poll_timeout * + cfg->map_file_watch_multiplier); + } + + rspamd_map_calculate_hash(map); + msg_debug_map("added map from ucl"); + + cfg->maps = g_list_prepend(cfg->maps, map); + + return map; + +err: + + if (map) { + PTR_ARRAY_FOREACH(map->backends, i, bk) + { + MAP_RELEASE(bk, "rspamd_map_backend"); + } + } + + return NULL; +} + +rspamd_map_traverse_function +rspamd_map_get_traverse_function(struct rspamd_map *map) +{ + if (map) { + return map->traverse_function; + } + + return NULL; +} + +void rspamd_map_traverse(struct rspamd_map *map, rspamd_map_traverse_cb cb, + gpointer cbdata, gboolean reset_hits) +{ + if (*map->user_data && map->traverse_function) { + map->traverse_function(*map->user_data, cb, cbdata, reset_hits); + } +} + +void rspamd_map_set_on_load_function(struct rspamd_map *map, rspamd_map_on_load_function cb, + gpointer cbdata, 
GDestroyNotify dtor) +{ + if (map) { + map->on_load_function = cb; + map->on_load_ud = cbdata; + map->on_load_ud_dtor = dtor; + } +} diff --git a/src/libserver/maps/map.h b/src/libserver/maps/map.h new file mode 100644 index 0000000..04df16e --- /dev/null +++ b/src/libserver/maps/map.h @@ -0,0 +1,168 @@ +#ifndef RSPAMD_MAP_H +#define RSPAMD_MAP_H + +#include "config.h" +#include "contrib/libev/ev.h" + +#include "ucl.h" +#include "mem_pool.h" +#include "radix.h" +#include "dns.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Maps API is designed to load lists data from different dynamic sources. + * It monitor files and HTTP locations for modifications and reload them if they are + * modified. + */ +struct map_cb_data; +struct rspamd_worker; + +/** + * Common map object + */ +struct rspamd_config; +struct rspamd_map; + +/** + * Callback types + */ +typedef gchar *(*map_cb_t)(gchar *chunk, gint len, + struct map_cb_data *data, gboolean final); + +typedef void (*map_fin_cb_t)(struct map_cb_data *data, void **target); + +typedef void (*map_dtor_t)(struct map_cb_data *data); + +typedef gboolean (*rspamd_map_traverse_cb)(gconstpointer key, + gconstpointer value, gsize hits, gpointer ud); + +typedef void (*rspamd_map_traverse_function)(void *data, + rspamd_map_traverse_cb cb, + gpointer cbdata, gboolean reset_hits); +typedef void (*rspamd_map_on_load_function)(struct rspamd_map *map, gpointer ud); + +/** + * Callback data for async load + */ +struct map_cb_data { + struct rspamd_map *map; + gint state; + bool errored; + void *prev_data; + void *cur_data; +}; + +/** + * Returns TRUE if line looks like a map definition + * @param map_line + * @return + */ +gboolean rspamd_map_is_map(const gchar *map_line); + +enum rspamd_map_flags { + RSPAMD_MAP_DEFAULT = 0, + RSPAMD_MAP_FILE_ONLY = 1u << 0u, + RSPAMD_MAP_FILE_NO_READ = 1u << 1u, +}; + +/** + * Add map from line + */ +struct rspamd_map *rspamd_map_add(struct rspamd_config *cfg, + const gchar *map_line, + const 
gchar *description, + map_cb_t read_callback, + map_fin_cb_t fin_callback, + map_dtor_t dtor, + void **user_data, + struct rspamd_worker *worker, + int flags); + +/** + * Add map from ucl + */ +struct rspamd_map *rspamd_map_add_from_ucl(struct rspamd_config *cfg, + const ucl_object_t *obj, + const gchar *description, + map_cb_t read_callback, + map_fin_cb_t fin_callback, + map_dtor_t dtor, + void **user_data, + struct rspamd_worker *worker, + int flags); + +/** + * Adds a fake map structure (for logging purposes mainly) + * @param cfg + * @param description + * @return + */ +struct rspamd_map *rspamd_map_add_fake(struct rspamd_config *cfg, + const gchar *description, + const gchar *name); + + +enum rspamd_map_watch_type { + RSPAMD_MAP_WATCH_MIN = 9, + RSPAMD_MAP_WATCH_PRIMARY_CONTROLLER, + RSPAMD_MAP_WATCH_SCANNER, + RSPAMD_MAP_WATCH_WORKER, + RSPAMD_MAP_WATCH_MAX +}; + +/** + * Start watching of maps by adding events to libevent event loop + */ +void rspamd_map_watch(struct rspamd_config *cfg, + struct ev_loop *event_loop, + struct rspamd_dns_resolver *resolver, + struct rspamd_worker *worker, + enum rspamd_map_watch_type how); + +/** + * Preloads maps where all backends are file + * @param cfg + */ +void rspamd_map_preload(struct rspamd_config *cfg); + +/** + * Remove all maps watched (remove events) + */ +void rspamd_map_remove_all(struct rspamd_config *cfg); + +/** + * Get traverse function for specific map + * @param map + * @return + */ +rspamd_map_traverse_function rspamd_map_get_traverse_function(struct rspamd_map *map); + +/** + * Perform map traverse + * @param map + * @param cb + * @param cbdata + * @param reset_hits + * @return + */ +void rspamd_map_traverse(struct rspamd_map *map, rspamd_map_traverse_cb cb, + gpointer cbdata, gboolean reset_hits); + +/** + * Set map on load callback + * @param map + * @param cb + * @param cbdata + */ +void rspamd_map_set_on_load_function(struct rspamd_map *map, rspamd_map_on_load_function cb, + gpointer cbdata, 
GDestroyNotify dtor); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/maps/map_helpers.c b/src/libserver/maps/map_helpers.c new file mode 100644 index 0000000..65478c5 --- /dev/null +++ b/src/libserver/maps/map_helpers.c @@ -0,0 +1,1845 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "map_helpers.h" +#include "map_private.h" +#include "khash.h" +#include "radix.h" +#include "rspamd.h" +#include "cryptobox.h" +#include "mempool_vars_internal.h" +#include "contrib/fastutf8/fastutf8.h" +#include "contrib/cdb/cdb.h" + +#ifdef WITH_HYPERSCAN +#include "hs.h" +#include "hyperscan_tools.h" +#endif +#ifndef WITH_PCRE2 +#include <pcre.h> +#else +#include <pcre2.h> +#endif + + +static const guint64 map_hash_seed = 0xdeadbabeULL; +static const gchar *const hash_fill = "1"; + +struct rspamd_map_helper_value { + gsize hits; + gconstpointer key; + gchar value[]; /* Null terminated */ +}; + +#define rspamd_map_ftok_hash(t) (rspamd_icase_hash((t).begin, (t).len, rspamd_hash_seed())) +#define rspamd_map_ftok_equal(a, b) ((a).len == (b).len && rspamd_lc_cmp((a).begin, (b).begin, (a).len) == 0) + +KHASH_INIT(rspamd_map_hash, rspamd_ftok_t, + struct rspamd_map_helper_value *, true, + rspamd_map_ftok_hash, rspamd_map_ftok_equal); + +struct rspamd_radix_map_helper { + rspamd_mempool_t *pool; + khash_t(rspamd_map_hash) * htb; + radix_compressed_t *trie; + struct rspamd_map *map; + 
/**
 * Parse a chunk of a `key [value]` list with a character driven state machine.
 *
 * Keys may be plain, double quoted or enclosed in slashes; a `#` starts a
 * comment that lasts until the end of the line. For every complete element
 * `func` is called with the key and either the parsed value or `default_value`
 * when no value is present. The parser state is kept in `data->state`, so it
 * persists between calls; when `final` is set the pending element (if any) is
 * flushed and the state is reset.
 *
 * NOTE(review): `key` is a local variable, while the state survives the call;
 * a key stored just before the chunk ends does not appear to be carried over
 * to the next call - confirm how callers re-feed data from the returned pointer.
 *
 * @param chunk start of the data to parse
 * @param len length of the chunk in bytes
 * @param data callback data holding the map, parser state and current data
 * @param func insertion function called as func(data->cur_data, key, value)
 * @param default_value value used for elements without an explicit value
 * @param final TRUE if this is the last chunk
 * @return pointer to the start of the last token that was being scanned
 */
gchar *
rspamd_parse_kv_list(
    gchar *chunk,
    gint len,
    struct map_cb_data *data,
    rspamd_map_insert_func func,
    const gchar *default_value,
    gboolean final)
{
    /* Parser states; the current one is stored in data->state */
    enum {
        map_skip_spaces_before_key = 0,
        map_read_key,
        map_read_key_quoted,
        map_read_key_slashed,
        map_skip_spaces_after_key,
        map_backslash_quoted,
        map_backslash_slashed,
        map_read_key_after_slash,
        map_read_value,
        map_read_comment_start,
        map_skip_comment,
        map_read_eol,
    };

    /* p is the scan position, c is the start of the current token */
    gchar *c, *p, *key = NULL, *value = NULL, *stripped_key, *stripped_value, *end;
    struct rspamd_map *map = data->map;
    /* Counted from zero on each call; only used in log messages */
    guint line_number = 0;

    p = chunk;
    c = p;
    end = p + len;

    while (p < end) {
        switch (data->state) {
        case map_skip_spaces_before_key:
            if (g_ascii_isspace(*p)) {
                p++;
            }
            else {
                if (*p == '"') {
                    /* Quoted key: the token starts after the opening quote */
                    p++;
                    c = p;
                    data->state = map_read_key_quoted;
                }
                else if (*p == '/') {
                    /* Note that c is on '/' here as '/' is a part of key */
                    c = p;
                    p++;
                    data->state = map_read_key_slashed;
                }
                else {
                    c = p;
                    data->state = map_read_key;
                }
            }
            break;
        case map_read_key:
            /* read key */
            /* Check here comments, eol and end of buffer */
            /* A '#' preceded by a backslash does not start a comment */
            if (*p == '#' && (p == c || *(p - 1) != '\\')) {
                if (p - c > 0) {
                    /* Store a single key */
                    MAP_STORE_KEY;
                    func(data->cur_data, stripped_key, default_value);
                    msg_debug_map("insert key only pair: %s -> %s; line: %d",
                                  stripped_key, default_value, line_number);
                    g_free(key);
                }

                key = NULL;
                data->state = map_read_comment_start;
            }
            else if (*p == '\r' || *p == '\n') {
                if (p - c > 0) {
                    /* Store a single key */
                    MAP_STORE_KEY;
                    func(data->cur_data, stripped_key, default_value);
                    msg_debug_map("insert key only pair: %s -> %s; line: %d",
                                  stripped_key, default_value, line_number);
                    g_free(key);
                }

                data->state = map_read_eol;
                key = NULL;
            }
            else if (g_ascii_isspace(*p)) {
                /* Whitespace ends the key; a value may follow */
                if (p - c > 0) {
                    MAP_STORE_KEY;
                    data->state = map_skip_spaces_after_key;
                }
                else {
                    msg_err_map("empty or invalid key found on line %d", line_number);
                    data->state = map_skip_comment;
                }
            }
            else {
                p++;
            }
            break;
        case map_read_key_quoted:
            if (*p == '\\') {
                /* The character after a backslash is skipped unconditionally */
                data->state = map_backslash_quoted;
                p++;
            }
            else if (*p == '"') {
                /* Allow empty keys in this case */
                if (p - c >= 0) {
                    MAP_STORE_KEY;
                    data->state = map_skip_spaces_after_key;
                }
                else {
                    g_assert_not_reached();
                }
                p++;
            }
            else {
                p++;
            }
            break;
        case map_read_key_slashed:
            if (*p == '\\') {
                data->state = map_backslash_slashed;
                p++;
            }
            else if (*p == '/') {
                /* Allow empty keys in this case */
                /* p is not advanced: the closing '/' is consumed by the next state */
                if (p - c >= 0) {
                    data->state = map_read_key_after_slash;
                }
                else {
                    g_assert_not_reached();
                }
            }
            else {
                p++;
            }
            break;
        case map_read_key_after_slash:
            /*
             * This state is equal to reading of key but '/' is not
             * treated specially
             */
            if (*p == '#') {
                if (p - c > 0) {
                    /* Store a single key */
                    MAP_STORE_KEY;
                    func(data->cur_data, stripped_key, default_value);
                    msg_debug_map("insert key only pair: %s -> %s; line: %d",
                                  stripped_key, default_value, line_number);
                    g_free(key);
                    key = NULL;
                }

                data->state = map_read_comment_start;
            }
            else if (*p == '\r' || *p == '\n') {
                if (p - c > 0) {
                    /* Store a single key */
                    MAP_STORE_KEY;
                    func(data->cur_data, stripped_key, default_value);

                    msg_debug_map("insert key only pair: %s -> %s; line: %d",
                                  stripped_key, default_value, line_number);
                    g_free(key);
                    key = NULL;
                }

                data->state = map_read_eol;
                key = NULL;
            }
            else if (g_ascii_isspace(*p)) {
                if (p - c > 0) {
                    MAP_STORE_KEY;
                    data->state = map_skip_spaces_after_key;
                }
                else {
                    msg_err_map("empty or invalid key found on line %d", line_number);
                    data->state = map_skip_comment;
                }
            }
            else {
                p++;
            }
            break;
        case map_backslash_quoted:
            /* Skip the escaped character and continue the quoted key */
            p++;
            data->state = map_read_key_quoted;
            break;
        case map_backslash_slashed:
            /* Skip the escaped character and continue the slashed key */
            p++;
            data->state = map_read_key_slashed;
            break;
        case map_skip_spaces_after_key:
            /* Only spaces and tabs are skipped here; anything else starts the value */
            if (*p == ' ' || *p == '\t') {
                p++;
            }
            else {
                c = p;
                data->state = map_read_value;
            }
            break;
        case map_read_value:
            if (key == NULL) {
                /* Ignore line */
                msg_err_map("empty or invalid key found on line %d", line_number);
                data->state = map_skip_comment;
            }
            else {
                if (*p == '#') {
                    if (p - c > 0) {
                        /* Store a single key */
                        MAP_STORE_VALUE;
                        func(data->cur_data, stripped_key, stripped_value);
                        msg_debug_map("insert key value pair: %s -> %s; line: %d",
                                      stripped_key, stripped_value, line_number);
                        g_free(key);
                        g_free(value);
                        key = NULL;
                        value = NULL;
                    }
                    else {
                        /* No value before the comment: fall back to the default */
                        func(data->cur_data, stripped_key, default_value);
                        msg_debug_map("insert key only pair: %s -> %s; line: %d",
                                      stripped_key, default_value, line_number);
                        g_free(key);
                        key = NULL;
                    }

                    data->state = map_read_comment_start;
                }
                else if (*p == '\r' || *p == '\n') {
                    if (p - c > 0) {
                        /* Store a single key */
                        MAP_STORE_VALUE;
                        func(data->cur_data, stripped_key, stripped_value);
                        msg_debug_map("insert key value pair: %s -> %s",
                                      stripped_key, stripped_value);
                        g_free(key);
                        g_free(value);
                        key = NULL;
                        value = NULL;
                    }
                    else {
                        /* No value before the end of line: fall back to the default */
                        func(data->cur_data, stripped_key, default_value);
                        msg_debug_map("insert key only pair: %s -> %s",
                                      stripped_key, default_value);
                        g_free(key);
                        key = NULL;
                    }

                    data->state = map_read_eol;
                    key = NULL;
                }
                else {
                    p++;
                }
            }
            break;
        case map_read_comment_start:
            /* This state is only entered with p pointing at '#' */
            if (*p == '#') {
                data->state = map_skip_comment;
                p++;
                key = NULL;
                value = NULL;
            }
            else {
                g_assert_not_reached();
            }
            break;
        case map_skip_comment:
            if (*p == '\r' || *p == '\n') {
                data->state = map_read_eol;
            }
            else {
                p++;
            }
            break;
        case map_read_eol:
            /* Skip \r\n and whitespaces */
            if (*p == '\r' || *p == '\n') {
                if (*p == '\n') {
                    /* We don't care about \r only line separators, they are too rare */
                    line_number++;
                }
                p++;
            }
            else {
                data->state = map_skip_spaces_before_key;
            }
            break;
        default:
            g_assert_not_reached();
            break;
        }
    }

    if (final) {
        /* Examine the state */
        /* Flush an element that was not terminated by an end of line */
        switch (data->state) {
        case map_read_key:
        case map_read_key_slashed:
        case map_read_key_quoted:
        case map_read_key_after_slash:
            if (p - c > 0) {
                /* Store a single key */
                MAP_STORE_KEY;
                func(data->cur_data, stripped_key, default_value);
                msg_debug_map("insert key only pair: %s -> %s",
                              stripped_key, default_value);
                g_free(key);
                key = NULL;
            }
            break;
        case map_read_value:
            if (key == NULL) {
                /* Ignore line */
                msg_err_map("empty or invalid key found on line %d", line_number);
                data->state = map_skip_comment;
            }
            else {
                if (p - c > 0) {
                    /* Store a single key */
                    MAP_STORE_VALUE;
                    func(data->cur_data, stripped_key, stripped_value);
                    msg_debug_map("insert key value pair: %s -> %s",
                                  stripped_key, stripped_value);
                    g_free(key);
                    g_free(value);
                    key = NULL;
                    value = NULL;
                }
                else {
                    func(data->cur_data, stripped_key, default_value);
                    msg_debug_map("insert key only pair: %s -> %s",
                                  stripped_key, default_value);
                    g_free(key);
                    key = NULL;
                }
            }
            break;
        }

        /* Start from a clean state on the next load */
        data->state = map_skip_spaces_before_key;
    }

    return c;
}
stripped_value); + msg_debug_map("insert key value pair: %s -> %s", + stripped_key, stripped_value); + g_free(key); + g_free(value); + key = NULL; + value = NULL; + } + else { + func(data->cur_data, stripped_key, default_value); + msg_debug_map("insert key only pair: %s -> %s", + stripped_key, default_value); + g_free(key); + key = NULL; + } + } + break; + } + + data->state = map_skip_spaces_before_key; + } + + return c; +} + +/** + * Radix tree helper function + */ +void rspamd_map_helper_insert_radix(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_radix_map_helper *r = (struct rspamd_radix_map_helper *) st; + struct rspamd_map_helper_value *val; + gsize vlen; + khiter_t k; + gconstpointer nk; + rspamd_ftok_t tok; + gint res; + struct rspamd_map *map; + + map = r->map; + tok.begin = key; + tok.len = strlen(key); + + k = kh_get(rspamd_map_hash, r->htb, tok); + + if (k == kh_end(r->htb)) { + nk = rspamd_mempool_strdup(r->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, r->htb, tok, &res); + } + else { + val = kh_value(r->htb, k); + + if (strcmp(value, val->value) == 0) { + /* Same element, skip */ + return; + } + else { + msg_warn_map("duplicate radix entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + } + + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + + return; /* do not touch radix in case of exact duplicate */ + } + + vlen = strlen(value); + val = rspamd_mempool_alloc0(r->pool, sizeof(*val) + + vlen + 1); + memcpy(val->value, value, vlen); + + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + rspamd_radix_add_iplist(key, ",", r->trie, val, FALSE, + r->map->name); + rspamd_cryptobox_fast_hash_update(&r->hst, nk, tok.len); +} + +void rspamd_map_helper_insert_radix_resolve(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_radix_map_helper *r = (struct rspamd_radix_map_helper *) st; + struct 
rspamd_map_helper_value *val; + gsize vlen; + khiter_t k; + gconstpointer nk; + rspamd_ftok_t tok; + gint res; + struct rspamd_map *map; + + map = r->map; + + if (!key) { + msg_warn_map("cannot insert NULL value in the map: %s", + map->name); + return; + } + + tok.begin = key; + tok.len = strlen(key); + + k = kh_get(rspamd_map_hash, r->htb, tok); + + if (k == kh_end(r->htb)) { + nk = rspamd_mempool_strdup(r->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, r->htb, tok, &res); + } + else { + val = kh_value(r->htb, k); + + if (strcmp(value, val->value) == 0) { + /* Same element, skip */ + return; + } + else { + msg_warn_map("duplicate radix entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + } + + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + + return; /* do not touch radix in case of exact duplicate */ + } + + vlen = strlen(value); + val = rspamd_mempool_alloc0(r->pool, sizeof(*val) + + vlen + 1); + memcpy(val->value, value, vlen); + nk = kh_key(r->htb, k).begin; + val->key = nk; + kh_value(r->htb, k) = val; + rspamd_radix_add_iplist(key, ",", r->trie, val, TRUE, + r->map->name); + rspamd_cryptobox_fast_hash_update(&r->hst, nk, tok.len); +} + +void rspamd_map_helper_insert_hash(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_hash_map_helper *ht = st; + struct rspamd_map_helper_value *val; + khiter_t k; + gconstpointer nk; + gsize vlen; + gint r; + rspamd_ftok_t tok; + struct rspamd_map *map; + + tok.begin = key; + tok.len = strlen(key); + map = ht->map; + + k = kh_get(rspamd_map_hash, ht->htb, tok); + + if (k == kh_end(ht->htb)) { + nk = rspamd_mempool_strdup(ht->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, ht->htb, tok, &r); + } + else { + val = kh_value(ht->htb, k); + + if (strcmp(value, val->value) == 0) { + /* Same element, skip */ + return; + } + else { + msg_warn_map("duplicate hash entry found for map %s: %s (old value: '%s', new: 
'%s')", + map->name, key, val->value, value); + } + } + + /* Null termination due to alloc0 */ + vlen = strlen(value); + val = rspamd_mempool_alloc0(ht->pool, sizeof(*val) + vlen + 1); + memcpy(val->value, value, vlen); + + tok = kh_key(ht->htb, k); + nk = tok.begin; + val->key = nk; + kh_value(ht->htb, k) = val; + + rspamd_cryptobox_fast_hash_update(&ht->hst, nk, tok.len); +} + +void rspamd_map_helper_insert_re(gpointer st, gconstpointer key, gconstpointer value) +{ + struct rspamd_regexp_map_helper *re_map = st; + struct rspamd_map *map; + rspamd_regexp_t *re; + gchar *escaped; + GError *err = NULL; + gint pcre_flags; + gsize escaped_len; + struct rspamd_map_helper_value *val; + khiter_t k; + rspamd_ftok_t tok; + gconstpointer nk; + gsize vlen; + gint r; + + map = re_map->map; + + tok.begin = key; + tok.len = strlen(key); + + k = kh_get(rspamd_map_hash, re_map->htb, tok); + + if (k == kh_end(re_map->htb)) { + nk = rspamd_mempool_strdup(re_map->pool, key); + tok.begin = nk; + k = kh_put(rspamd_map_hash, re_map->htb, tok, &r); + } + else { + val = kh_value(re_map->htb, k); + + /* Always warn about regexp duplicate as it's likely a bad mistake */ + msg_warn_map("duplicate re entry found for map %s: %s (old value: '%s', new: '%s')", + map->name, key, val->value, value); + + if (strcmp(val->value, value) == 0) { + /* Same value, skip */ + return; + } + + /* Replace value but do not touch regexp */ + nk = kh_key(re_map->htb, k).begin; + val->key = nk; + kh_value(re_map->htb, k) = val; + + return; + } + + /* Check regexp stuff */ + if (re_map->map_flags & RSPAMD_REGEXP_MAP_FLAG_GLOB) { + escaped = rspamd_str_regexp_escape(key, strlen(key), &escaped_len, + RSPAMD_REGEXP_ESCAPE_GLOB | RSPAMD_REGEXP_ESCAPE_UTF); + re = rspamd_regexp_new(escaped, NULL, &err); + g_free(escaped); + } + else { + re = rspamd_regexp_new(key, NULL, &err); + } + + if (re == NULL) { + msg_err_map("cannot parse regexp %s: %e", key, err); + + if (err) { + g_error_free(err); + } + + return; + } + + 
vlen = strlen(value); + val = rspamd_mempool_alloc0(re_map->pool, sizeof(*val) + + vlen + 1); + memcpy(val->value, value, vlen); /* Null terminated due to alloc0 previously */ + nk = kh_key(re_map->htb, k).begin; + val->key = nk; + kh_value(re_map->htb, k) = val; + rspamd_cryptobox_hash_update(&re_map->hst, nk, tok.len); + + pcre_flags = rspamd_regexp_get_pcre_flags(re); + +#ifndef WITH_PCRE2 + if (pcre_flags & PCRE_FLAG(UTF8)) { + re_map->map_flags |= RSPAMD_REGEXP_MAP_FLAG_UTF; + } +#else + if (pcre_flags & PCRE_FLAG(UTF)) { + re_map->map_flags |= RSPAMD_REGEXP_MAP_FLAG_UTF; + } +#endif + + g_ptr_array_add(re_map->regexps, re); + g_ptr_array_add(re_map->values, val); +} + +static void +rspamd_map_helper_traverse_regexp(void *data, + rspamd_map_traverse_cb cb, + gpointer cbdata, + gboolean reset_hits) +{ + rspamd_ftok_t tok; + struct rspamd_map_helper_value *val; + struct rspamd_regexp_map_helper *re_map = data; + + kh_foreach(re_map->htb, tok, val, { + if (!cb(tok.begin, val->value, val->hits, cbdata)) { + break; + } + + if (reset_hits) { + val->hits = 0; + } + }); +} + +struct rspamd_hash_map_helper * +rspamd_map_helper_new_hash(struct rspamd_map *map) +{ + struct rspamd_hash_map_helper *htb; + rspamd_mempool_t *pool; + + if (map) { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + map->tag, 0); + } + else { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + NULL, 0); + } + + htb = rspamd_mempool_alloc0_type(pool, struct rspamd_hash_map_helper); + htb->htb = kh_init(rspamd_map_hash); + htb->pool = pool; + htb->map = map; + rspamd_cryptobox_fast_hash_init(&htb->hst, map_hash_seed); + + return htb; +} + +void rspamd_map_helper_destroy_hash(struct rspamd_hash_map_helper *r) +{ + if (r == NULL || r->pool == NULL) { + return; + } + + rspamd_mempool_t *pool = r->pool; + kh_destroy(rspamd_map_hash, r->htb); + memset(r, 0, sizeof(*r)); + rspamd_mempool_delete(pool); +} + +static void +rspamd_map_helper_traverse_hash(void *data, + 
rspamd_map_traverse_cb cb, + gpointer cbdata, + gboolean reset_hits) +{ + rspamd_ftok_t tok; + struct rspamd_map_helper_value *val; + struct rspamd_hash_map_helper *ht = data; +/* Stop as soon as cb returns FALSE; hit counters are reset only for entries that cb accepted */ + kh_foreach(ht->htb, tok, val, { + if (!cb(tok.begin, val->value, val->hits, cbdata)) { + break; + } + + if (reset_hits) { + val->hits = 0; + } + }); +} +/* Creates a radix map helper: a compressed radix trie plus a key/value hash, both owned by a new memory pool; map may be NULL, then the trie is named "unnamed" */ +struct rspamd_radix_map_helper * +rspamd_map_helper_new_radix(struct rspamd_map *map) +{ + struct rspamd_radix_map_helper *r; + rspamd_mempool_t *pool; + const gchar *name = "unnamed"; + + if (map) { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + map->tag, 0); + name = map->name; + } + else { + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + NULL, 0); + } + + r = rspamd_mempool_alloc0_type(pool, struct rspamd_radix_map_helper); + r->trie = radix_create_compressed_with_pool(pool, name); + r->htb = kh_init(rspamd_map_hash); + r->pool = pool; + r->map = map; + rspamd_cryptobox_fast_hash_init(&r->hst, map_hash_seed); + + return r; +} +/* Destroys a radix map helper; the trie is created with the helper's pool and is not freed separately here, the pool is deleted last */ +void rspamd_map_helper_destroy_radix(struct rspamd_radix_map_helper *r) +{ + if (r == NULL || !r->pool) { + return; + } + + kh_destroy(rspamd_map_hash, r->htb); + rspamd_mempool_t *pool = r->pool; + memset(r, 0, sizeof(*r)); + rspamd_mempool_delete(pool); +} +/* Traversal callback for radix maps: walks the key/value hash (r->htb), not the trie */ +static void +rspamd_map_helper_traverse_radix(void *data, + rspamd_map_traverse_cb cb, + gpointer cbdata, + gboolean reset_hits) +{ + rspamd_ftok_t tok; + struct rspamd_map_helper_value *val; + struct rspamd_radix_map_helper *r = data; + + kh_foreach(r->htb, tok, val, { + if (!cb(tok.begin, val->value, val->hits, cbdata)) { + break; + } + + if (reset_hits) { + val->hits = 0; + } + }); +} +/* Creates a regexp map helper with the given flags. NOTE(review): unlike rspamd_map_helper_new_radix, map is dereferenced unconditionally here, so it must not be NULL */ +struct rspamd_regexp_map_helper * +rspamd_map_helper_new_regexp(struct rspamd_map *map, + enum rspamd_regexp_map_flags flags) +{ + struct rspamd_regexp_map_helper *re_map; + rspamd_mempool_t *pool; + + pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + map->tag, 0); + + re_map = rspamd_mempool_alloc0_type(pool, struct 
rspamd_regexp_map_helper); + re_map->pool = pool; + re_map->values = g_ptr_array_new(); + re_map->regexps = g_ptr_array_new(); + re_map->map = map; + re_map->map_flags = flags; + re_map->htb = kh_init(rspamd_map_hash); + rspamd_cryptobox_hash_init(&re_map->hst, NULL, 0); + + return re_map; +} + +/* Releases everything owned by a regexp map helper: hyperscan data (when built with it), regexp references, both pointer arrays, the hash and finally the pool that holds the helper itself */ +void rspamd_map_helper_destroy_regexp(struct rspamd_regexp_map_helper *re_map) +{ + rspamd_regexp_t *re; + guint i; + + if (!re_map || !re_map->regexps) { + return; + } + +#ifdef WITH_HYPERSCAN + if (re_map->hs_scratch) { + hs_free_scratch(re_map->hs_scratch); + } + if (re_map->hs_db) { + rspamd_hyperscan_free(re_map->hs_db, false); + } + if (re_map->patterns) { + for (i = 0; i < re_map->regexps->len; i++) { + g_free(re_map->patterns[i]); + } + + g_free(re_map->patterns); + } + if (re_map->flags) { + g_free(re_map->flags); + } + if (re_map->ids) { + g_free(re_map->ids); + } +#endif +/* The array holds one reference per compiled regexp */ + for (i = 0; i < re_map->regexps->len; i++) { + re = g_ptr_array_index(re_map->regexps, i); + rspamd_regexp_unref(re); + } + + g_ptr_array_free(re_map->regexps, TRUE); + g_ptr_array_free(re_map->values, TRUE); + kh_destroy(rspamd_map_hash, re_map->htb); +/* re_map is allocated from its own pool: save the pool pointer before wiping the structure */ + rspamd_mempool_t *pool = re_map->pool; + memset(re_map, 0, sizeof(*re_map)); + rspamd_mempool_delete(pool); +} +/* Read callback for key/value lists: lazily creates the hash helper and parses the chunk, using an empty string as the default value */ +gchar * +rspamd_kv_list_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + if (data->cur_data == NULL) { + data->cur_data = rspamd_map_helper_new_hash(data->map); + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_hash, + "", + final); +} +/* Finish callback for key/value lists: on error drops the new data, otherwise publishes it via *target and destroys the previous data */ +void rspamd_kv_list_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_map *map = data->map; + struct rspamd_hash_map_helper *htb; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + htb = (struct rspamd_hash_map_helper *) data->cur_data; + 
rspamd_map_helper_destroy_hash(htb); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + htb = (struct rspamd_hash_map_helper *) data->cur_data; + msg_info_map("read hash of %d elements from %s", kh_size(htb->htb), + map->name); + data->map->traverse_function = rspamd_map_helper_traverse_hash; + data->map->nelts = kh_size(htb->htb); + data->map->digest = rspamd_cryptobox_fast_hash_final(&htb->hst); + } +/* Publish the new helper (may be NULL when nothing was read) */ + if (target) { + *target = data->cur_data; + } +/* The previous generation is released only after the new one has been published */ + if (data->prev_data) { + htb = (struct rspamd_hash_map_helper *) data->prev_data; + rspamd_map_helper_destroy_hash(htb); + } + } +} +/* Destructor callback for key/value lists: destroys the current hash helper, if any */ +void rspamd_kv_list_dtor(struct map_cb_data *data) +{ + struct rspamd_hash_map_helper *htb; + + if (data->cur_data) { + htb = (struct rspamd_hash_map_helper *) data->cur_data; + rspamd_map_helper_destroy_hash(htb); + } +} +/* Read callback for radix (ip/mask) lists: lazily creates the radix helper and parses the chunk, using hash_fill as the default value */ +gchar * +rspamd_radix_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_radix_map_helper *r; + struct rspamd_map *map = data->map; + + if (data->cur_data == NULL) { + r = rspamd_map_helper_new_radix(map); + data->cur_data = r; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_radix, + hash_fill, + final); +} +/* Finish callback for radix lists: same publish/cleanup protocol as rspamd_kv_list_fin */ +void rspamd_radix_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_map *map = data->map; + struct rspamd_radix_map_helper *r; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + r = (struct rspamd_radix_map_helper *) data->cur_data; + rspamd_map_helper_destroy_radix(r); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + r = (struct rspamd_radix_map_helper *) data->cur_data; + msg_info_map("read radix trie of %z elements: %s", + radix_get_size(r->trie), radix_get_info(r->trie)); + data->map->traverse_function = rspamd_map_helper_traverse_radix; + data->map->nelts = 
kh_size(r->htb); + data->map->digest = rspamd_cryptobox_fast_hash_final(&r->hst); + } + + if (target) { + *target = data->cur_data; + } + + if (data->prev_data) { + r = (struct rspamd_radix_map_helper *) data->prev_data; + rspamd_map_helper_destroy_radix(r); + } + } +} +/* Destructor callback for radix lists: destroys the current radix helper, if any */ +void rspamd_radix_dtor(struct map_cb_data *data) +{ + struct rspamd_radix_map_helper *r; + + if (data->cur_data) { + r = (struct rspamd_radix_map_helper *) data->cur_data; + rspamd_map_helper_destroy_radix(r); + } +} + +#ifdef WITH_HYPERSCAN +/* Tries to load a compiled hyperscan database for this map from <hs_cache_dir>/<re_digest as hex>.hsmc; returns TRUE when re_map->hs_db has been set */ +static gboolean +rspamd_try_load_re_map_cache(struct rspamd_regexp_map_helper *re_map) +{ + gchar fp[PATH_MAX]; + struct rspamd_map *map; + + map = re_map->map; + + if (!map->cfg->hs_cache_dir) { + return FALSE; + } + + rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hsmc", + map->cfg->hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); + + re_map->hs_db = rspamd_hyperscan_maybe_load(fp, 0); + + return re_map->hs_db != NULL; +} +/* Serializes re_map->hs_db into a temporary file inside hs_cache_dir and renames it to the digest-based name used by the loader above; failures are logged and the temporary file is unlinked */ +static gboolean +rspamd_try_save_re_map_cache(struct rspamd_regexp_map_helper *re_map) +{ + gchar fp[PATH_MAX], np[PATH_MAX]; + gsize len; + gint fd; + char *bytes = NULL; + struct rspamd_map *map; + + map = re_map->map; + + if (!map->cfg->hs_cache_dir) { + return FALSE; + } + + rspamd_snprintf(fp, sizeof(fp), "%s/hsmc-XXXXXXXXXXXXX", + re_map->map->cfg->hs_cache_dir); + + if ((fd = g_mkstemp_full(fp, O_WRONLY | O_CREAT | O_EXCL, 00644)) != -1) { + if (hs_serialize_database(rspamd_hyperscan_get_database(re_map->hs_db), &bytes, &len) == HS_SUCCESS) { + /* NOTE(review): only -1 is treated as a failure, a short write is not detected here */ + if (write(fd, bytes, len) == -1) { + msg_warn_map("cannot write hyperscan cache to %s: %s", + fp, strerror(errno)); + unlink(fp); + free(bytes); + } + else { + free(bytes); + fsync(fd); +/* Data has been written to the temporary file: move it to its final digest-based name */ + rspamd_snprintf(np, sizeof(np), "%s/%*xs.hsmc", + re_map->map->cfg->hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); + + if (rename(fp, np) == -1) { + msg_warn_map("cannot rename hyperscan cache from %s to %s: %s", + fp, np, strerror(errno)); + unlink(fp); 
+ } + else { + msg_info_map("written cached hyperscan data for %s to %s (%Hz length)", + map->name, np, len); + rspamd_hyperscan_notice_known(np); + } + } + } + else { + msg_warn_map("cannot serialize hyperscan cache to %s: %s", + fp, strerror(errno)); + unlink(fp); + } + + + close(fd); + } + /* NOTE(review): FALSE is returned on every path, including a successful save */ + return FALSE; +} + +#endif + +/** + * Prepares hyperscan matching for a regexp map: builds the pattern, flag and + * id arrays (one slot per compiled regexp), then loads a cached database or + * compiles a new one and allocates scratch space. + * Compiles to an empty function without WITH_HYPERSCAN. When any step fails, + * hs_db is left unset and the matchers use the PCRE path. + * @param re_map regexp map helper whose regexps have all been inserted + */ +static void +rspamd_re_map_finalize(struct rspamd_regexp_map_helper *re_map) +{ +#ifdef WITH_HYPERSCAN + guint i; + hs_platform_info_t plt; + hs_compile_error_t *err; + struct rspamd_map *map; + rspamd_regexp_t *re; + gint pcre_flags; + + map = re_map->map; + +#if !defined(__aarch64__) && !defined(__powerpc64__) + if (!(map->cfg->libs_ctx->crypto_ctx->cpu_config & CPUID_SSSE3)) { + msg_info_map("disable hyperscan for map %s, ssse3 instructions are not supported by CPU", + map->name); + return; + } +#endif + + if (hs_populate_platform(&plt) != HS_SUCCESS) { + msg_err_map("cannot populate hyperscan platform"); + return; + } + + re_map->patterns = g_new(gchar *, re_map->regexps->len); + re_map->flags = g_new(gint, re_map->regexps->len); + re_map->ids = g_new(gint, re_map->regexps->len); + + for (i = 0; i < re_map->regexps->len; i++) { + const gchar *pat; + gchar *escaped; + gint pat_flags; + + re = g_ptr_array_index(re_map->regexps, i); + pcre_flags = rspamd_regexp_get_pcre_flags(re); + pat = rspamd_regexp_get_pattern(re); + pat_flags = rspamd_regexp_get_flags(re); + + /* + * flags[] comes from g_new() and is not zeroed: assign the base value + * before any |= so that the UTF8 bit set below is neither computed from + * garbage nor overwritten afterwards + */ + re_map->flags[i] = HS_FLAG_SINGLEMATCH; + + if (pat_flags & RSPAMD_REGEXP_FLAG_UTF) { + escaped = rspamd_str_regexp_escape(pat, strlen(pat), NULL, + RSPAMD_REGEXP_ESCAPE_RE | RSPAMD_REGEXP_ESCAPE_UTF); + re_map->flags[i] |= HS_FLAG_UTF8; + } + else { + escaped = rspamd_str_regexp_escape(pat, strlen(pat), NULL, + RSPAMD_REGEXP_ESCAPE_RE); + } + + /* Ownership of escaped moves to the patterns array */ + re_map->patterns[i] = escaped; + +#ifndef WITH_PCRE2 + if (pcre_flags & PCRE_FLAG(UTF8)) { + re_map->flags[i] |= HS_FLAG_UTF8; + } +#else + if (pcre_flags & PCRE_FLAG(UTF)) { + re_map->flags[i] |= HS_FLAG_UTF8; + } +#endif + if (pcre_flags & 
PCRE_FLAG(CASELESS)) { + re_map->flags[i] |= HS_FLAG_CASELESS; + } + if (pcre_flags & PCRE_FLAG(MULTILINE)) { + re_map->flags[i] |= HS_FLAG_MULTILINE; + } + if (pcre_flags & PCRE_FLAG(DOTALL)) { + re_map->flags[i] |= HS_FLAG_DOTALL; + } + if (rspamd_regexp_get_maxhits(re) == 1) { + re_map->flags[i] |= HS_FLAG_SINGLEMATCH; + } +/* The hyperscan id of a pattern is its index in regexps/values, so match handlers can index those arrays by id */ + re_map->ids[i] = i; + } + + if (re_map->regexps->len > 0 && re_map->patterns) { +/* Prefer a database cached on disk; compile (and then try to cache) only when none can be loaded */ + if (!rspamd_try_load_re_map_cache(re_map)) { + gdouble ts1 = rspamd_get_ticks(FALSE); + hs_database_t *hs_db = NULL; + + if (hs_compile_multi((const gchar **) re_map->patterns, + re_map->flags, + re_map->ids, + re_map->regexps->len, + HS_MODE_BLOCK, + &plt, + &hs_db, + &err) != HS_SUCCESS) { +/* err->expression is the index of the offending pattern when it is non-negative */ + msg_err_map("cannot create tree of regexp when processing '%s': %s", + err->expression >= 0 ? re_map->patterns[err->expression] : "unknown regexp", err->message); + re_map->hs_db = NULL; + hs_free_compile_error(err); + + return; + } +/* Wrap the raw database; the cache file name is passed along only when a cache directory is configured */ + if (re_map->map->cfg->hs_cache_dir) { + char fpath[PATH_MAX]; + rspamd_snprintf(fpath, sizeof(fpath), "%s/%*xs.hsmc", + re_map->map->cfg->hs_cache_dir, + (gint) rspamd_cryptobox_HASHBYTES / 2, re_map->re_digest); + re_map->hs_db = rspamd_hyperscan_from_raw_db(hs_db, fpath); + } + else { + re_map->hs_db = rspamd_hyperscan_from_raw_db(hs_db, NULL); + } +/* Report the compilation time in milliseconds */ + ts1 = (rspamd_get_ticks(FALSE) - ts1) * 1000.0; + msg_info_map("hyperscan compiled %d regular expressions from %s in %.1f ms", + re_map->regexps->len, re_map->map->name, ts1); + rspamd_try_save_re_map_cache(re_map); + } + else { + msg_info_map("hyperscan read %d cached regular expressions from %s", + re_map->regexps->len, re_map->map->name); + } +/* Without scratch space the database cannot be scanned: drop it so the matchers do not take the hyperscan path */ + if (hs_alloc_scratch(rspamd_hyperscan_get_database(re_map->hs_db), &re_map->hs_scratch) != HS_SUCCESS) { + msg_err_map("cannot allocate scratch space for hyperscan"); + rspamd_hyperscan_free(re_map->hs_db, true); + re_map->hs_db = NULL; + } + } + else { + msg_err_map("regexp map is empty"); + } +#endif +} + +gchar * 
+rspamd_regexp_list_read_single( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; +/* Read callback for plain regexp lists: the helper is created lazily, with no flags */ + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, 0); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} +/* Read callback for glob lists: same as above, but keys are treated as globs (RSPAMD_REGEXP_MAP_FLAG_GLOB) */ +gchar * +rspamd_glob_list_read_single( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, RSPAMD_REGEXP_MAP_FLAG_GLOB); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} +/* Read callback for regexp lists created with RSPAMD_REGEXP_MAP_FLAG_MULTIPLE */ +gchar * +rspamd_regexp_list_read_multiple( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, + RSPAMD_REGEXP_MAP_FLAG_MULTIPLE); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} +/* Read callback for glob lists created with both the GLOB and MULTIPLE flags */ +gchar * +rspamd_glob_list_read_multiple( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_regexp_map_helper *re_map; + + if (data->cur_data == NULL) { + re_map = rspamd_map_helper_new_regexp(data->map, + RSPAMD_REGEXP_MAP_FLAG_GLOB | RSPAMD_REGEXP_MAP_FLAG_MULTIPLE); + data->cur_data = re_map; + } + + return rspamd_parse_kv_list( + chunk, + len, + data, + rspamd_map_helper_insert_re, + hash_fill, + final); +} + +/* Finish callback for regexp/glob lists: on error drops the new data, otherwise finalizes the digest and hyperscan data, publishes the helper via *target and destroys the previous one */ +void rspamd_regexp_list_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_regexp_map_helper *re_map = NULL, *old_re_map; + struct rspamd_map *map = data->map; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + 
msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + re_map = (struct rspamd_regexp_map_helper *) data->cur_data; + rspamd_map_helper_destroy_regexp(re_map); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + re_map = data->cur_data; + rspamd_cryptobox_hash_final(&re_map->hst, re_map->re_digest); + memcpy(&data->map->digest, re_map->re_digest, sizeof(data->map->digest)); + rspamd_re_map_finalize(re_map); + msg_info_map("read regexp list of %ud elements", + re_map->regexps->len); + data->map->traverse_function = rspamd_map_helper_traverse_regexp; + data->map->nelts = kh_size(re_map->htb); + } +/* Publish the new helper (may be NULL when nothing was read) */ + if (target) { + *target = data->cur_data; + } +/* The previous generation is released only after the new one has been published */ + if (data->prev_data) { + old_re_map = data->prev_data; + rspamd_map_helper_destroy_regexp(old_re_map); + } + } +} +void rspamd_regexp_list_dtor(struct map_cb_data *data) +{ + if (data->cur_data) { + rspamd_map_helper_destroy_regexp(data->cur_data); + } +} +/* Hyperscan match callback for single-match lookups: stores the id of the matched pattern into the guint pointed to by context */ +#ifdef WITH_HYPERSCAN +static int +rspamd_match_hs_single_handler(unsigned int id, unsigned long long from, + unsigned long long to, + unsigned int flags, void *context) +{ + guint *i = context; + /* Always return non-zero as we need a single match here */ + + *i = id; + + return 1; +} +#endif +/* Returns the value of one matching regexp for in/len (NULL when nothing matches or the map is unusable) and increments that entry's hit counter; in must not be NULL */ +gconstpointer +rspamd_match_regexp_map_single(struct rspamd_regexp_map_helper *map, + const gchar *in, gsize len) +{ + guint i; + rspamd_regexp_t *re; + gint res = 0; + gpointer ret = NULL; + struct rspamd_map_helper_value *val; + gboolean validated = FALSE; + + g_assert(in != NULL); + + if (map == NULL || len == 0 || map->regexps == NULL) { + return NULL; + } +/* validated stays FALSE only when the map has UTF expressions and the input is not valid UTF-8 */ + if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { + if (rspamd_fast_utf8_validate(in, len) == 0) { + validated = TRUE; + } + } + else { + validated = TRUE; + } + +#ifdef WITH_HYPERSCAN + if (map->hs_db && map->hs_scratch) { +/* Hyperscan is used only for validated input; otherwise control falls through to the PCRE loop below */ + if (validated) { + + res = hs_scan(rspamd_hyperscan_get_database(map->hs_db), in, len, 0, + map->hs_scratch, + rspamd_match_hs_single_handler, 
(void *) &i); +/* The handler stops the scan on the first match, which hs_scan reports as HS_SCAN_TERMINATED; only then has i been written */ + if (res == HS_SCAN_TERMINATED) { + res = 1; + val = g_ptr_array_index(map->values, i); + + ret = val->value; + val->hits++; + } +/* The hyperscan result is final here: no PCRE fallback for validated input */ + return ret; + } + } +#endif + + if (!res) { + /* PCRE version */ + for (i = 0; i < map->regexps->len; i++) { + re = g_ptr_array_index(map->regexps, i); + + if (rspamd_regexp_search(re, in, len, NULL, NULL, !validated, NULL)) { + val = g_ptr_array_index(map->values, i); + + ret = val->value; + val->hits++; + break; + } + } + } + + return ret; +} +/* Context for the multi-match hyperscan handler: ar collects matched values, map gives access to the values array */ +#ifdef WITH_HYPERSCAN +struct rspamd_multiple_cbdata { + GPtrArray *ar; + struct rspamd_regexp_map_helper *map; +}; +/* Hyperscan match callback for multi-match lookups: records the value of pattern id (ids outside the values array are ignored) and lets the scan continue */ +static int +rspamd_match_hs_multiple_handler(unsigned int id, unsigned long long from, + unsigned long long to, + unsigned int flags, void *context) +{ + struct rspamd_multiple_cbdata *cbd = context; + struct rspamd_map_helper_value *val; + + + if (id < cbd->map->values->len) { + val = g_ptr_array_index(cbd->map->values, id); + val->hits++; + g_ptr_array_add(cbd->ar, val->value); + } + + /* Always return zero as we need all matches here */ + return 0; +} +#endif +/* Returns a new GPtrArray with the values of all matching regexps (the caller frees the array), or NULL when nothing matches; hit counters of matched entries are incremented */ +GPtrArray * +rspamd_match_regexp_map_all(struct rspamd_regexp_map_helper *map, + const gchar *in, gsize len) +{ + guint i; + rspamd_regexp_t *re; + GPtrArray *ret; + gint res = 0; + gboolean validated = FALSE; + struct rspamd_map_helper_value *val; + + if (map == NULL || map->regexps == NULL || len == 0) { + return NULL; + } + + g_assert(in != NULL); + + if (map->map_flags & RSPAMD_REGEXP_MAP_FLAG_UTF) { + if (rspamd_fast_utf8_validate(in, len) == 0) { + validated = TRUE; + } + } + else { + validated = TRUE; + } + + ret = g_ptr_array_new(); + +#ifdef WITH_HYPERSCAN + if (map->hs_db && map->hs_scratch) { + + if (validated) { + struct rspamd_multiple_cbdata cbd; + + cbd.ar = ret; + cbd.map = map; +/* NOTE(review): when hs_scan fails after the handler has already recorded matches, the PCRE loop below runs as well; results may then be duplicated - confirm */ + if (hs_scan(rspamd_hyperscan_get_database(map->hs_db), in, len, + 0, map->hs_scratch, + rspamd_match_hs_multiple_handler, &cbd) == HS_SUCCESS) { + res = 1; + } + } + } +#endif + + if (!res) { + /* PCRE 
version */ + for (i = 0; i < map->regexps->len; i++) { + re = g_ptr_array_index(map->regexps, i); + + if (rspamd_regexp_search(re, in, len, NULL, NULL, + !validated, NULL)) { + val = g_ptr_array_index(map->values, i); + val->hits++; + g_ptr_array_add(ret, val->value); + } + } + } + + if (ret->len > 0) { + return ret; + } +/* Nothing matched: the empty array is freed and NULL is returned instead */ + g_ptr_array_free(ret, TRUE); + + return NULL; +} +/* Looks up the key in/len in a hash map: returns the stored value string and increments its hit counter, or NULL when the key (or the map) is absent */ +gconstpointer +rspamd_match_hash_map(struct rspamd_hash_map_helper *map, const gchar *in, + gsize len) +{ + khiter_t k; + struct rspamd_map_helper_value *val; + rspamd_ftok_t tok; + + if (map == NULL || map->htb == NULL) { + return NULL; + } + + tok.begin = in; + tok.len = len; + + k = kh_get(rspamd_map_hash, map->htb, tok); + + if (k != kh_end(map->htb)) { + val = kh_value(map->htb, k); + val->hits++; + + return val->value; + } + + return NULL; +} +/* Looks up a raw address (in/inlen) in the radix trie: returns the stored value string and increments its hit counter, or NULL when the trie reports RADIX_NO_VALUE */ +gconstpointer +rspamd_match_radix_map(struct rspamd_radix_map_helper *map, + const guchar *in, gsize inlen) +{ + struct rspamd_map_helper_value *val; + + if (map == NULL || map->trie == NULL) { + return NULL; + } + + val = (struct rspamd_map_helper_value *) radix_find_compressed(map->trie, + in, inlen); + + if (val != (gconstpointer) RADIX_NO_VALUE) { + val->hits++; + + return val->value; + } + + return NULL; +} +/* Same as rspamd_match_radix_map, but takes an rspamd_inet_addr_t instead of raw address bytes */ +gconstpointer +rspamd_match_radix_map_addr(struct rspamd_radix_map_helper *map, + const rspamd_inet_addr_t *addr) +{ + struct rspamd_map_helper_value *val; + + if (map == NULL || map->trie == NULL) { + return NULL; + } + + val = (struct rspamd_map_helper_value *) radix_find_compressed_addr(map->trie, addr); + + if (val != (gconstpointer) RADIX_NO_VALUE) { + val->hits++; + + return val->value; + } + + return NULL; +} + + +/* + * CDB stuff + */ +/* Creates an empty cdb map helper; unlike the other helpers it is allocated with g_malloc0, not from a memory pool */ +struct rspamd_cdb_map_helper * +rspamd_map_helper_new_cdb(struct rspamd_map *map) +{ + struct rspamd_cdb_map_helper *n; + + n = g_malloc0(sizeof(*n)); + n->cdbs = (GQueue) G_QUEUE_INIT; + n->map = map; + + rspamd_cryptobox_fast_hash_init(&n->hst, map_hash_seed); + + return n; +} + +void 
rspamd_map_helper_destroy_cdb(struct rspamd_cdb_map_helper *c) +{ + if (c == NULL) { + return; + } + + GList *cur = c->cdbs.head; + + while (cur) { + struct cdb *cdb = (struct cdb *) cur->data; + + cdb_free(cdb); + g_free(cdb->filename); + close(cdb->cdb_fd); + g_free(cdb); + + cur = g_list_next(cur); + } + + g_queue_clear(&c->cdbs); + + g_free(c); +} + +gchar * +rspamd_cdb_list_read(gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final) +{ + struct rspamd_cdb_map_helper *cdb_data; + struct cdb *found = NULL; + struct rspamd_map *map = data->map; + + g_assert(map->no_file_read); + + if (data->cur_data == NULL) { + cdb_data = rspamd_map_helper_new_cdb(data->map); + data->cur_data = cdb_data; + } + else { + cdb_data = (struct rspamd_cdb_map_helper *) data->cur_data; + } + + GList *cur = cdb_data->cdbs.head; + + while (cur) { + struct cdb *elt = (struct cdb *) cur->data; + + if (strcmp(elt->filename, chunk) == 0) { + found = elt; + break; + } + + cur = g_list_next(cur); + } + + if (found == NULL) { + /* New cdb */ + gint fd; + struct cdb *cdb; + + fd = rspamd_file_xopen(chunk, O_RDONLY, 0, TRUE); + + if (fd == -1) { + msg_err_map("cannot open cdb map from %s: %s", chunk, strerror(errno)); + + return NULL; + } + + cdb = g_malloc0(sizeof(struct cdb)); + + if (cdb_init(cdb, fd) == -1) { + g_free(cdb); + msg_err_map("cannot init cdb map from %s: %s", chunk, strerror(errno)); + + return NULL; + } + + cdb->filename = g_strdup(chunk); + g_queue_push_tail(&cdb_data->cdbs, cdb); + cdb_data->total_size += cdb->cdb_fsize; + rspamd_cryptobox_fast_hash_update(&cdb_data->hst, chunk, len); + } + + return chunk + len; +} + +void rspamd_cdb_list_fin(struct map_cb_data *data, void **target) +{ + struct rspamd_map *map = data->map; + struct rspamd_cdb_map_helper *cdb_data; + + if (data->errored) { + /* Clean up the current data and do not touch prev data */ + if (data->cur_data) { + msg_info_map("cleanup unfinished new data as error occurred for %s", + map->name); + 
cdb_data = (struct rspamd_cdb_map_helper *) data->cur_data; + rspamd_map_helper_destroy_cdb(cdb_data); + data->cur_data = NULL; + } + } + else { + if (data->cur_data) { + cdb_data = (struct rspamd_cdb_map_helper *) data->cur_data; + msg_info_map("read cdb of %Hz size", cdb_data->total_size); + data->map->traverse_function = NULL; + data->map->nelts = 0; + data->map->digest = rspamd_cryptobox_fast_hash_final(&cdb_data->hst); + } +/* Publish the new helper (may be NULL when nothing was read) */ + if (target) { + *target = data->cur_data; + } +/* The previous generation is released only after the new one has been published */ + if (data->prev_data) { + cdb_data = (struct rspamd_cdb_map_helper *) data->prev_data; + rspamd_map_helper_destroy_cdb(cdb_data); + } + } +} +void rspamd_cdb_list_dtor(struct map_cb_data *data) +{ + if (data->cur_data) { + rspamd_map_helper_destroy_cdb(data->cur_data); + } +} +/* Looks up in/inlen in every cdb of the map, in queue order, and returns the first hit as a pointer to a function-static rspamd_ftok_t; the result is overwritten by the next successful lookup */ +gconstpointer +rspamd_match_cdb_map(struct rspamd_cdb_map_helper *map, + const gchar *in, gsize inlen) +{ + if (map == NULL || map->cdbs.head == NULL) { + return NULL; + } + + GList *cur = map->cdbs.head; + static rspamd_ftok_t found; + + while (cur) { + struct cdb *cdb = (struct cdb *) cur->data; + + if (cdb_find(cdb, in, inlen) > 0) { + /* Extract and push value to lua as string */ + unsigned vlen; + gconstpointer vpos; + + vpos = cdb->cdb_mem + cdb_datapos(cdb); + vlen = cdb_datalen(cdb); + found.len = vlen; + found.begin = vpos; + + return &found; /* Do not reuse! */ + } + + cur = g_list_next(cur); + } + + return NULL; +} diff --git a/src/libserver/maps/map_helpers.h b/src/libserver/maps/map_helpers.h new file mode 100644 index 0000000..82c62b6 --- /dev/null +++ b/src/libserver/maps/map_helpers.h @@ -0,0 +1,269 @@ +/*- + * Copyright 2018 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_MAP_HELPERS_H +#define RSPAMD_MAP_HELPERS_H + +#include "config.h" +#include "map.h" +#include "addr.h" + +/** + * @file map_helpers.h + * + * Defines helper structures to deal with different map types + */ + + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Common structures, abstract for simplicity + */ +struct rspamd_radix_map_helper; +struct rspamd_hash_map_helper; +struct rspamd_regexp_map_helper; +struct rspamd_cdb_map_helper; +struct rspamd_map_helper_value; +/* Regexp map flags: UTF is set by the map itself once a UTF-enabled expression is inserted; GLOB makes keys be treated as globs; MULTIPLE is passed by the *_read_multiple callbacks */ +enum rspamd_regexp_map_flags { + RSPAMD_REGEXP_MAP_FLAG_UTF = (1u << 0), + RSPAMD_REGEXP_MAP_FLAG_MULTIPLE = (1u << 1), + RSPAMD_REGEXP_MAP_FLAG_GLOB = (1u << 2), +}; +/* Insertion callback used by rspamd_parse_kv_list: st is the map helper, key and value are the parsed strings */ +typedef void (*rspamd_map_insert_func)(gpointer st, gconstpointer key, + gconstpointer value); + +/** + * Radix list is a list like ip/mask + */ +gchar *rspamd_radix_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); + +void rspamd_radix_fin(struct map_cb_data *data, void **target); + +void rspamd_radix_dtor(struct map_cb_data *data); + +/** + * Kv list is an ordinary list of keys and values separated by whitespace + */ +gchar *rspamd_kv_list_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); + +void rspamd_kv_list_fin(struct map_cb_data *data, void **target); + +void rspamd_kv_list_dtor(struct map_cb_data *data); + +/** + * Cdb is a cdb mapped file with shared data + * chunk must be filename! 
+ */ +gchar *rspamd_cdb_list_read( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); +void rspamd_cdb_list_fin(struct map_cb_data *data, void **target); +void rspamd_cdb_list_dtor(struct map_cb_data *data); + +/** + * Regexp list is a list of regular expressions + */ + +gchar *rspamd_regexp_list_read_single( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); + +gchar *rspamd_regexp_list_read_multiple( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); + +gchar *rspamd_glob_list_read_single( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); + +gchar *rspamd_glob_list_read_multiple( + gchar *chunk, + gint len, + struct map_cb_data *data, + gboolean final); + +void rspamd_regexp_list_fin(struct map_cb_data *data, void **target); + +void rspamd_regexp_list_dtor(struct map_cb_data *data); + +/** + * FSM for lists parsing (support comments, blank lines and partial replies) + */ +gchar * +rspamd_parse_kv_list( + gchar *chunk, + gint len, + struct map_cb_data *data, + rspamd_map_insert_func func, + const gchar *default_value, + gboolean final); + +/** + * Find a single (any) matching regexp for the specified text or NULL if + * no matches found + * @param map + * @param in + * @param len + * @return + */ +gconstpointer rspamd_match_regexp_map_single(struct rspamd_regexp_map_helper *map, + const gchar *in, gsize len); + +/** + * Find a multiple (all) matching regexp for the specified text or NULL if + * no matches found. 
Returns GPtrArray that *must* be freed by a caller if not NULL + * @param map + * @param in + * @param len + * @return + */ +GPtrArray *rspamd_match_regexp_map_all(struct rspamd_regexp_map_helper *map, + const gchar *in, gsize len); + +/** + * Find value matching specific key in a hash map + * @param map + * @param in + * @param len + * @return value string or NULL if the key is not found + */ +gconstpointer rspamd_match_hash_map(struct rspamd_hash_map_helper *map, + const gchar *in, gsize len); + +/** + * Find value matching specific key in a cdb map + * @param map + * @param in + * @param len + * @return rspamd_ftok_t pointer (allocated in a static buffer!) + */ +gconstpointer rspamd_match_cdb_map(struct rspamd_cdb_map_helper *map, + const gchar *in, gsize len); + +/** + * Find value matching specific address in a radix map + * @param map + * @param in raw ip address + * @param inlen ip address length (4 for IPv4 and 16 for IPv6) + * @return value string or NULL if the address is not found + */ +gconstpointer rspamd_match_radix_map(struct rspamd_radix_map_helper *map, + const guchar *in, gsize inlen); + +gconstpointer rspamd_match_radix_map_addr(struct rspamd_radix_map_helper *map, + const rspamd_inet_addr_t *addr); + +/** + * Creates radix map helper + * @param map + * @return + */ +struct rspamd_radix_map_helper *rspamd_map_helper_new_radix(struct rspamd_map *map); + +/** + * Inserts new value into radix map + * @param st + * @param key + * @param value + */ +void rspamd_map_helper_insert_radix(gpointer st, gconstpointer key, gconstpointer value); + +/** + * Inserts new value into radix map performing synchronous resolving + * @param st + * @param key + * @param value + */ +void rspamd_map_helper_insert_radix_resolve(gpointer st, gconstpointer key, + gconstpointer value); + +/** + * Destroys radix map helper + * @param r + */ +void rspamd_map_helper_destroy_radix(struct rspamd_radix_map_helper *r); + + +/** + * Creates hash map helper + * @param map + * @return + */ +struct rspamd_hash_map_helper *rspamd_map_helper_new_hash(struct rspamd_map *map); + +/** + 
* Inserts a new value into a hash map + * @param st + * @param key + * @param value + */ +void rspamd_map_helper_insert_hash(gpointer st, gconstpointer key, gconstpointer value); + +/** + * Destroys hash map helper + * @param r + */ +void rspamd_map_helper_destroy_hash(struct rspamd_hash_map_helper *r); + +/** + * Create new regexp map + * @param map + * @param flags + * @return + */ +struct rspamd_regexp_map_helper *rspamd_map_helper_new_regexp(struct rspamd_map *map, + enum rspamd_regexp_map_flags flags); + +/** + * Inserts a new regexp into regexp map + * @param st + * @param key + * @param value + */ +void rspamd_map_helper_insert_re(gpointer st, gconstpointer key, gconstpointer value); + +/** + * Destroy regexp map + * @param re_map + */ +void rspamd_map_helper_destroy_regexp(struct rspamd_regexp_map_helper *re_map); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/maps/map_private.h b/src/libserver/maps/map_private.h new file mode 100644 index 0000000..60751c0 --- /dev/null +++ b/src/libserver/maps/map_private.h @@ -0,0 +1,226 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef SRC_LIBUTIL_MAP_PRIVATE_H_ +#define SRC_LIBUTIL_MAP_PRIVATE_H_ + +#include "config.h" +#include "mem_pool.h" +#include "keypair.h" +#include "unix-std.h" +#include "map.h" +#include "ref.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*rspamd_map_tmp_dtor)(gpointer p); +/* Logging helpers: every msg_*_map macro expands to an expression that uses map->tag, so a variable named map must be in scope at the call site */ +extern guint rspamd_map_log_id; +#define msg_err_map(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "map", map->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_map(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "map", map->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_map(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "map", map->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_map(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_map_log_id, "map", map->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +/* Protocol used by a map backend to fetch its data */ +enum fetch_proto { + MAP_PROTO_FILE, + MAP_PROTO_HTTP, + MAP_PROTO_HTTPS, + MAP_PROTO_STATIC +}; + +/** + * Data specific to file maps + */ +struct file_map_data { + gchar *filename; + gboolean need_modify; + ev_stat st_ev; +}; + + +struct http_map_data; +/* Callback data of the timer attached to cached HTTP map data - presumably used to expire the cache, TODO confirm */ +struct rspamd_http_map_cached_cbdata { + ev_timer timeout; + struct ev_loop *event_loop; + struct rspamd_storage_shmem *shm; + struct rspamd_map *map; + struct http_map_data *data; + guint64 gen; + time_t last_checked; +}; +/* Descriptor of cached map data kept in a named shared memory segment (see shmem_name) */ +struct rspamd_map_cachepoint { + gint available; + gsize len; + time_t last_modified; + gchar shmem_name[256]; +}; + +/** + * Data specific to HTTP maps + */ +struct http_map_data { + /* Shared cache data */ + struct rspamd_map_cachepoint *cache; + /* Non-shared for cache owner, used to cleanup cache */ + struct rspamd_http_map_cached_cbdata *cur_cache_cbd; + gchar *userinfo; + gchar *path; + gchar *host; + gchar *rest; + rspamd_fstring_t *etag; + time_t last_modified; + time_t last_checked; + gboolean request_sent; + guint64 gen; + guint16 port; +}; + +struct static_map_data { + guchar *data; + gsize len; + gboolean 
processed; +}; + +union rspamd_map_backend_data { + struct file_map_data *fd; + struct http_map_data *hd; + struct static_map_data *sd; +}; + + +struct rspamd_map; +struct rspamd_map_backend { + enum fetch_proto protocol; + gboolean is_signed; + gboolean is_compressed; + gboolean is_fallback; + struct rspamd_map *map; + struct ev_loop *event_loop; + guint32 id; + struct rspamd_cryptobox_pubkey *trusted_pubkey; + union rspamd_map_backend_data data; + gchar *uri; + ref_entry_t ref; +}; + +struct map_periodic_cbdata; + +struct rspamd_map { + struct rspamd_dns_resolver *r; + struct rspamd_config *cfg; + GPtrArray *backends; + struct rspamd_map_backend *fallback_backend; + map_cb_t read_callback; + map_fin_cb_t fin_callback; + map_dtor_t dtor; + void **user_data; + struct ev_loop *event_loop; + struct rspamd_worker *wrk; + gchar *description; + gchar *name; + guint32 id; + struct map_periodic_cbdata *scheduled_check; + rspamd_map_tmp_dtor tmp_dtor; + gpointer tmp_dtor_data; + rspamd_map_traverse_function traverse_function; + rspamd_map_on_load_function on_load_function; + gpointer on_load_ud; + GDestroyNotify on_load_ud_dtor; + gpointer lua_map; + gsize nelts; + guint64 digest; + /* Should we check HTTP or just load cached data */ + ev_tstamp timeout; + gdouble poll_timeout; + time_t next_check; + bool active_http; + bool non_trivial; /* E.g. has http backends in active mode */ + bool file_only; /* No HTTP backends found */ + bool static_only; /* No need to check */ + bool no_file_read; /* Do not read files */ + /* Shared lock for temporary disabling of map reading (e.g. 
when this map is written by UI) */ + gint *locked; + gchar tag[MEMPOOL_UID_LEN]; +}; + +enum rspamd_map_http_stage { + http_map_resolve_host2 = 0, /* 2 requests sent */ + http_map_resolve_host1, /* 1 requests sent */ + http_map_http_conn, /* http connection */ + http_map_terminated /* terminated when doing resolving */ +}; + +struct map_periodic_cbdata { + struct rspamd_map *map; + struct map_cb_data cbdata; + ev_timer ev; + gboolean need_modify; + gboolean errored; + gboolean locked; + guint cur_backend; + ref_entry_t ref; +}; + +static const gchar rspamd_http_file_magic[] = + {'r', 'm', 'c', 'd', '2', '0', '0', '0'}; + +struct rspamd_http_file_data { + guchar magic[sizeof(rspamd_http_file_magic)]; + goffset data_off; + gulong mtime; + gulong next_check; + gulong etag_len; +}; + +struct http_callback_data { + struct ev_loop *event_loop; + struct rspamd_http_connection *conn; + GPtrArray *addrs; + rspamd_inet_addr_t *addr; + struct rspamd_map *map; + struct rspamd_map_backend *bk; + struct http_map_data *data; + struct map_periodic_cbdata *periodic; + struct rspamd_cryptobox_pubkey *pk; + struct rspamd_storage_shmem *shmem_data; + gsize data_len; + gboolean check; + enum rspamd_map_http_stage stage; + ev_tstamp timeout; + + ref_entry_t ref; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBUTIL_MAP_PRIVATE_H_ */ diff --git a/src/libserver/mempool_vars_internal.h b/src/libserver/mempool_vars_internal.h new file mode 100644 index 0000000..6c95538 --- /dev/null +++ b/src/libserver/mempool_vars_internal.h @@ -0,0 +1,47 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_MEMPOOL_VARS_INTERNAL_H +#define RSPAMD_MEMPOOL_VARS_INTERNAL_H + +/* Basic rspamd mempool variables names */ +#define RSPAMD_MEMPOOL_AVG_WORDS_LEN "avg_words_len" +#define RSPAMD_MEMPOOL_SHORT_WORDS_CNT "short_words_cnt" +#define RSPAMD_MEMPOOL_HEADERS_HASH "headers_hash" +#define RSPAMD_MEMPOOL_MTA_TAG "MTA-Tag" +#define RSPAMD_MEMPOOL_MTA_NAME "MTA-Name" +#define RSPAMD_MEMPOOL_SPF_DOMAIN "spf_domain" +#define RSPAMD_MEMPOOL_SPF_RECORD "spf_record" +#define RSPAMD_MEMPOOL_PRINCIPAL_RECIPIENT "principal_recipient" +#define RSPAMD_MEMPOOL_PROFILE "profile" +#define RSPAMD_MEMPOOL_MILTER_REPLY "milter_reply" +#define RSPAMD_MEMPOOL_DKIM_SIGNATURE "dkim-signature" +#define RSPAMD_MEMPOOL_DMARC_CHECKS "dmarc_checks" +#define RSPAMD_MEMPOOL_DKIM_BH_CACHE "dkim_bh_cache" +#define RSPAMD_MEMPOOL_DKIM_CHECK_RESULTS "dkim_results" +#define RSPAMD_MEMPOOL_DKIM_SIGN_KEY "dkim_key" +#define RSPAMD_MEMPOOL_DKIM_SIGN_SELECTOR "dkim_selector" +#define RSPAMD_MEMPOOL_ARC_SIGN_KEY "arc_key" +#define RSPAMD_MEMPOOL_ARC_SIGN_SELECTOR "arc_selector" +#define RSPAMD_MEMPOOL_STAT_SIGNATURE "stat_signature" +#define RSPAMD_MEMPOOL_FUZZY_RESULT "fuzzy_hashes" +#define RSPAMD_MEMPOOL_SPAM_LEARNS "spam_learns" +#define RSPAMD_MEMPOOL_HAM_LEARNS "ham_learns" +#define RSPAMD_MEMPOOL_RE_MAPS_CACHE "re_maps_cache" +#define RSPAMD_MEMPOOL_HTTP_STAT_BACKEND_RUNTIME "stat_http_runtime" +#define RSPAMD_MEMPOOL_FUZZY_STAT "fuzzy_stat" + +#endif diff --git a/src/libserver/milter.c b/src/libserver/milter.c new file mode 100644 index 0000000..cfb7d3c --- 
/dev/null +++ b/src/libserver/milter.c @@ -0,0 +1,2232 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "milter.h" +#include "milter_internal.h" +#include "email_addr.h" +#include "addr.h" +#include "unix-std.h" +#include "logger.h" +#include "ottery.h" +#include "libserver/http/http_connection.h" +#include "libserver/http/http_private.h" +#include "libserver/protocol_internal.h" +#include "libserver/cfg_file_private.h" +#include "libmime/scan_result.h" +#include "libserver/worker_util.h" +#include "utlist.h" + +#define msg_err_milter(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "milter", priv->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_milter(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "milter", priv->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_milter(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "milter", priv->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_milter(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_milter_log_id, "milter", priv->pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(milter) + +static const struct rspamd_milter_context *milter_ctx = NULL; + +static gboolean rspamd_milter_handle_session( + struct rspamd_milter_session *session, + struct rspamd_milter_private *priv); +static inline void rspamd_milter_plan_io(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv, gshort what); + +static GQuark +rspamd_milter_quark(void) +{ + return g_quark_from_static_string("milter"); +} + +static void +rspamd_milter_obuf_free(struct rspamd_milter_outbuf *obuf) +{ + if (obuf) { + if (obuf->buf) { + rspamd_fstring_free(obuf->buf); + } + + g_free(obuf); + } +} + +#define RSPAMD_MILTER_RESET_COMMON (1 << 0) +#define RSPAMD_MILTER_RESET_IO (1 << 1) +#define RSPAMD_MILTER_RESET_ADDR (1 << 2) +#define RSPAMD_MILTER_RESET_MACRO (1 << 3) +#define RSPAMD_MILTER_RESET_ALL (RSPAMD_MILTER_RESET_COMMON | \ + RSPAMD_MILTER_RESET_IO | \ + RSPAMD_MILTER_RESET_ADDR | \ + RSPAMD_MILTER_RESET_MACRO) +#define RSPAMD_MILTER_RESET_QUIT_NC (RSPAMD_MILTER_RESET_COMMON | \ + RSPAMD_MILTER_RESET_ADDR | \ + RSPAMD_MILTER_RESET_MACRO) +#define RSPAMD_MILTER_RESET_ABORT (RSPAMD_MILTER_RESET_COMMON) + +static void +rspamd_milter_session_reset(struct rspamd_milter_session *session, + guint how) +{ + struct rspamd_milter_outbuf *obuf, *obuf_tmp; + struct rspamd_milter_private *priv = session->priv; + struct rspamd_email_address *cur; + guint i; + + if (how & RSPAMD_MILTER_RESET_IO) { + msg_debug_milter("cleanup IO on abort"); + + DL_FOREACH_SAFE(priv->out_chain, obuf, obuf_tmp) + { + rspamd_milter_obuf_free(obuf); + } + + priv->out_chain = NULL; + + if (priv->parser.buf) { + priv->parser.buf->len = 0; + } + } + + if (how & RSPAMD_MILTER_RESET_COMMON) { + msg_debug_milter("cleanup common data on abort"); + + if (session->message) { + session->message->len = 0; + msg_debug_milter("cleanup message on 
abort"); + } + + if (session->rcpts) { + PTR_ARRAY_FOREACH(session->rcpts, i, cur) + { + rspamd_email_address_free(cur); + } + + msg_debug_milter("cleanup %d recipients on abort", + (gint) session->rcpts->len); + + g_ptr_array_free(session->rcpts, TRUE); + session->rcpts = NULL; + } + + if (session->from) { + msg_debug_milter("cleanup from"); + rspamd_email_address_free(session->from); + session->from = NULL; + } + + if (priv->headers) { + msg_debug_milter("cleanup headers"); + gchar *k; + GArray *ar; + + kh_foreach(priv->headers, k, ar, { + g_free(k); + g_array_free(ar, TRUE); + }); + + kh_clear(milter_headers_hash_t, priv->headers); + } + + priv->cur_hdr = 0; + } + + if (how & RSPAMD_MILTER_RESET_ADDR) { + if (session->addr) { + msg_debug_milter("cleanup addr"); + rspamd_inet_address_free(session->addr); + session->addr = NULL; + } + if (session->hostname) { + msg_debug_milter("cleanup hostname"); + session->hostname->len = 0; + } + } + + if (how & RSPAMD_MILTER_RESET_MACRO) { + if (session->macros) { + msg_debug_milter("cleanup macros"); + g_hash_table_unref(session->macros); + session->macros = NULL; + } + } +} + +static void +rspamd_milter_session_dtor(struct rspamd_milter_session *session) +{ + struct rspamd_milter_private *priv; + + if (session) { + priv = session->priv; + msg_debug_milter("destroying milter session"); + + rspamd_ev_watcher_stop(priv->event_loop, &priv->ev); + rspamd_milter_session_reset(session, RSPAMD_MILTER_RESET_ALL); + close(priv->fd); + + if (priv->parser.buf) { + rspamd_fstring_free(priv->parser.buf); + } + + if (session->message) { + rspamd_fstring_free(session->message); + } + + if (session->helo) { + rspamd_fstring_free(session->helo); + } + + if (session->hostname) { + rspamd_fstring_free(session->hostname); + } + + if (priv->headers) { + gchar *k; + GArray *ar; + + kh_foreach(priv->headers, k, ar, { + g_free(k); + g_array_free(ar, TRUE); + }); + + kh_destroy(milter_headers_hash_t, priv->headers); + } + + if 
(milter_ctx->sessions_cache) { + rspamd_worker_session_cache_remove(milter_ctx->sessions_cache, + session); + } + + rspamd_mempool_delete(priv->pool); + g_free(priv); + g_free(session); + } +} + +static void +rspamd_milter_on_protocol_error(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv, GError *err) +{ + msg_debug_milter("protocol error: %e", err); + priv->state = RSPAMD_MILTER_WANNA_DIE; + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + + rspamd_milter_plan_io(session, priv, EV_WRITE); +} + +static void +rspamd_milter_on_protocol_ping(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv) +{ + GError *err = NULL; + static const gchar reply[] = "HTTP/1.1 200 OK\r\n" + "Connection: close\r\n" + "Server: rspamd/2.7 (milter mode)\r\n" + "Content-Length: 6\r\n" + "Content-Type: text/plain\r\n" + "\r\n" + "pong\r\n"; + + if (write(priv->fd, reply, sizeof(reply)) == -1) { + gint serrno = errno; + msg_err_milter("cannot write pong reply: %s", strerror(serrno)); + g_set_error(&err, rspamd_milter_quark(), serrno, "ping command IO error: %s", + strerror(serrno)); + priv->state = RSPAMD_MILTER_WANNA_DIE; + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + } + else { + priv->state = RSPAMD_MILTER_PONG_AND_DIE; + rspamd_milter_plan_io(session, priv, EV_WRITE); + } +} + +static gint +rspamd_milter_http_on_url(http_parser *parser, const gchar *at, size_t length) +{ + GString *url = (GString *) parser->data; + + g_string_append_len(url, at, length); + + return 0; +} + +static void +rspamd_milter_io_handler(gint fd, gshort what, void *ud) +{ + struct rspamd_milter_session *session = ud; + struct rspamd_milter_private *priv; + GError *err; + + priv = session->priv; + + if (what == EV_TIMEOUT) { + msg_debug_milter("connection timed out"); + err = g_error_new(rspamd_milter_quark(), ETIMEDOUT, 
"connection " + "timed out"); + rspamd_milter_on_protocol_error(session, priv, err); + } + else { + rspamd_milter_handle_session(session, priv); + } +} + +static inline void +rspamd_milter_plan_io(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv, gshort what) +{ + rspamd_ev_watcher_reschedule(priv->event_loop, &priv->ev, what); +} + + +#define READ_INT_32(pos, var) \ + do { \ + memcpy(&(var), (pos), sizeof(var)); \ + (pos) += sizeof(var); \ + (var) = ntohl(var); \ + } while (0) +#define READ_INT_16(pos, var) \ + do { \ + memcpy(&(var), (pos), sizeof(var)); \ + (pos) += sizeof(var); \ + (var) = ntohs(var); \ + } while (0) + +static gboolean +rspamd_milter_process_command(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv) +{ + GError *err; + rspamd_fstring_t *buf; + const guchar *pos, *end, *zero; + guint cmdlen; + guint32 version, actions, protocol; + + buf = priv->parser.buf; + pos = buf->str + priv->parser.cmd_start; + cmdlen = priv->parser.datalen; + end = pos + cmdlen; + + switch (priv->parser.cur_cmd) { + case RSPAMD_MILTER_CMD_ABORT: + msg_debug_milter("got abort command"); + rspamd_milter_session_reset(session, RSPAMD_MILTER_RESET_ABORT); + break; + case RSPAMD_MILTER_CMD_BODY: + if (!session->message) { + session->message = rspamd_fstring_sized_new( + RSPAMD_MILTER_MESSAGE_CHUNK); + } + + msg_debug_milter("got body chunk: %d bytes", (int) cmdlen); + session->message = rspamd_fstring_append(session->message, + pos, cmdlen); + break; + case RSPAMD_MILTER_CMD_CONNECT: + msg_debug_milter("got connect command"); + + /* + * char hostname[]: Hostname, NUL terminated + * char family: Protocol family + * uint16 port: Port number (SMFIA_INET or SMFIA_INET6 only) + * char address[]: IP address (ASCII) or unix socket path, NUL terminated + */ + zero = memchr(pos, '\0', cmdlen); + + if (zero == NULL || zero > (end - sizeof(guint16) + 1)) { + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "connect command 
(no name)"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + else { + guchar proto; + guint16 port; + gchar ip6_str[INET6_ADDRSTRLEN + 3]; + gsize r; + + /* + * Important notice: Postfix do NOT use this command to pass + * client's info (e.g. hostname is not really here) + * Sendmail will pass it here + */ + if (session->hostname == NULL) { + session->hostname = rspamd_fstring_new_init(pos, zero - pos); + msg_debug_milter("got hostname on connect phase: %V", + session->hostname); + } + else { + session->hostname = rspamd_fstring_assign(session->hostname, + pos, zero - pos); + msg_debug_milter("rewrote hostname on connect phase: %V", + session->hostname); + } + + pos = zero + 1; + proto = *pos++; + + if (proto == RSPAMD_MILTER_CONN_UNKNOWN) { + /* We have no information about host */ + msg_debug_milter("unknown connect address"); + } + else { + READ_INT_16(pos, port); + + if (pos >= end) { + /* No IP somehow */ + msg_debug_milter("unknown connect IP/socket"); + } + else { + zero = memchr(pos, '\0', end - pos); + + if (zero == NULL) { + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "connect command (no zero terminated IP)"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + + switch (proto) { + case RSPAMD_MILTER_CONN_UNIX: + session->addr = rspamd_inet_address_new(AF_UNIX, + pos); + break; + + case RSPAMD_MILTER_CONN_INET: + session->addr = rspamd_inet_address_new(AF_INET, NULL); + + if (!rspamd_parse_inet_address_ip(pos, zero - pos, + session->addr)) { + err = g_error_new(rspamd_milter_quark(), EINVAL, + "invalid connect command (bad IPv4)"); + rspamd_milter_on_protocol_error(session, priv, + err); + + return FALSE; + } + + rspamd_inet_address_set_port(session->addr, port); + break; + + case RSPAMD_MILTER_CONN_INET6: + session->addr = rspamd_inet_address_new(AF_INET6, NULL); + + if (zero - pos > sizeof("IPv6:") && + rspamd_lc_cmp(pos, "IPv6:", + sizeof("IPv6:") - 1) == 0) { + /* Kill 
sendmail please */ + pos += sizeof("IPv6:") - 1; + + if (*pos != '[') { + /* Add explicit braces */ + r = rspamd_snprintf(ip6_str, sizeof(ip6_str), + "[%*s]", (int) (zero - pos), pos); + } + else { + r = rspamd_strlcpy(ip6_str, pos, sizeof(ip6_str)); + } + } + else { + r = rspamd_strlcpy(ip6_str, pos, sizeof(ip6_str)); + } + + if (!rspamd_parse_inet_address_ip(ip6_str, r, + session->addr)) { + err = g_error_new(rspamd_milter_quark(), EINVAL, + "invalid connect command (bad IPv6)"); + rspamd_milter_on_protocol_error(session, priv, + err); + + return FALSE; + } + + rspamd_inet_address_set_port(session->addr, port); + break; + + default: + err = g_error_new(rspamd_milter_quark(), EINVAL, + "invalid connect command (bad protocol: %c)", + proto); + rspamd_milter_on_protocol_error(session, priv, + err); + + return FALSE; + } + } + } + + msg_info_milter("got connection from %s", + rspamd_inet_address_to_string_pretty(session->addr)); + } + break; + case RSPAMD_MILTER_CMD_MACRO: + msg_debug_milter("got macro command"); + /* + * Format is + * 1 byte - command associated (we don't care about it) + * 0-terminated name + * 0-terminated value + * ... 
+ */ + if (session->macros == NULL) { + session->macros = g_hash_table_new_full(rspamd_ftok_icase_hash, + rspamd_ftok_icase_equal, + rspamd_fstring_mapped_ftok_free, + rspamd_fstring_mapped_ftok_free); + } + + /* Ignore one byte */ + pos++; + + while (pos < end) { + zero = memchr(pos, '\0', cmdlen); + + if (zero == NULL || zero >= end) { + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "macro command (no name)"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + else { + rspamd_fstring_t *name, *value; + rspamd_ftok_t *name_tok, *value_tok; + const guchar *zero_val; + + zero_val = memchr(zero + 1, '\0', end - zero - 1); + + if (zero_val != NULL && end > zero_val) { + name = rspamd_fstring_new_init(pos, zero - pos); + value = rspamd_fstring_new_init(zero + 1, + zero_val - zero - 1); + name_tok = rspamd_ftok_map(name); + value_tok = rspamd_ftok_map(value); + + g_hash_table_replace(session->macros, name_tok, value_tok); + msg_debug_milter("got macro: %T -> %T", + name_tok, value_tok); + + cmdlen -= zero_val - pos; + pos = zero_val + 1; + } + else { + err = g_error_new(rspamd_milter_quark(), EINVAL, + "invalid macro command (bad value)"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + } + } + break; + case RSPAMD_MILTER_CMD_BODYEOB: + msg_debug_milter("got eob command"); + REF_RETAIN(session); + priv->fin_cb(priv->fd, session, priv->ud); + REF_RELEASE(session); + break; + case RSPAMD_MILTER_CMD_HELO: + msg_debug_milter("got helo command"); + + if (end > pos && *(end - 1) == '\0') { + if (session->helo == NULL) { + session->helo = rspamd_fstring_new_init(pos, cmdlen - 1); + } + else { + session->helo = rspamd_fstring_assign(session->helo, + pos, cmdlen - 1); + } + } + else if (end > pos) { + /* Should not happen */ + if (session->helo == NULL) { + session->helo = rspamd_fstring_new_init(pos, cmdlen); + } + else { + session->helo = rspamd_fstring_assign(session->helo, + pos, cmdlen); + } + } + + 
msg_debug_milter("got helo value: %V", session->helo); + + break; + case RSPAMD_MILTER_CMD_QUIT_NC: + /* We need to reset session and start over */ + msg_debug_milter("got quit_nc command"); + rspamd_milter_session_reset(session, RSPAMD_MILTER_RESET_QUIT_NC); + break; + case RSPAMD_MILTER_CMD_HEADER: + msg_debug_milter("got header command"); + if (!session->message) { + session->message = rspamd_fstring_sized_new( + RSPAMD_MILTER_MESSAGE_CHUNK); + } + zero = memchr(pos, '\0', cmdlen); + + if (zero == NULL) { + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "header command (no name)"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + else { + if (end > zero && *(end - 1) == '\0') { + khiter_t k; + gint res; + + k = kh_get(milter_headers_hash_t, priv->headers, (gchar *) pos); + + if (k == kh_end(priv->headers)) { + GArray *ar; + + k = kh_put(milter_headers_hash_t, priv->headers, + g_strdup(pos), &res); + ar = g_array_new(FALSE, FALSE, sizeof(gint)); + g_array_append_val(ar, priv->cur_hdr); + kh_value(priv->headers, k) = ar; + } + else { + g_array_append_val(kh_value(priv->headers, k), + priv->cur_hdr); + } + + rspamd_printf_fstring(&session->message, "%*s: %*s\r\n", + (int) (zero - pos), pos, + (int) (end - zero - 2), zero + 1); + priv->cur_hdr++; + } + else { + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "header command (bad value)"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + } + break; + case RSPAMD_MILTER_CMD_MAIL: + msg_debug_milter("mail command"); + + while (pos < end) { + struct rspamd_email_address *addr; + gchar *cpy; + + zero = memchr(pos, '\0', end - pos); + + if (zero && zero > pos) { + cpy = rspamd_mempool_alloc(priv->pool, zero - pos); + memcpy(cpy, pos, zero - pos); + msg_debug_milter("got mail: %*s", (int) (zero - pos), cpy); + addr = rspamd_email_address_from_smtp(cpy, zero - pos); + + if (addr) { + session->from = addr; + } + + /* TODO: parse esmtp 
arguments */ + break; + } + else { + msg_debug_milter("got weird from: %*s", (int) (end - pos), + pos); + /* That actually should not happen */ + cpy = rspamd_mempool_alloc(priv->pool, end - pos); + memcpy(cpy, pos, end - pos); + addr = rspamd_email_address_from_smtp(cpy, end - pos); + + if (addr) { + session->from = addr; + } + + break; + } + } + break; + case RSPAMD_MILTER_CMD_EOH: + msg_debug_milter("got eoh command"); + + if (!session->message) { + session->message = rspamd_fstring_sized_new( + RSPAMD_MILTER_MESSAGE_CHUNK); + } + + session->message = rspamd_fstring_append(session->message, + "\r\n", 2); + break; + case RSPAMD_MILTER_CMD_OPTNEG: + if (cmdlen != sizeof(guint32) * 3) { + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "optneg command"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + + READ_INT_32(pos, version); + READ_INT_32(pos, actions); + READ_INT_32(pos, protocol); + + msg_debug_milter("optneg: version: %d, actions: %d, protocol: %d", + version, actions, protocol); + + if (version < RSPAMD_MILTER_PROTO_VER) { + msg_warn_milter("MTA specifies too old protocol: %d, " + "aborting connection", + version); + + err = g_error_new(rspamd_milter_quark(), EINVAL, "invalid " + "protocol version: %d", + version); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + + version = RSPAMD_MILTER_PROTO_VER; + actions |= RSPAMD_MILTER_ACTIONS_MASK; + protocol = RSPAMD_MILTER_FLAG_NOREPLY_MASK; + + return rspamd_milter_send_action(session, RSPAMD_MILTER_OPTNEG, + version, actions, protocol); + break; + case RSPAMD_MILTER_CMD_QUIT: + if (priv->out_chain) { + msg_debug_milter("quit command, refcount: %d, " + "some output buffers left - draining", + session->ref.refcount); + + priv->state = RSPAMD_MILTER_WRITE_AND_DIE; + } + else { + msg_debug_milter("quit command, refcount: %d", + session->ref.refcount); + + priv->state = RSPAMD_MILTER_WANNA_DIE; + REF_RETAIN(session); + priv->fin_cb(priv->fd, 
session, priv->ud); + REF_RELEASE(session); + return FALSE; + } + break; + case RSPAMD_MILTER_CMD_RCPT: + msg_debug_milter("rcpt command"); + + while (pos < end) { + struct rspamd_email_address *addr; + gchar *cpy; + + zero = memchr(pos, '\0', end - pos); + + if (zero && zero > pos) { + cpy = rspamd_mempool_alloc(priv->pool, end - pos); + memcpy(cpy, pos, end - pos); + + msg_debug_milter("got rcpt: %*s", (int) (zero - pos), cpy); + addr = rspamd_email_address_from_smtp(cpy, zero - pos); + + if (addr) { + if (!session->rcpts) { + session->rcpts = g_ptr_array_sized_new(1); + } + + g_ptr_array_add(session->rcpts, addr); + } + + pos = zero + 1; + } + else { + cpy = rspamd_mempool_alloc(priv->pool, end - pos); + memcpy(cpy, pos, end - pos); + + msg_debug_milter("got weird rcpt: %*s", (int) (end - pos), + pos); + /* That actually should not happen */ + addr = rspamd_email_address_from_smtp(cpy, end - pos); + + if (addr) { + if (!session->rcpts) { + session->rcpts = g_ptr_array_sized_new(1); + } + + g_ptr_array_add(session->rcpts, addr); + } + + break; + } + } + break; + case RSPAMD_MILTER_CMD_DATA: + if (!session->message) { + session->message = rspamd_fstring_sized_new( + RSPAMD_MILTER_MESSAGE_CHUNK); + } + msg_debug_milter("got data command"); + /* We do not need reply as specified */ + break; + default: + msg_debug_milter("got bad command: %c", priv->parser.cur_cmd); + break; + } + + return TRUE; +} + +static gboolean +rspamd_milter_is_valid_cmd(guchar c) +{ + switch (c) { + case RSPAMD_MILTER_CMD_ABORT: + case RSPAMD_MILTER_CMD_BODY: + case RSPAMD_MILTER_CMD_CONNECT: + case RSPAMD_MILTER_CMD_MACRO: + case RSPAMD_MILTER_CMD_BODYEOB: + case RSPAMD_MILTER_CMD_HELO: + case RSPAMD_MILTER_CMD_QUIT_NC: + case RSPAMD_MILTER_CMD_HEADER: + case RSPAMD_MILTER_CMD_MAIL: + case RSPAMD_MILTER_CMD_EOH: + case RSPAMD_MILTER_CMD_OPTNEG: + case RSPAMD_MILTER_CMD_QUIT: + case RSPAMD_MILTER_CMD_RCPT: + case RSPAMD_MILTER_CMD_DATA: + case RSPAMD_MILTER_CMD_UNKNOWN: + return TRUE; + 
default: + break; + } + + return FALSE; +} + +static gboolean +rspamd_milter_consume_input(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv) +{ + const guchar *p, *end; + GError *err; + + p = priv->parser.buf->str + priv->parser.pos; + end = priv->parser.buf->str + priv->parser.buf->len; + + while (p < end) { + msg_debug_milter("offset: %d, state: %d", + (gint) (p - (const guchar *) priv->parser.buf->str), + priv->parser.state); + + switch (priv->parser.state) { + case st_len_1: + /* The first length byte in big endian order */ + priv->parser.datalen = 0; + priv->parser.datalen |= ((gsize) *p) << 24; + priv->parser.state = st_len_2; + p++; + break; + case st_len_2: + /* The second length byte in big endian order */ + priv->parser.datalen |= ((gsize) *p) << 16; + priv->parser.state = st_len_3; + p++; + break; + case st_len_3: + /* The third length byte in big endian order */ + priv->parser.datalen |= ((gsize) *p) << 8; + priv->parser.state = st_len_4; + p++; + break; + case st_len_4: + /* The fourth length byte in big endian order */ + priv->parser.datalen |= ((gsize) *p); + priv->parser.state = st_read_cmd; + p++; + break; + case st_read_cmd: + priv->parser.cur_cmd = *p; + priv->parser.state = st_read_data; + + if (priv->parser.datalen < 1) { + err = g_error_new(rspamd_milter_quark(), EINVAL, + "Command length is too short"); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + else { + /* Eat command itself */ + priv->parser.datalen--; + } + + p++; + priv->parser.cmd_start = p - (const guchar *) priv->parser.buf->str; + break; + case st_read_data: + /* We might need some more data in buffer for further steps */ + if (priv->parser.datalen > + RSPAMD_MILTER_MESSAGE_CHUNK * 2) { + /* Check if we have HTTP input instead of milter */ + if (priv->parser.buf->len > sizeof("GET") && + memcmp(priv->parser.buf->str, "GET", 3) == 0) { + struct http_parser http_parser; + struct http_parser_settings http_callbacks; + GString 
*url = g_string_new(NULL); + + /* Hack, hack, hack */ + /* + * This code is assumed to read `/ping` command and + * handle it to monitor port's availability since + * milter protocol is stupid and does not allow to do that + * This code also assumes that HTTP request can be read + * as as single data chunk which is not true in some cases + * In general, don't use it for anything but ping checks + */ + memset(&http_callbacks, 0, sizeof(http_callbacks)); + http_parser.data = url; + http_parser_init(&http_parser, HTTP_REQUEST); + http_callbacks.on_url = rspamd_milter_http_on_url; + http_parser_execute(&http_parser, &http_callbacks, + priv->parser.buf->str, priv->parser.buf->len); + + if (url->len == sizeof("/ping") - 1 && + rspamd_lc_cmp(url->str, "/ping", url->len) == 0) { + rspamd_milter_on_protocol_ping(session, priv); + g_string_free(url, TRUE); + + return TRUE; + } + else { + err = g_error_new(rspamd_milter_quark(), EINVAL, + "HTTP GET request is not supported in milter mode, url: %s", + url->str); + } + + g_string_free(url, TRUE); + } + else if (priv->parser.buf->len > sizeof("POST") && + memcmp(priv->parser.buf->str, "POST", 4) == 0) { + err = g_error_new(rspamd_milter_quark(), EINVAL, + "HTTP POST request is not supported in milter mode"); + } + else { + err = g_error_new(rspamd_milter_quark(), E2BIG, + "Command length is too big: %zd", + priv->parser.datalen); + } + + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + if (!rspamd_milter_is_valid_cmd(priv->parser.cur_cmd)) { + err = g_error_new(rspamd_milter_quark(), E2BIG, + "Unvalid command: %c", + priv->parser.cur_cmd); + rspamd_milter_on_protocol_error(session, priv, err); + + return FALSE; + } + if (priv->parser.buf->allocated < priv->parser.datalen) { + priv->parser.pos = p - (const guchar *) priv->parser.buf->str; + priv->parser.buf = rspamd_fstring_grow(priv->parser.buf, + priv->parser.buf->len + priv->parser.datalen); + /* This can realloc buffer */ + 
rspamd_milter_plan_io(session, priv, EV_READ); + goto end; + } + else { + /* We may have the full command available */ + if (p + priv->parser.datalen <= end) { + /* We can process command */ + if (!rspamd_milter_process_command(session, priv)) { + return FALSE; + } + + p += priv->parser.datalen; + priv->parser.state = st_len_1; + priv->parser.cur_cmd = '\0'; + priv->parser.cmd_start = 0; + } + else { + /* Need to read more */ + priv->parser.pos = p - (const guchar *) priv->parser.buf->str; + rspamd_milter_plan_io(session, priv, EV_READ); + goto end; + } + } + break; + } + } + + /* Leftover */ + switch (priv->parser.state) { + case st_read_data: + if (p + priv->parser.datalen <= end) { + if (!rspamd_milter_process_command(session, priv)) { + return FALSE; + } + + priv->parser.state = st_len_1; + priv->parser.cur_cmd = '\0'; + priv->parser.cmd_start = 0; + } + break; + default: + /* No need to do anything */ + break; + } + + if (p == end) { + priv->parser.buf->len = 0; + priv->parser.pos = 0; + priv->parser.cmd_start = 0; + } + + if (priv->out_chain) { + rspamd_milter_plan_io(session, priv, EV_READ | EV_WRITE); + } + else { + rspamd_milter_plan_io(session, priv, EV_READ); + } +end: + + return TRUE; +} + +static gboolean +rspamd_milter_handle_session(struct rspamd_milter_session *session, + struct rspamd_milter_private *priv) +{ + struct rspamd_milter_outbuf *obuf, *obuf_tmp; + gssize r, to_write; + GError *err; + + g_assert(session != NULL); + + switch (priv->state) { + case RSPAMD_MILTER_READ_MORE: + if (priv->parser.buf->len >= priv->parser.buf->allocated) { + priv->parser.buf = rspamd_fstring_grow(priv->parser.buf, + priv->parser.buf->len * 2); + } + + r = read(priv->fd, priv->parser.buf->str + priv->parser.buf->len, + priv->parser.buf->allocated - priv->parser.buf->len); + + msg_debug_milter("read %z bytes, %z remain, %z allocated", + r, priv->parser.buf->len, priv->parser.buf->allocated); + + if (r == -1) { + if (errno == EAGAIN || errno == EINTR) { + 
rspamd_milter_plan_io(session, priv, EV_READ); + + return TRUE; + } + else { + /* Fatal IO error */ + err = g_error_new(rspamd_milter_quark(), errno, + "IO read error: %s", strerror(errno)); + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + + REF_RELEASE(session); + + return FALSE; + } + } + else if (r == 0) { + err = g_error_new(rspamd_milter_quark(), ECONNRESET, + "Unexpected EOF"); + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + + REF_RELEASE(session); + + return FALSE; + } + else { + priv->parser.buf->len += r; + + return rspamd_milter_consume_input(session, priv); + } + + break; + case RSPAMD_MILTER_WRITE_REPLY: + case RSPAMD_MILTER_WRITE_AND_DIE: + if (priv->out_chain == NULL) { + if (priv->state == RSPAMD_MILTER_WRITE_AND_DIE) { + /* Finished writing, let's die finally */ + msg_debug_milter("output drained, terminating, refcount: %d", + session->ref.refcount); + + /* Session should be destroyed by fin_cb... 
*/ + REF_RETAIN(session); + priv->fin_cb(priv->fd, session, priv->ud); + REF_RELEASE(session); + + return FALSE; + } + else { + /* We have written everything, so we can read something */ + priv->state = RSPAMD_MILTER_READ_MORE; + rspamd_milter_plan_io(session, priv, EV_READ); + } + } + else { + DL_FOREACH_SAFE(priv->out_chain, obuf, obuf_tmp) + { + to_write = obuf->buf->len - obuf->pos; + + g_assert(to_write > 0); + + r = write(priv->fd, obuf->buf->str + obuf->pos, to_write); + + if (r == -1) { + if (errno == EAGAIN || errno == EINTR) { + rspamd_milter_plan_io(session, priv, EV_WRITE); + } + else { + /* Fatal IO error */ + err = g_error_new(rspamd_milter_quark(), errno, + "IO write error: %s", strerror(errno)); + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + + REF_RELEASE(session); + + return FALSE; + } + } + else if (r == 0) { + err = g_error_new(rspamd_milter_quark(), ECONNRESET, + "Unexpected EOF"); + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + + REF_RELEASE(session); + + return FALSE; + } + else { + if (r == to_write) { + /* We have done with this buf */ + DL_DELETE(priv->out_chain, obuf); + rspamd_milter_obuf_free(obuf); + } + else { + /* We need to plan another write */ + obuf->pos += r; + rspamd_milter_plan_io(session, priv, EV_WRITE); + + return TRUE; + } + } + } + + /* Here we have written everything, so we can plan reading */ + priv->state = RSPAMD_MILTER_READ_MORE; + rspamd_milter_plan_io(session, priv, EV_READ); + } + break; + case RSPAMD_MILTER_WANNA_DIE: + /* We are here after processing everything, so release session */ + REF_RELEASE(session); + return FALSE; + break; + case RSPAMD_MILTER_PONG_AND_DIE: + err = g_error_new(rspamd_milter_quark(), 0, + "ping command"); + REF_RETAIN(session); + priv->err_cb(priv->fd, session, priv->ud, err); + REF_RELEASE(session); + g_error_free(err); + REF_RELEASE(session); + 
return FALSE; + break; + } + + return TRUE; +} + + +gboolean +rspamd_milter_handle_socket(gint fd, ev_tstamp timeout, + rspamd_mempool_t *pool, + struct ev_loop *ev_base, rspamd_milter_finish finish_cb, + rspamd_milter_error error_cb, void *ud) +{ + struct rspamd_milter_session *session; + struct rspamd_milter_private *priv; + gint nfd = dup(fd); + + if (nfd == -1) { + GError *err = g_error_new(rspamd_milter_quark(), errno, + "dup failed: %s", strerror(errno)); + error_cb(fd, NULL, ud, err); + + return FALSE; + } + + g_assert(finish_cb != NULL); + g_assert(error_cb != NULL); + g_assert(milter_ctx != NULL); + + session = g_malloc0(sizeof(*session)); + priv = g_malloc0(sizeof(*priv)); + priv->fd = nfd; + priv->ud = ud; + priv->fin_cb = finish_cb; + priv->err_cb = error_cb; + priv->parser.state = st_len_1; + priv->parser.buf = rspamd_fstring_sized_new(RSPAMD_MILTER_MESSAGE_CHUNK + 5); + priv->event_loop = ev_base; + priv->state = RSPAMD_MILTER_READ_MORE; + priv->pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "milter", 0); + priv->discard_on_reject = milter_ctx->discard_on_reject; + priv->quarantine_on_reject = milter_ctx->quarantine_on_reject; + priv->ev.timeout = timeout; + + rspamd_ev_watcher_init(&priv->ev, priv->fd, EV_READ | EV_WRITE, + rspamd_milter_io_handler, session); + + if (pool) { + /* Copy tag */ + memcpy(priv->pool->tag.uid, pool->tag.uid, sizeof(pool->tag.uid)); + } + + priv->headers = kh_init(milter_headers_hash_t); + kh_resize(milter_headers_hash_t, priv->headers, 32); + + session->priv = priv; + REF_INIT_RETAIN(session, rspamd_milter_session_dtor); + + if (milter_ctx->sessions_cache) { + rspamd_worker_session_cache_add(milter_ctx->sessions_cache, + priv->pool->tag.uid, &session->ref.refcount, session); + } + + return rspamd_milter_handle_session(session, priv); +} + +gboolean +rspamd_milter_set_reply(struct rspamd_milter_session *session, + rspamd_fstring_t *rcode, + rspamd_fstring_t *xcode, + rspamd_fstring_t *reply) +{ + GString *buf; + 
gboolean ret; + + buf = g_string_sized_new(xcode->len + rcode->len + reply->len + 2); + rspamd_printf_gstring(buf, "%V %V %V", rcode, xcode, reply); + ret = rspamd_milter_send_action(session, RSPAMD_MILTER_REPLYCODE, + buf); + g_string_free(buf, TRUE); + + return ret; +} + +#define SET_COMMAND(cmd, sz, reply, pos) \ + do { \ + guint32 _len; \ + _len = (sz) + 1; \ + (reply) = rspamd_fstring_sized_new(sizeof(_len) + _len); \ + (reply)->len = sizeof(_len) + _len; \ + _len = htonl(_len); \ + memcpy((reply)->str, &_len, sizeof(_len)); \ + (reply)->str[sizeof(_len)] = (cmd); \ + (pos) = (guchar *) (reply)->str + sizeof(_len) + 1; \ + } while (0) + +gboolean +rspamd_milter_send_action(struct rspamd_milter_session *session, + enum rspamd_milter_reply act, ...) +{ + guint32 ver, actions, protocol, idx; + va_list ap; + guchar cmd, *pos; + rspamd_fstring_t *reply = NULL; + gsize len; + GString *name, *value; + const char *reason, *body_str; + struct rspamd_milter_outbuf *obuf; + struct rspamd_milter_private *priv = session->priv; + + va_start(ap, act); + cmd = act; + + switch (act) { + case RSPAMD_MILTER_ACCEPT: + case RSPAMD_MILTER_CONTINUE: + case RSPAMD_MILTER_DISCARD: + case RSPAMD_MILTER_PROGRESS: + case RSPAMD_MILTER_REJECT: + case RSPAMD_MILTER_TEMPFAIL: + /* No additional arguments */ + msg_debug_milter("send %c command", cmd); + SET_COMMAND(cmd, 0, reply, pos); + break; + case RSPAMD_MILTER_QUARANTINE: + reason = va_arg(ap, const char *); + + if (reason == NULL) { + reason = ""; + } + + len = strlen(reason); + msg_debug_milter("send quarantine action %s", reason); + SET_COMMAND(cmd, len + 1, reply, pos); + memcpy(pos, reason, len + 1); + break; + case RSPAMD_MILTER_ADDHEADER: + name = va_arg(ap, GString *); + value = va_arg(ap, GString *); + + /* Name and value must be zero terminated */ + msg_debug_milter("add header command - \"%v\"=\"%v\"", name, value); + SET_COMMAND(cmd, name->len + value->len + 2, reply, pos); + memcpy(pos, name->str, name->len + 1); + pos += 
name->len + 1; + memcpy(pos, value->str, value->len + 1); + break; + case RSPAMD_MILTER_CHGHEADER: + case RSPAMD_MILTER_INSHEADER: + idx = va_arg(ap, guint32); + name = va_arg(ap, GString *); + value = va_arg(ap, GString *); + + msg_debug_milter("change/insert header command pos = %d- \"%v\"=\"%v\"", + idx, name, value); + /* Name and value must be zero terminated */ + SET_COMMAND(cmd, name->len + value->len + 2 + sizeof(guint32), + reply, pos); + idx = htonl(idx); + memcpy(pos, &idx, sizeof(idx)); + pos += sizeof(idx); + memcpy(pos, name->str, name->len + 1); + pos += name->len + 1; + memcpy(pos, value->str, value->len + 1); + break; + case RSPAMD_MILTER_REPLBODY: + len = va_arg(ap, gsize); + body_str = va_arg(ap, const char *); + msg_debug_milter("want to change body; size = %uz", + len); + SET_COMMAND(cmd, len, reply, pos); + memcpy(pos, body_str, len); + break; + case RSPAMD_MILTER_REPLYCODE: + case RSPAMD_MILTER_ADDRCPT: + case RSPAMD_MILTER_DELRCPT: + case RSPAMD_MILTER_CHGFROM: + /* Single GString * argument */ + value = va_arg(ap, GString *); + msg_debug_milter("command %c; value=%v", cmd, value); + SET_COMMAND(cmd, value->len + 1, reply, pos); + memcpy(pos, value->str, value->len + 1); + break; + case RSPAMD_MILTER_OPTNEG: + ver = va_arg(ap, guint32); + actions = va_arg(ap, guint32); + protocol = va_arg(ap, guint32); + + msg_debug_milter("optneg reply: ver=%d, actions=%d, protocol=%d", + ver, actions, protocol); + ver = htonl(ver); + actions = htonl(actions); + protocol = htonl(protocol); + SET_COMMAND(cmd, sizeof(guint32) * 3, reply, pos); + memcpy(pos, &ver, sizeof(ver)); + pos += sizeof(ver); + memcpy(pos, &actions, sizeof(actions)); + pos += sizeof(actions); + memcpy(pos, &protocol, sizeof(protocol)); + break; + default: + msg_err_milter("invalid command: %c", cmd); + break; + } + + va_end(ap); + + if (reply) { + obuf = g_malloc(sizeof(*obuf)); + obuf->buf = reply; + obuf->pos = 0; + DL_APPEND(priv->out_chain, obuf); + priv->state = 
RSPAMD_MILTER_WRITE_REPLY; + rspamd_milter_plan_io(session, priv, EV_WRITE); + + return TRUE; + } + + return FALSE; +} + +gboolean +rspamd_milter_add_header(struct rspamd_milter_session *session, + GString *name, GString *value) +{ + return rspamd_milter_send_action(session, RSPAMD_MILTER_ADDHEADER, + name, value); +} + +gboolean +rspamd_milter_del_header(struct rspamd_milter_session *session, + GString *name) +{ + GString value; + guint32 idx = 1; + + value.str = (gchar *) ""; + value.len = 0; + + return rspamd_milter_send_action(session, RSPAMD_MILTER_CHGHEADER, + idx, name, &value); +} + +void rspamd_milter_session_unref(struct rspamd_milter_session *session) +{ + REF_RELEASE(session); +} + +struct rspamd_milter_session * +rspamd_milter_session_ref(struct rspamd_milter_session *session) +{ + REF_RETAIN(session); + + return session; +} + +#define IF_MACRO(lit) \ + RSPAMD_FTOK_ASSIGN(&srch, (lit)); \ + found = g_hash_table_lookup(session->macros, &srch); \ + if (found) + +static void +rspamd_milter_macro_http(struct rspamd_milter_session *session, + struct rspamd_http_message *msg) +{ + rspamd_ftok_t *found, srch; + struct rspamd_milter_private *priv = session->priv; + + /* + * We assume postfix macros here, sendmail ones might be slightly + * different + */ + + if (!session->macros) { + return; + } + + IF_MACRO("{i}") + { + rspamd_http_message_add_header_len(msg, QUEUE_ID_HEADER, + found->begin, found->len); + } + else + { + IF_MACRO("i") + { + rspamd_http_message_add_header_len(msg, QUEUE_ID_HEADER, + found->begin, found->len); + } + } + + IF_MACRO("{v}") + { + rspamd_http_message_add_header_len(msg, USER_AGENT_HEADER, + found->begin, found->len); + } + else + { + IF_MACRO("v") + { + rspamd_http_message_add_header_len(msg, USER_AGENT_HEADER, + found->begin, found->len); + } + } + + IF_MACRO("{cipher}") + { + rspamd_http_message_add_header_len(msg, TLS_CIPHER_HEADER, + found->begin, found->len); + } + + IF_MACRO("{tls_version}") + { + 
rspamd_http_message_add_header_len(msg, TLS_VERSION_HEADER, + found->begin, found->len); + } + + IF_MACRO("{auth_authen}") + { + rspamd_http_message_add_header_len(msg, USER_HEADER, + found->begin, found->len); + } + + IF_MACRO("{rcpt_mailer}") + { + rspamd_http_message_add_header_len(msg, MAILER_HEADER, + found->begin, found->len); + } + + if (milter_ctx->client_ca_name) { + IF_MACRO("{cert_issuer}") + { + rspamd_http_message_add_header_len(msg, CERT_ISSUER_HEADER, + found->begin, found->len); + + if (found->len == strlen(milter_ctx->client_ca_name) && + rspamd_cryptobox_memcmp(found->begin, + milter_ctx->client_ca_name, found->len) == 0) { + msg_debug_milter("process certificate issued by %T", found); + IF_MACRO("{cert_subject}") + { + rspamd_http_message_add_header_len(msg, USER_HEADER, + found->begin, found->len); + } + } + else { + msg_debug_milter("skip certificate issued by %T", found); + } + } + } + else { + IF_MACRO("{cert_issuer}") + { + rspamd_http_message_add_header_len(msg, CERT_ISSUER_HEADER, + found->begin, found->len); + } + } + + if (!session->hostname || session->hostname->len == 0) { + IF_MACRO("{client_name}") + { + if (!(found->len == sizeof("unknown") - 1 && + memcmp(found->begin, "unknown", + sizeof("unknown") - 1) == 0)) { + rspamd_http_message_add_header_len(msg, HOSTNAME_HEADER, + found->begin, found->len); + } + else { + msg_debug_milter("skip unknown hostname from being added"); + } + } + } + + IF_MACRO("{daemon_name}") + { + /* Postfix style */ + rspamd_http_message_add_header_len(msg, MTA_NAME_HEADER, + found->begin, found->len); + } + else + { + /* Sendmail style */ + IF_MACRO("{j}") + { + rspamd_http_message_add_header_len(msg, MTA_NAME_HEADER, + found->begin, found->len); + } + else + { + IF_MACRO("j") + { + rspamd_http_message_add_header_len(msg, MTA_NAME_HEADER, + found->begin, found->len); + } + } + } +} + +struct rspamd_http_message * +rspamd_milter_to_http(struct rspamd_milter_session *session) +{ + struct rspamd_http_message 
*msg; + guint i; + struct rspamd_email_address *rcpt; + struct rspamd_milter_private *priv = session->priv; + + g_assert(session != NULL); + + msg = rspamd_http_new_message(HTTP_REQUEST); + + msg->url = rspamd_fstring_assign(msg->url, "/" MSG_CMD_CHECK_V2, + sizeof("/" MSG_CMD_CHECK_V2) - 1); + + if (session->message) { + rspamd_http_message_set_body_from_fstring_steal(msg, session->message); + session->message = NULL; + } + + if (session->hostname && RSPAMD_FSTRING_LEN(session->hostname) > 0) { + if (!(session->hostname->len == sizeof("unknown") - 1 && + memcmp(RSPAMD_FSTRING_DATA(session->hostname), "unknown", + sizeof("unknown") - 1) == 0)) { + rspamd_http_message_add_header_fstr(msg, HOSTNAME_HEADER, + session->hostname); + } + else { + msg_debug_milter("skip unknown hostname from being added"); + } + } + + if (session->helo && session->helo->len > 0) { + rspamd_http_message_add_header_fstr(msg, HELO_HEADER, + session->helo); + } + + if (session->from) { + rspamd_http_message_add_header_len(msg, FROM_HEADER, + session->from->raw, session->from->raw_len); + } + + if (session->rcpts) { + PTR_ARRAY_FOREACH(session->rcpts, i, rcpt) + { + rspamd_http_message_add_header_len(msg, RCPT_HEADER, + rcpt->raw, rcpt->raw_len); + } + } + + if (session->addr) { + if (rspamd_inet_address_get_af(session->addr) != AF_UNIX) { + rspamd_http_message_add_header(msg, IP_ADDR_HEADER, + rspamd_inet_address_to_string_pretty(session->addr)); + } + else { + rspamd_http_message_add_header(msg, IP_ADDR_HEADER, + rspamd_inet_address_to_string(session->addr)); + } + } + + rspamd_milter_macro_http(session, msg); + rspamd_http_message_add_header(msg, FLAGS_HEADER, "milter,body_block"); + + return msg; +} + +void * +rspamd_milter_update_userdata(struct rspamd_milter_session *session, + void *ud) +{ + struct rspamd_milter_private *priv = session->priv; + void *prev_ud; + + prev_ud = priv->ud; + priv->ud = ud; + + return prev_ud; +} + +static void +rspamd_milter_remove_header_safe(struct 
rspamd_milter_session *session, + const gchar *key, gint nhdr) +{ + gint i; + GString *hname, *hvalue; + struct rspamd_milter_private *priv = session->priv; + khiter_t k; + GArray *ar; + + k = kh_get(milter_headers_hash_t, priv->headers, (char *) key); + + if (k != kh_end(priv->headers)) { + ar = kh_val(priv->headers, k); + + hname = g_string_new(key); + hvalue = g_string_new(""); + + if (nhdr > 0) { + if (ar->len >= nhdr) { + rspamd_milter_send_action(session, + RSPAMD_MILTER_CHGHEADER, + nhdr, hname, hvalue); + priv->cur_hdr--; + } + } + else if (nhdr == 0) { + /* We need to clear all headers */ + for (i = ar->len; i > 0; i--) { + rspamd_milter_send_action(session, + RSPAMD_MILTER_CHGHEADER, + i, hname, hvalue); + priv->cur_hdr--; + } + } + else { + /* Remove from the end */ + if (nhdr >= -(ar->len)) { + rspamd_milter_send_action(session, + RSPAMD_MILTER_CHGHEADER, + ar->len + nhdr + 1, hname, hvalue); + priv->cur_hdr--; + } + } + + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + + if (priv->cur_hdr < 0) { + msg_err_milter("negative header count after removing %s", key); + priv->cur_hdr = 0; + } + } +} + +static void +rspamd_milter_extract_single_header(struct rspamd_milter_session *session, + const gchar *hdr, const ucl_object_t *obj) +{ + GString *hname, *hvalue; + struct rspamd_milter_private *priv = session->priv; + gint idx = -1; + const ucl_object_t *val; + + val = ucl_object_lookup(obj, "value"); + + if (val && ucl_object_type(val) == UCL_STRING) { + const ucl_object_t *idx_obj; + gboolean has_idx = FALSE; + + idx_obj = ucl_object_lookup_any(obj, "order", + "index", NULL); + + if (idx_obj && (ucl_object_type(idx_obj) == UCL_INT || ucl_object_type(idx_obj) == UCL_FLOAT)) { + idx = ucl_object_toint(idx_obj); + has_idx = TRUE; + } + + hname = g_string_new(hdr); + hvalue = g_string_new(ucl_object_tostring(val)); + + if (has_idx) { + if (idx >= 0) { + rspamd_milter_send_action(session, + RSPAMD_MILTER_INSHEADER, + idx, + hname, hvalue); + } + else 
{ + /* Calculate negative offset */ + + if (idx == -1) { + rspamd_milter_send_action(session, + RSPAMD_MILTER_ADDHEADER, + hname, hvalue); + } + else if (-idx <= priv->cur_hdr) { + /* + * Note: We should account MTA's own "Received:" field + * which wasn't passed by Milter's header command. + */ + rspamd_milter_send_action(session, + RSPAMD_MILTER_INSHEADER, + priv->cur_hdr + idx + 2, + hname, hvalue); + } + else { + rspamd_milter_send_action(session, + RSPAMD_MILTER_INSHEADER, + 0, + hname, hvalue); + } + } + } + else { + rspamd_milter_send_action(session, + RSPAMD_MILTER_ADDHEADER, + hname, hvalue); + } + + priv->cur_hdr++; + + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + } +} + +/* + * Returns `TRUE` if action has been processed internally by this function + */ +static gboolean +rspamd_milter_process_milter_block(struct rspamd_milter_session *session, + const ucl_object_t *obj, struct rspamd_action *action) +{ + const ucl_object_t *elt, *cur; + ucl_object_iter_t it; + struct rspamd_milter_private *priv = session->priv; + GString *hname, *hvalue; + + if (obj && ucl_object_type(obj) == UCL_OBJECT) { + elt = ucl_object_lookup(obj, "remove_headers"); + /* + * remove_headers: {"name": 1, ... } + * where number is the header's position starting from '1' + */ + if (elt && ucl_object_type(elt) == UCL_OBJECT) { + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_INT) { + rspamd_milter_remove_header_safe(session, + ucl_object_key(cur), + ucl_object_toint(cur)); + } + } + } + + elt = ucl_object_lookup(obj, "add_headers"); + /* + * add_headers: {"name": "value", ... } + * name could have multiple values + * -or- (since 1.7) + * {"name": {"value": "val", "order": 0}, ... 
} + */ + if (elt && ucl_object_type(elt) == UCL_OBJECT) { + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + + const char *key_name = ucl_object_key(cur); + + if (ucl_object_type(cur) == UCL_STRING) { + /* + * Legacy support of {"name": "value", ... } with + * multiple names under the same name + */ + ucl_object_iter_t *elt_it; + const ucl_object_t *cur_elt; + + elt_it = ucl_object_iterate_new(cur); + while ((cur_elt = ucl_object_iterate_safe(elt_it, false)) != NULL) { + if (ucl_object_type(cur_elt) == UCL_STRING) { + hname = g_string_new(key_name); + hvalue = g_string_new(ucl_object_tostring(cur_elt)); + + rspamd_milter_send_action(session, + RSPAMD_MILTER_ADDHEADER, + hname, hvalue); + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + } + else { + msg_warn_milter("legacy header with name %s, that has not a string value: %s", + key_name, ucl_object_type_to_string(cur_elt->type)); + } + } + ucl_object_iterate_free(elt_it); + } + else { + if (ucl_object_type(cur) == UCL_OBJECT) { + rspamd_milter_extract_single_header(session, + key_name, cur); + } + else if (ucl_object_type(cur) == UCL_ARRAY) { + /* Multiple values for the same key */ + ucl_object_iter_t *array_it; + const ucl_object_t *array_elt; + + array_it = ucl_object_iterate_new(cur); + + while ((array_elt = ucl_object_iterate_safe(array_it, + true)) != NULL) { + rspamd_milter_extract_single_header(session, + key_name, array_elt); + } + + ucl_object_iterate_free(array_it); + } + else { + msg_warn_milter("non-legacy header with name %s, that has unsupported value type: %s", + key_name, ucl_object_type_to_string(cur->type)); + } + } + } + } + + elt = ucl_object_lookup(obj, "change_from"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + hvalue = g_string_new(ucl_object_tostring(elt)); + rspamd_milter_send_action(session, + RSPAMD_MILTER_CHGFROM, + hvalue); + g_string_free(hvalue, TRUE); + } + + elt = ucl_object_lookup(obj, "add_rcpt"); + + if (elt && 
ucl_object_type(elt) == UCL_ARRAY) { + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + hvalue = g_string_new(ucl_object_tostring(cur)); + rspamd_milter_send_action(session, + RSPAMD_MILTER_ADDRCPT, + hvalue); + g_string_free(hvalue, TRUE); + } + } + + elt = ucl_object_lookup(obj, "del_rcpt"); + + if (elt && ucl_object_type(elt) == UCL_ARRAY) { + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + hvalue = g_string_new(ucl_object_tostring(cur)); + rspamd_milter_send_action(session, + RSPAMD_MILTER_DELRCPT, + hvalue); + g_string_free(hvalue, TRUE); + } + } + + elt = ucl_object_lookup(obj, "reject"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + if (strcmp(ucl_object_tostring(elt), "discard") == 0) { + priv->discard_on_reject = TRUE; + msg_info_milter("discard message instead of rejection"); + } + else if (strcmp(ucl_object_tostring(elt), "quarantine") == 0) { + priv->quarantine_on_reject = TRUE; + msg_info_milter("quarantine message instead of rejection"); + } + else { + priv->discard_on_reject = FALSE; + priv->quarantine_on_reject = FALSE; + } + } + + elt = ucl_object_lookup(obj, "no_action"); + + if (elt && ucl_object_type(elt) == UCL_BOOLEAN) { + priv->no_action = ucl_object_toboolean(elt); + } + } + + if (action->action_type == METRIC_ACTION_ADD_HEADER) { + elt = ucl_object_lookup(obj, "spam_header"); + + if (elt) { + if (ucl_object_type(elt) == UCL_STRING) { + rspamd_milter_remove_header_safe(session, + milter_ctx->spam_header, + 0); + + hname = g_string_new(milter_ctx->spam_header); + hvalue = g_string_new(ucl_object_tostring(elt)); + rspamd_milter_send_action(session, RSPAMD_MILTER_CHGHEADER, + (guint32) 1, hname, hvalue); + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + + return TRUE; + } + else if (ucl_object_type(elt) == UCL_OBJECT) { + it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + 
rspamd_milter_remove_header_safe(session, + ucl_object_key(cur), + 0); + + hname = g_string_new(ucl_object_key(cur)); + hvalue = g_string_new(ucl_object_tostring(cur)); + rspamd_milter_send_action(session, RSPAMD_MILTER_CHGHEADER, + (guint32) 1, hname, hvalue); + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + } + + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + + return TRUE; + } + } + } + + return FALSE; +} + +void rspamd_milter_send_task_results(struct rspamd_milter_session *session, + const ucl_object_t *results, + const gchar *new_body, + gsize bodylen) +{ + const ucl_object_t *elt; + struct rspamd_milter_private *priv = session->priv; + const gchar *str_action; + struct rspamd_action *action; + rspamd_fstring_t *xcode = NULL, *rcode = NULL, *reply = NULL; + GString *hname, *hvalue; + gboolean processed = FALSE; + + if (results == NULL) { + msg_err_milter("cannot find scan results, tempfail"); + rspamd_milter_send_action(session, RSPAMD_MILTER_TEMPFAIL); + + goto cleanup; + } + + elt = ucl_object_lookup(results, "action"); + + if (!elt) { + msg_err_milter("cannot find action in results, tempfail"); + rspamd_milter_send_action(session, RSPAMD_MILTER_TEMPFAIL); + + goto cleanup; + } + + str_action = ucl_object_tostring(elt); + action = rspamd_config_get_action(milter_ctx->cfg, str_action); + + if (action == NULL) { + msg_err_milter("action %s has not been registered", str_action); + rspamd_milter_send_action(session, RSPAMD_MILTER_TEMPFAIL); + + goto cleanup; + } + + elt = ucl_object_lookup(results, "messages"); + if (elt) { + const ucl_object_t *smtp_res; + const gchar *msg; + gsize len = 0; + + smtp_res = ucl_object_lookup(elt, "smtp_message"); + + if (smtp_res) { + msg = ucl_object_tolstring(smtp_res, &len); + reply = rspamd_fstring_new_init(msg, len); + } + } + + /* Deal with milter headers */ + elt = ucl_object_lookup(results, "milter"); + + if (elt) { + processed = rspamd_milter_process_milter_block(session, elt, action); + } + + 
/* DKIM-Signature */ + elt = ucl_object_lookup(results, "dkim-signature"); + + if (elt) { + hname = g_string_new(RSPAMD_MILTER_DKIM_HEADER); + + if (ucl_object_type(elt) == UCL_STRING) { + hvalue = g_string_new(ucl_object_tostring(elt)); + + rspamd_milter_send_action(session, RSPAMD_MILTER_INSHEADER, + 1, hname, hvalue); + + g_string_free(hvalue, TRUE); + } + else { + ucl_object_iter_t it; + const ucl_object_t *cur; + int i = 1; + + it = ucl_object_iterate_new(elt); + + while ((cur = ucl_object_iterate_safe(it, true)) != NULL) { + hvalue = g_string_new(ucl_object_tostring(cur)); + + rspamd_milter_send_action(session, RSPAMD_MILTER_INSHEADER, + i++, hname, hvalue); + + g_string_free(hvalue, TRUE); + } + + ucl_object_iterate_free(it); + } + + g_string_free(hname, TRUE); + } + + if (processed) { + goto cleanup; + } + + if (new_body) { + rspamd_milter_send_action(session, RSPAMD_MILTER_REPLBODY, + bodylen, new_body); + } + + if (priv->no_action) { + msg_info_milter("do not apply action %s, no_action is set", + str_action); + hname = g_string_new(RSPAMD_MILTER_ACTION_HEADER); + hvalue = g_string_new(str_action); + + rspamd_milter_send_action(session, RSPAMD_MILTER_ADDHEADER, + hname, hvalue); + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + + goto cleanup; + } + + switch (action->action_type) { + case METRIC_ACTION_REJECT: + if (priv->discard_on_reject) { + rspamd_milter_send_action(session, RSPAMD_MILTER_DISCARD); + } + else if (priv->quarantine_on_reject) { + /* TODO: be more flexible about SMTP messages */ + rspamd_milter_send_action(session, RSPAMD_MILTER_QUARANTINE, + RSPAMD_MILTER_QUARANTINE_MESSAGE); + + /* Quarantine also requires accept action, all hail Sendmail */ + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + } + else { + rcode = rspamd_fstring_new_init(RSPAMD_MILTER_RCODE_REJECT, + sizeof(RSPAMD_MILTER_RCODE_REJECT) - 1); + xcode = 
rspamd_fstring_new_init(RSPAMD_MILTER_XCODE_REJECT, + sizeof(RSPAMD_MILTER_XCODE_REJECT) - 1); + + if (!reply) { + if (milter_ctx->reject_message == NULL) { + reply = rspamd_fstring_new_init( + RSPAMD_MILTER_REJECT_MESSAGE, + sizeof(RSPAMD_MILTER_REJECT_MESSAGE) - 1); + } + else { + reply = rspamd_fstring_new_init(milter_ctx->reject_message, + strlen(milter_ctx->reject_message)); + } + } + + rspamd_milter_set_reply(session, rcode, xcode, reply); + } + break; + case METRIC_ACTION_SOFT_REJECT: + rcode = rspamd_fstring_new_init(RSPAMD_MILTER_RCODE_TEMPFAIL, + sizeof(RSPAMD_MILTER_RCODE_TEMPFAIL) - 1); + xcode = rspamd_fstring_new_init(RSPAMD_MILTER_XCODE_TEMPFAIL, + sizeof(RSPAMD_MILTER_XCODE_TEMPFAIL) - 1); + + if (!reply) { + reply = rspamd_fstring_new_init(RSPAMD_MILTER_TEMPFAIL_MESSAGE, + sizeof(RSPAMD_MILTER_TEMPFAIL_MESSAGE) - 1); + } + + rspamd_milter_set_reply(session, rcode, xcode, reply); + break; + + case METRIC_ACTION_REWRITE_SUBJECT: + elt = ucl_object_lookup(results, "subject"); + + if (elt) { + hname = g_string_new("Subject"); + hvalue = g_string_new(ucl_object_tostring(elt)); + + rspamd_milter_send_action(session, RSPAMD_MILTER_CHGHEADER, + (guint32) 1, hname, hvalue); + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + } + + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + break; + + case METRIC_ACTION_ADD_HEADER: + /* Remove existing headers */ + rspamd_milter_remove_header_safe(session, + milter_ctx->spam_header, + 0); + + hname = g_string_new(milter_ctx->spam_header); + hvalue = g_string_new("Yes"); + rspamd_milter_send_action(session, RSPAMD_MILTER_CHGHEADER, + (guint32) 1, hname, hvalue); + g_string_free(hname, TRUE); + g_string_free(hvalue, TRUE); + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + break; + + case METRIC_ACTION_QUARANTINE: + /* TODO: be more flexible about SMTP messages */ + rspamd_milter_send_action(session, RSPAMD_MILTER_QUARANTINE, + RSPAMD_MILTER_QUARANTINE_MESSAGE); + + /* Quarantine also 
requires accept action, all hail Sendmail */ + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + break; + case METRIC_ACTION_DISCARD: + rspamd_milter_send_action(session, RSPAMD_MILTER_DISCARD); + break; + case METRIC_ACTION_GREYLIST: + case METRIC_ACTION_NOACTION: + default: + rspamd_milter_send_action(session, RSPAMD_MILTER_ACCEPT); + break; + } + +cleanup: + rspamd_fstring_free(rcode); + rspamd_fstring_free(xcode); + rspamd_fstring_free(reply); + + rspamd_milter_session_reset(session, RSPAMD_MILTER_RESET_ABORT); +} + +void rspamd_milter_init_library(const struct rspamd_milter_context *ctx) +{ + milter_ctx = ctx; +} + +rspamd_mempool_t * +rspamd_milter_get_session_pool(struct rspamd_milter_session *session) +{ + struct rspamd_milter_private *priv = session->priv; + + return priv->pool; +} diff --git a/src/libserver/milter.h b/src/libserver/milter.h new file mode 100644 index 0000000..096cda8 --- /dev/null +++ b/src/libserver/milter.h @@ -0,0 +1,188 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RSPAMD_MILTER_H +#define RSPAMD_MILTER_H + +#include "config.h" +#include "fstring.h" +#include "addr.h" +#include "contrib/libucl/ucl.h" +#include "contrib/libev/ev.h" +#include "ref.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum rspamd_milter_reply { + RSPAMD_MILTER_ADDRCPT = '+', + RSPAMD_MILTER_DELRCPT = '-', + RSPAMD_MILTER_ACCEPT = 'a', + RSPAMD_MILTER_CONTINUE = 'c', + RSPAMD_MILTER_DISCARD = 'd', + RSPAMD_MILTER_CHGFROM = 'e', + RSPAMD_MILTER_ADDHEADER = 'h', + RSPAMD_MILTER_CHGHEADER = 'm', + RSPAMD_MILTER_INSHEADER = 'i', + RSPAMD_MILTER_REPLBODY = 'b', + RSPAMD_MILTER_REJECT = 'r', + RSPAMD_MILTER_TEMPFAIL = 't', + RSPAMD_MILTER_REPLYCODE = 'y', + RSPAMD_MILTER_OPTNEG = 'O', + RSPAMD_MILTER_PROGRESS = 'p', + RSPAMD_MILTER_QUARANTINE = 'q', +}; + +struct rspamd_email_address; +struct ev_loop; +struct rspamd_http_message; +struct rspamd_config; + +struct rspamd_milter_context { + const gchar *spam_header; + const gchar *client_ca_name; + const gchar *reject_message; + void *sessions_cache; + struct rspamd_config *cfg; + gboolean discard_on_reject; + gboolean quarantine_on_reject; +}; + +struct rspamd_milter_session { + GHashTable *macros; + rspamd_inet_addr_t *addr; + struct rspamd_email_address *from; + GPtrArray *rcpts; + rspamd_fstring_t *helo; + rspamd_fstring_t *hostname; + rspamd_fstring_t *message; + void *priv; + ref_entry_t ref; +}; + +typedef void (*rspamd_milter_finish)(gint fd, + struct rspamd_milter_session *session, void *ud); + +typedef void (*rspamd_milter_error)(gint fd, + struct rspamd_milter_session *session, + void *ud, GError *err); + +/** + * Handles socket with milter protocol + * @param fd + * @param finish_cb + * @param error_cb + * @param ud + * @return + */ +gboolean rspamd_milter_handle_socket(gint fd, ev_tstamp timeout, + rspamd_mempool_t *pool, + struct ev_loop *ev_base, rspamd_milter_finish finish_cb, + rspamd_milter_error error_cb, void *ud); + +/** + * Updates userdata for a session, returns previous 
userdata + * @param session + * @param ud + * @return + */ +void *rspamd_milter_update_userdata(struct rspamd_milter_session *session, + void *ud); + +/** + * Sets SMTP reply string + * @param session + * @param rcode + * @param xcode + * @param reply + * @return + */ +gboolean rspamd_milter_set_reply(struct rspamd_milter_session *session, + rspamd_fstring_t *rcode, + rspamd_fstring_t *xcode, + rspamd_fstring_t *reply); + +/** + * Send some action to the MTA + * @param session + * @param act + * @return + */ +gboolean rspamd_milter_send_action(struct rspamd_milter_session *session, + enum rspamd_milter_reply act, ...); + +/** + * Adds some header + * @param session + * @param name + * @param value + * @return + */ +gboolean rspamd_milter_add_header(struct rspamd_milter_session *session, + GString *name, GString *value); + +/** + * Removes some header + * @param session + * @param name + * @return + */ +gboolean rspamd_milter_del_header(struct rspamd_milter_session *session, + GString *name); + +void rspamd_milter_session_unref(struct rspamd_milter_session *session); + +struct rspamd_milter_session *rspamd_milter_session_ref( + struct rspamd_milter_session *session); + +/** + * Converts milter session to HTTP session that is suitable for Rspamd + * @param session + * @return + */ +struct rspamd_http_message *rspamd_milter_to_http( + struct rspamd_milter_session *session); + +/** + * Sends task results to the MTA + * @param session + * @param results + * @param new_body + * @param bodylen + */ +void rspamd_milter_send_task_results(struct rspamd_milter_session *session, + const ucl_object_t *results, + const gchar *new_body, + gsize bodylen); + +/** + * Init internal milter context + * @param ctx milter context; the pointer itself is stored, the structure is not copied + */ +void rspamd_milter_init_library(const struct rspamd_milter_context *ctx); + +/** + * Returns pool for a session + * @param session + * @return + */ +rspamd_mempool_t *rspamd_milter_get_session_pool( + struct rspamd_milter_session *session); + +#ifdef 
__cplusplus +} +#endif + +#endif diff --git a/src/libserver/milter_internal.h b/src/libserver/milter_internal.h new file mode 100644 index 0000000..bc292d3 --- /dev/null +++ b/src/libserver/milter_internal.h @@ -0,0 +1,176 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_MILTER_INTERNAL_H +#define RSPAMD_MILTER_INTERNAL_H + +#include "config.h" +#include "libutil/mem_pool.h" +#include "contrib/libev/ev.h" +#include "khash.h" +#include "libutil/str_util.h" +#include "libutil/libev_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum rspamd_milter_state { + st_len_1 = 0, + st_len_2, + st_len_3, + st_len_4, + st_read_cmd, + st_read_data +}; + +struct rspamd_milter_parser { + rspamd_fstring_t *buf; + goffset pos; + goffset cmd_start; + gsize datalen; + enum rspamd_milter_state state; + gchar cur_cmd; +}; + +struct rspamd_milter_outbuf { + rspamd_fstring_t *buf; + goffset pos; + struct rspamd_milter_outbuf *next, *prev; +}; + +enum rspamd_milter_io_state { + RSPAMD_MILTER_READ_MORE, + RSPAMD_MILTER_WRITE_REPLY, + RSPAMD_MILTER_WANNA_DIE, + RSPAMD_MILTER_WRITE_AND_DIE, + RSPAMD_MILTER_PONG_AND_DIE, +}; + +KHASH_INIT(milter_headers_hash_t, char *, GArray *, true, + rspamd_strcase_hash, rspamd_strcase_equal); + +struct rspamd_milter_private { + struct rspamd_milter_parser parser; + struct rspamd_io_ev ev; + struct rspamd_milter_outbuf *out_chain; + struct ev_loop *event_loop; + 
rspamd_mempool_t *pool; + khash_t(milter_headers_hash_t) * headers; + gint cur_hdr; + rspamd_milter_finish fin_cb; + rspamd_milter_error err_cb; + void *ud; + enum rspamd_milter_io_state state; + int fd; + gboolean discard_on_reject; + gboolean quarantine_on_reject; + gboolean no_action; +}; + +enum rspamd_milter_io_cmd { + RSPAMD_MILTER_CMD_ABORT = 'A', /* Abort */ + RSPAMD_MILTER_CMD_BODY = 'B', /* Body chunk */ + RSPAMD_MILTER_CMD_CONNECT = 'C', /* Connection information */ + RSPAMD_MILTER_CMD_MACRO = 'D', /* Define macro */ + RSPAMD_MILTER_CMD_BODYEOB = 'E', /* final body chunk (end of message) */ + RSPAMD_MILTER_CMD_HELO = 'H', /* HELO/EHLO */ + RSPAMD_MILTER_CMD_QUIT_NC = 'K', /* QUIT but new connection follows */ + RSPAMD_MILTER_CMD_HEADER = 'L', /* Header */ + RSPAMD_MILTER_CMD_MAIL = 'M', /* MAIL from */ + RSPAMD_MILTER_CMD_EOH = 'N', /* EOH */ + RSPAMD_MILTER_CMD_OPTNEG = 'O', /* Option negotiation */ + RSPAMD_MILTER_CMD_QUIT = 'Q', /* QUIT */ + RSPAMD_MILTER_CMD_RCPT = 'R', /* RCPT to */ + RSPAMD_MILTER_CMD_DATA = 'T', /* DATA */ + RSPAMD_MILTER_CMD_UNKNOWN = 'U' /* Any unknown command */ +}; + +/* + * Protocol flags + */ +#define RSPAMD_MILTER_FLAG_NOUNKNOWN (1L << 8) /* filter does not want unknown cmd */ +#define RSPAMD_MILTER_FLAG_NODATA (1L << 9) /* filter does not want DATA */ +#define RSPAMD_MILTER_FLAG_NR_HDR (1L << 7) /* filter won't reply for header */ +#define RSPAMD_MILTER_FLAG_SKIP (1L << 10) /* MTA supports SMFIR_SKIP */ +#define RSPAMD_MILTER_FLAG_RCPT_REJ (1L << 11) /* filter wants rejected RCPTs */ +#define RSPAMD_MILTER_FLAG_NR_CONN (1L << 12) /* filter won't reply for connect */ +#define RSPAMD_MILTER_FLAG_NR_HELO (1L << 13) /* filter won't reply for HELO */ +#define RSPAMD_MILTER_FLAG_NR_MAIL (1L << 14) /* filter won't reply for MAIL */ +#define RSPAMD_MILTER_FLAG_NR_RCPT (1L << 15) /* filter won't reply for RCPT */ +#define RSPAMD_MILTER_FLAG_NR_DATA (1L << 16) /* filter won't reply for DATA */ +#define RSPAMD_MILTER_FLAG_NR_UNKN (1L 
<< 17) /* filter won't reply for UNKNOWN */ +#define RSPAMD_MILTER_FLAG_NR_EOH (1L << 18) /* filter won't reply for eoh */ +#define RSPAMD_MILTER_FLAG_NR_BODY (1L << 19) /* filter won't reply for body chunk */ + +/* + * For now, we specify that we want to reply just after EOM + */ +#define RSPAMD_MILTER_FLAG_NOREPLY_MASK \ + (RSPAMD_MILTER_FLAG_NR_CONN | RSPAMD_MILTER_FLAG_NR_HELO | \ + RSPAMD_MILTER_FLAG_NR_MAIL | RSPAMD_MILTER_FLAG_NR_RCPT | \ + RSPAMD_MILTER_FLAG_NR_DATA | RSPAMD_MILTER_FLAG_NR_UNKN | \ + RSPAMD_MILTER_FLAG_NR_HDR | RSPAMD_MILTER_FLAG_NR_EOH | \ + RSPAMD_MILTER_FLAG_NR_BODY) + +/* + * Options that the filter may send at initial handshake time, and message + * modifications that the filter may request at the end of the message body. + */ +#define RSPAMD_MILTER_FLAG_ADDHDRS (1L << 0) /* filter may add headers */ +#define RSPAMD_MILTER_FLAG_CHGBODY (1L << 1) /* filter may replace body */ +#define RSPAMD_MILTER_FLAG_ADDRCPT (1L << 2) /* filter may add recipients */ +#define RSPAMD_MILTER_FLAG_DELRCPT (1L << 3) /* filter may delete recipients */ +#define RSPAMD_MILTER_FLAG_CHGHDRS (1L << 4) /* filter may change/delete headers */ +#define RSPAMD_MILTER_FLAG_QUARANTINE (1L << 5) /* filter may request quarantine */ + +#define RSPAMD_MILTER_ACTIONS_MASK \ + (RSPAMD_MILTER_FLAG_ADDHDRS | RSPAMD_MILTER_FLAG_ADDRCPT | \ + RSPAMD_MILTER_FLAG_DELRCPT | RSPAMD_MILTER_FLAG_CHGHDRS | \ + RSPAMD_MILTER_FLAG_CHGBODY | RSPAMD_MILTER_FLAG_QUARANTINE) + +enum rspamd_milter_connect_proto { + RSPAMD_MILTER_CONN_UNKNOWN = 'U', + RSPAMD_MILTER_CONN_UNIX = 'L', + RSPAMD_MILTER_CONN_INET = '4', + RSPAMD_MILTER_CONN_INET6 = '6', +}; + +/* + * Rspamd supports just version 6 of the protocol, failing all versions below + * this one + */ +#define RSPAMD_MILTER_PROTO_VER 6 + +#define RSPAMD_MILTER_MESSAGE_CHUNK 65536 + +#define RSPAMD_MILTER_RCODE_REJECT "554" +#define RSPAMD_MILTER_RCODE_TEMPFAIL "451" +#define RSPAMD_MILTER_RCODE_LATER "452" +#define RSPAMD_MILTER_XCODE_REJECT 
"5.7.1" +#define RSPAMD_MILTER_XCODE_TEMPFAIL "4.7.1" +#define RSPAMD_MILTER_REJECT_MESSAGE "Spam message rejected" +#define RSPAMD_MILTER_QUARANTINE_MESSAGE "Spam message quarantined" +#define RSPAMD_MILTER_TEMPFAIL_MESSAGE "Try again later" +#define RSPAMD_MILTER_SPAM_HEADER "X-Spam" +#define RSPAMD_MILTER_DKIM_HEADER "DKIM-Signature" +#define RSPAMD_MILTER_ACTION_HEADER "X-Rspamd-Action" + +#ifdef __cplusplus +} +#endif + +#endif
\ No newline at end of file diff --git a/src/libserver/monitored.c b/src/libserver/monitored.c new file mode 100644 index 0000000..3aebaf6 --- /dev/null +++ b/src/libserver/monitored.c @@ -0,0 +1,735 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <contrib/librdns/rdns.h> +#include "rdns.h" +#include "mem_pool.h" +#include "cfg_file.h" +#include "cryptobox.h" +#include "logger.h" +#include "contrib/uthash/utlist.h" + +static const gdouble default_monitoring_interval = 60.0; +static const guint default_max_errors = 2; +static const gdouble default_max_monitored_mult = 32; +static const gdouble default_min_monitored_mult = 0.1; +static const gdouble default_initial_monitored_mult = default_min_monitored_mult; +static const gdouble default_offline_monitored_mult = 8.0; + +struct rspamd_monitored_methods { + void *(*monitored_config)(struct rspamd_monitored *m, + struct rspamd_monitored_ctx *ctx, + const ucl_object_t *opts); + gboolean (*monitored_update)(struct rspamd_monitored *m, + struct rspamd_monitored_ctx *ctx, gpointer ud); + void (*monitored_dtor)(struct rspamd_monitored *m, + struct rspamd_monitored_ctx *ctx, gpointer ud); + gpointer ud; +}; + +struct rspamd_monitored_ctx { + struct rspamd_config *cfg; + struct rdns_resolver *resolver; + struct ev_loop *event_loop; + GPtrArray *elts; + GHashTable *helts; + mon_change_cb change_cb; + gpointer ud; + gdouble monitoring_interval; + gdouble 
max_monitored_mult; + gdouble min_monitored_mult; + gdouble initial_monitored_mult; + gdouble offline_monitored_mult; + guint max_errors; + gboolean initialized; +}; + +struct rspamd_monitored { + gchar *url; + gdouble monitoring_mult; + gdouble offline_time; + gdouble total_offline_time; + gdouble latency; + guint nchecks; + guint max_errors; + guint cur_errors; + gboolean alive; + enum rspamd_monitored_type type; + enum rspamd_monitored_flags flags; + struct rspamd_monitored_ctx *ctx; + struct rspamd_monitored_methods proc; + ev_timer periodic; + gchar tag[RSPAMD_MONITORED_TAG_LEN]; +}; + +#define msg_err_mon(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "monitored", m->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_mon(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "monitored", m->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_mon(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "monitored", m->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_notice_mon(...) rspamd_default_log_function(G_LOG_LEVEL_MESSAGE, \ + "monitored", m->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_mon(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_monitored_log_id, "monitored", m->tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(monitored) + +static inline void +rspamd_monitored_propagate_error(struct rspamd_monitored *m, + const gchar *error) +{ + if (m->alive) { + if (m->cur_errors < m->max_errors) { + + m->cur_errors++; + /* Reduce timeout */ + rspamd_monitored_stop(m); + + if (m->monitoring_mult > m->ctx->min_monitored_mult) { + if (m->monitoring_mult < 1.0) { + m->monitoring_mult = 1.0; + } + else { + m->monitoring_mult /= 2.0; + } + } + + msg_debug_mon("%s on resolving %s, %d retries left; next check in %.2f", + error, m->url, m->max_errors - m->cur_errors, + m->ctx->monitoring_interval * m->monitoring_mult); + + rspamd_monitored_start(m); + } + else { + msg_notice_mon("%s on resolving %s, disable object", + error, m->url); + m->alive = FALSE; + m->offline_time = rspamd_get_calendar_ticks(); + rspamd_monitored_stop(m); + m->monitoring_mult = 2.0; + rspamd_monitored_start(m); + + if (m->ctx->change_cb) { + m->ctx->change_cb(m->ctx, m, FALSE, m->ctx->ud); + } + } + } + else { + if (m->monitoring_mult < m->ctx->offline_monitored_mult) { + /* Increase timeout */ + rspamd_monitored_stop(m); + m->monitoring_mult *= 2.0; + rspamd_monitored_start(m); + } + else { + rspamd_monitored_stop(m); + m->monitoring_mult = m->ctx->offline_monitored_mult; + rspamd_monitored_start(m); + } + } +} + +static inline void +rspamd_monitored_propagate_success(struct rspamd_monitored *m, gdouble lat) +{ + gdouble t; + + m->cur_errors = 0; + + if (!m->alive) { + m->monitoring_mult = 1.0; + t = rspamd_get_calendar_ticks(); + m->total_offline_time += t - m->offline_time; + m->alive = TRUE; + msg_notice_mon("restoring %s after %.1f seconds of downtime, " + "total downtime: %.1f", + m->url, t - m->offline_time, m->total_offline_time); + m->offline_time = 0; + m->nchecks = 1; + m->latency = lat; + rspamd_monitored_stop(m); + rspamd_monitored_start(m); + + if 
(m->ctx->change_cb) { + m->ctx->change_cb(m->ctx, m, TRUE, m->ctx->ud); + } + } + else { + /* Increase monitored interval */ + if (m->monitoring_mult < m->ctx->max_monitored_mult) { + if (m->monitoring_mult < 1.0) { + /* Upgrade fast from the initial mult */ + m->monitoring_mult = 1.0; + } + else { + m->monitoring_mult *= 2.0; + } + } + else { + m->monitoring_mult = m->ctx->max_monitored_mult; + } + m->latency = (lat + m->latency * m->nchecks) / (m->nchecks + 1); + m->nchecks++; + } +} + +static void +rspamd_monitored_periodic(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_monitored *m = (struct rspamd_monitored *) w->data; + gdouble jittered; + gboolean ret = FALSE; + + if (m->proc.monitored_update) { + ret = m->proc.monitored_update(m, m->ctx, m->proc.ud); + } + + jittered = rspamd_time_jitter(m->ctx->monitoring_interval * m->monitoring_mult, + 0.0); + + if (ret) { + m->periodic.repeat = jittered; + ev_timer_again(EV_A_ & m->periodic); + } +} + +struct rspamd_dns_monitored_conf { + enum rdns_request_type rt; + GString *request; + radix_compressed_t *expected; + struct rspamd_monitored *m; + gint expected_code; + gdouble check_tm; +}; + +static void +rspamd_monitored_dns_random(struct rspamd_monitored *m, + struct rspamd_dns_monitored_conf *conf) +{ + gchar random_prefix[32]; + const gchar dns_chars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_"; + gint len; + + len = rspamd_random_uint64_fast() % sizeof(random_prefix); + + if (len < 8) { + len = 8; + } + + for (guint i = 0; i < len; i++) { + guint idx = rspamd_random_uint64_fast() % (sizeof(dns_chars) - 1); + random_prefix[i] = dns_chars[idx]; + } + + conf->request->len = 0; + rspamd_printf_gstring(conf->request, "%*.s.%s", len, random_prefix, + m->url); +} + +static void * +rspamd_monitored_dns_conf(struct rspamd_monitored *m, + struct rspamd_monitored_ctx *ctx, + const ucl_object_t *opts) +{ + struct rspamd_dns_monitored_conf *conf; + const ucl_object_t *elt; + gint rt; + GString 
*req = g_string_sized_new(127); + + conf = g_malloc0(sizeof(*conf)); + conf->rt = RDNS_REQUEST_A; + conf->m = m; + /* -1 means that no specific reply code is expected */ + conf->expected_code = -1; + + if (opts) { + elt = ucl_object_lookup(opts, "type"); + + if (elt) { + rt = rdns_type_fromstr(ucl_object_tostring(elt)); + + if (rt != -1) { + conf->rt = rt; + } + else { + msg_err_mon("invalid resolve type: %s", + ucl_object_tostring(elt)); + } + } + + if (!(m->flags & RSPAMD_MONITORED_RANDOM)) { + /* Prefix is useless for random monitored */ + elt = ucl_object_lookup(opts, "prefix"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + rspamd_printf_gstring(req, "%s.", ucl_object_tostring(elt)); + } + } + + /* Expected networks: either a single string or an array of strings */ + elt = ucl_object_lookup(opts, "ipnet"); + + if (elt) { + if (ucl_object_type(elt) == UCL_STRING) { + radix_add_generic_iplist(ucl_object_tostring(elt), + &conf->expected, FALSE, NULL); + } + else if (ucl_object_type(elt) == UCL_ARRAY) { + const ucl_object_t *cur; + ucl_object_iter_t it = NULL; + + while ((cur = ucl_object_iterate(elt, &it, true)) != NULL) { + /* Add the current array element, not the enclosing array object */ + if (ucl_object_type(cur) == UCL_STRING) { + radix_add_generic_iplist(ucl_object_tostring(cur), + &conf->expected, FALSE, NULL); + } + } + } + } + + elt = ucl_object_lookup(opts, "rcode"); + if (elt) { + rt = rdns_rcode_fromstr(ucl_object_tostring(elt)); + + if (rt != -1) { + conf->expected_code = rt; + } + else { + msg_err_mon("invalid resolve rcode: %s", + ucl_object_tostring(elt)); + } + } + } + + if (!(m->flags & RSPAMD_MONITORED_RANDOM)) { + rspamd_printf_gstring(req, "%s", m->url); + } + + conf->request = req; + + return conf; +} + +static void +rspamd_monitored_dns_cb(struct rdns_reply *reply, void *arg) +{ + struct rspamd_dns_monitored_conf *conf = arg; + struct rspamd_monitored *m; + struct rdns_reply_entry *cur; + gboolean is_special_reply = FALSE; + gdouble lat; + + m = conf->m; + lat = rspamd_get_calendar_ticks() - conf->check_tm; + conf->check_tm = 0; + msg_debug_mon("dns callback for %s in %.2f: %s", m->url, lat, + rdns_strerror(reply->code)); + + if (reply->code == RDNS_RC_TIMEOUT) { + 
rspamd_monitored_propagate_error(m, "timeout"); + } + else if (reply->code == RDNS_RC_SERVFAIL) { + rspamd_monitored_propagate_error(m, "servfail"); + } + else if (reply->code == RDNS_RC_REFUSED) { + rspamd_monitored_propagate_error(m, "refused"); + } + else { + if (conf->expected_code != -1) { + if (reply->code != conf->expected_code) { + if (reply->code == RDNS_RC_NOREC && + conf->expected_code == RDNS_RC_NXDOMAIN) { + rspamd_monitored_propagate_success(m, lat); + } + else { + LL_FOREACH(reply->entries, cur) + { + if (cur->type == RDNS_REQUEST_A) { + if ((guint32) cur->content.a.addr.s_addr == + htonl(INADDR_LOOPBACK)) { + is_special_reply = TRUE; + } + } + } + + if (is_special_reply) { + msg_notice_mon("DNS query blocked on %s " + "(127.0.0.1 returned), " + "possibly due to high volume", + m->url); + } + else { + msg_notice_mon("DNS reply returned '%s' for %s while '%s' " + "was expected when querying for '%s'" + "(likely DNS spoofing or BL internal issues)", + rdns_strerror(reply->code), + m->url, + rdns_strerror(conf->expected_code), + conf->request->str); + } + + rspamd_monitored_propagate_error(m, "invalid return"); + } + } + else { + rspamd_monitored_propagate_success(m, lat); + } + } + else if (conf->expected) { + /* We also need to check IP */ + if (reply->code != RDNS_RC_NOERROR) { + rspamd_monitored_propagate_error(m, "no record"); + } + else { + rspamd_inet_addr_t *addr; + + addr = rspamd_inet_address_from_rnds(reply->entries); + + if (!addr) { + rspamd_monitored_propagate_error(m, + "unreadable address"); + } + else if (radix_find_compressed_addr(conf->expected, addr)) { + msg_notice_mon("bad address %s is returned when monitoring %s", + rspamd_inet_address_to_string(addr), + conf->request->str); + rspamd_monitored_propagate_error(m, + "invalid address"); + + rspamd_inet_address_free(addr); + } + else { + rspamd_monitored_propagate_success(m, lat); + rspamd_inet_address_free(addr); + } + } + } + else { + rspamd_monitored_propagate_success(m, lat); + } 
+ } +} + +static gboolean +rspamd_monitored_dns_mon(struct rspamd_monitored *m, + struct rspamd_monitored_ctx *ctx, gpointer ud) +{ + struct rspamd_dns_monitored_conf *conf = ud; + + if (m->flags & RSPAMD_MONITORED_RANDOM) { + rspamd_monitored_dns_random(m, conf); + } + + if (!rdns_make_request_full(ctx->resolver, rspamd_monitored_dns_cb, + conf, ctx->cfg->dns_timeout, ctx->cfg->dns_retransmits, + 1, conf->request->str, conf->rt)) { + msg_notice_mon("cannot make request to resolve %s (%s monitored url)", + conf->request->str, conf->m->url); + + m->cur_errors++; + rspamd_monitored_propagate_error(m, "failed to make DNS request"); + + return FALSE; + } + else { + conf->check_tm = rspamd_get_calendar_ticks(); + } + + return TRUE; +} + +void rspamd_monitored_dns_dtor(struct rspamd_monitored *m, + struct rspamd_monitored_ctx *ctx, gpointer ud) +{ + struct rspamd_dns_monitored_conf *conf = ud; + + g_string_free(conf->request, TRUE); + + if (conf->expected) { + radix_destroy_compressed(conf->expected); + } + + g_free(conf); +} + +struct rspamd_monitored_ctx * +rspamd_monitored_ctx_init(void) +{ + struct rspamd_monitored_ctx *ctx; + + ctx = g_malloc0(sizeof(*ctx)); + ctx->monitoring_interval = default_monitoring_interval; + ctx->max_errors = default_max_errors; + ctx->offline_monitored_mult = default_offline_monitored_mult; + ctx->initial_monitored_mult = default_initial_monitored_mult; + ctx->max_monitored_mult = default_max_monitored_mult; + ctx->min_monitored_mult = default_min_monitored_mult; + ctx->elts = g_ptr_array_new(); + ctx->helts = g_hash_table_new(g_str_hash, g_str_equal); + + return ctx; +} + + +void rspamd_monitored_ctx_config(struct rspamd_monitored_ctx *ctx, + struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rdns_resolver *resolver, + mon_change_cb change_cb, + gpointer ud) +{ + struct rspamd_monitored *m; + guint i; + + g_assert(ctx != NULL); + ctx->event_loop = ev_base; + ctx->resolver = resolver; + ctx->cfg = cfg; + ctx->initialized = 
TRUE; + ctx->change_cb = change_cb; + ctx->ud = ud; + + if (cfg->monitored_interval != 0) { + ctx->monitoring_interval = cfg->monitored_interval; + } + + /* Start all events */ + for (i = 0; i < ctx->elts->len; i++) { + m = g_ptr_array_index(ctx->elts, i); + m->monitoring_mult = ctx->initial_monitored_mult; + rspamd_monitored_start(m); + m->monitoring_mult = 1.0; + } +} + + +struct ev_loop * +rspamd_monitored_ctx_get_ev_base(struct rspamd_monitored_ctx *ctx) +{ + return ctx->event_loop; +} + + +/* + * Allocates a monitored object for `line`, configures it through the + * type specific `monitored_config` callback, derives its tag from the url + * and `loc`, adds it to `ctx` and starts it if the context already has an + * event loop. Returns NULL if the type is not supported or if the + * configuration callback returns NULL. + */ +struct rspamd_monitored * +rspamd_monitored_create_(struct rspamd_monitored_ctx *ctx, + const gchar *line, + enum rspamd_monitored_type type, + enum rspamd_monitored_flags flags, + const ucl_object_t *opts, + const gchar *loc) +{ + struct rspamd_monitored *m; + rspamd_cryptobox_hash_state_t st; + gchar *cksum_encoded, cksum[rspamd_cryptobox_HASHBYTES]; + + g_assert(ctx != NULL); + + m = g_malloc0(sizeof(*m)); + m->type = type; + m->flags = flags; + + m->url = g_strdup(line); + m->ctx = ctx; + m->monitoring_mult = ctx->initial_monitored_mult; + m->max_errors = ctx->max_errors; + m->alive = TRUE; + + if (type == RSPAMD_MONITORED_DNS) { + m->proc.monitored_update = rspamd_monitored_dns_mon; + m->proc.monitored_config = rspamd_monitored_dns_conf; + m->proc.monitored_dtor = rspamd_monitored_dns_dtor; + } + else { + /* The url copy is owned by the object, release it as well */ + g_free(m->url); + g_free(m); + + return NULL; + } + + if (opts) { + const ucl_object_t *rnd_obj; + + rnd_obj = ucl_object_lookup(opts, "random"); + + if (rnd_obj && ucl_object_type(rnd_obj) == UCL_BOOLEAN) { + if (ucl_object_toboolean(rnd_obj)) { + m->flags |= RSPAMD_MONITORED_RANDOM; + } + } + } + + m->proc.ud = m->proc.monitored_config(m, ctx, opts); + + if (m->proc.ud == NULL) { + /* The url copy is owned by the object, release it as well */ + g_free(m->url); + g_free(m); + + return NULL; + } + + /* Create a persistent tag */ + rspamd_cryptobox_hash_init(&st, NULL, 0); + rspamd_cryptobox_hash_update(&st, m->url, strlen(m->url)); + rspamd_cryptobox_hash_update(&st, loc, strlen(loc)); + rspamd_cryptobox_hash_final(&st, cksum); + cksum_encoded = 
rspamd_encode_base32(cksum, sizeof(cksum), RSPAMD_BASE32_DEFAULT); + rspamd_strlcpy(m->tag, cksum_encoded, sizeof(m->tag)); + + if (g_hash_table_lookup(ctx->helts, m->tag) != NULL) { + msg_err("monitored error: tag collision detected for %s; " + "url: %s", + m->tag, m->url); + } + else { + g_hash_table_insert(ctx->helts, m->tag, m); + } + + g_free(cksum_encoded); + + g_ptr_array_add(ctx->elts, m); + + if (ctx->event_loop) { + rspamd_monitored_start(m); + } + + return m; +} + +gboolean +rspamd_monitored_alive(struct rspamd_monitored *m) +{ + g_assert(m != NULL); + + return m->alive; +} + +gboolean +rspamd_monitored_set_alive(struct rspamd_monitored *m, gboolean alive) +{ + gboolean st; + + g_assert(m != NULL); + st = m->alive; + m->alive = alive; + + return st; +} + +gdouble +rspamd_monitored_offline_time(struct rspamd_monitored *m) +{ + g_assert(m != NULL); + + if (m->offline_time > 0) { + return rspamd_get_calendar_ticks() - m->offline_time; + } + + return 0; +} + +gdouble +rspamd_monitored_total_offline_time(struct rspamd_monitored *m) +{ + g_assert(m != NULL); + + if (m->offline_time > 0) { + return rspamd_get_calendar_ticks() - m->offline_time + m->total_offline_time; + } + + + return m->total_offline_time; +} + +gdouble +rspamd_monitored_latency(struct rspamd_monitored *m) +{ + g_assert(m != NULL); + + return m->latency; +} + +void rspamd_monitored_stop(struct rspamd_monitored *m) +{ + g_assert(m != NULL); + + ev_timer_stop(m->ctx->event_loop, &m->periodic); +} + +void rspamd_monitored_start(struct rspamd_monitored *m) +{ + gdouble jittered; + + g_assert(m != NULL); + jittered = rspamd_time_jitter(m->ctx->monitoring_interval * m->monitoring_mult, + 0.0); + + msg_debug_mon("started monitored object %s in %.2f seconds", m->url, jittered); + + if (ev_can_stop(&m->periodic)) { + ev_timer_stop(m->ctx->event_loop, &m->periodic); + } + + m->periodic.data = m; + ev_timer_init(&m->periodic, rspamd_monitored_periodic, jittered, 0.0); + ev_timer_start(m->ctx->event_loop, 
&m->periodic); +} + +void rspamd_monitored_ctx_destroy(struct rspamd_monitored_ctx *ctx) +{ + struct rspamd_monitored *m; + guint i; + + g_assert(ctx != NULL); + + for (i = 0; i < ctx->elts->len; i++) { + m = g_ptr_array_index(ctx->elts, i); + rspamd_monitored_stop(m); + m->proc.monitored_dtor(m, m->ctx, m->proc.ud); + g_free(m->url); + g_free(m); + } + + g_ptr_array_free(ctx->elts, TRUE); + g_hash_table_unref(ctx->helts); + g_free(ctx); +} + +struct rspamd_monitored * +rspamd_monitored_by_tag(struct rspamd_monitored_ctx *ctx, + guchar tag[RSPAMD_MONITORED_TAG_LEN]) +{ + struct rspamd_monitored *res; + gchar rtag[RSPAMD_MONITORED_TAG_LEN]; + + rspamd_strlcpy(rtag, tag, sizeof(rtag)); + res = g_hash_table_lookup(ctx->helts, rtag); + + return res; +} + + +void rspamd_monitored_get_tag(struct rspamd_monitored *m, + guchar tag_out[RSPAMD_MONITORED_TAG_LEN]) +{ + g_assert(m != NULL); + + rspamd_strlcpy(tag_out, m->tag, RSPAMD_MONITORED_TAG_LEN); +}
\ No newline at end of file diff --git a/src/libserver/monitored.h b/src/libserver/monitored.h new file mode 100644 index 0000000..01f050a --- /dev/null +++ b/src/libserver/monitored.h @@ -0,0 +1,161 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_MONITORED_H_ +#define SRC_LIBSERVER_MONITORED_H_ + +#include "config.h" +#include "rdns.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_monitored; +struct rspamd_monitored_ctx; +struct rspamd_config; + +#define RSPAMD_MONITORED_TAG_LEN 32 + +enum rspamd_monitored_type { + RSPAMD_MONITORED_DNS = 0, +}; + +enum rspamd_monitored_flags { + RSPAMD_MONITORED_DEFAULT = 0u, + RSPAMD_MONITORED_RBL = (1u << 0u), + RSPAMD_MONITORED_RANDOM = (1u << 1u) +}; + +/** + * Initialize new monitored context + * @return opaque context pointer (should be configured) + */ +struct rspamd_monitored_ctx *rspamd_monitored_ctx_init(void); + +typedef void (*mon_change_cb)(struct rspamd_monitored_ctx *ctx, + struct rspamd_monitored *m, gboolean alive, + void *ud); + +/** + * Configure context for monitored objects + * @param ctx context + * @param cfg configuration + * @param ev_base events base + * @param resolver resolver object + */ +void rspamd_monitored_ctx_config(struct rspamd_monitored_ctx *ctx, + struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rdns_resolver *resolver, + mon_change_cb change_cb, + gpointer ud); + +struct ev_loop 
*rspamd_monitored_ctx_get_ev_base(struct rspamd_monitored_ctx *ctx); + +/** + * Create monitored object + * @param ctx context + * @param line string definition (e.g. hostname) + * @param type type of monitoring + * @param flags specific flags for monitoring + * @return new monitored object + */ +struct rspamd_monitored *rspamd_monitored_create_( + struct rspamd_monitored_ctx *ctx, + const gchar *line, + enum rspamd_monitored_type type, + enum rspamd_monitored_flags flags, + const ucl_object_t *opts, + const gchar *loc); + +#define rspamd_monitored_create(ctx, line, type, flags, opts) \ + rspamd_monitored_create_(ctx, line, type, flags, opts, G_STRFUNC) + +/** + * Return monitored by its tag + * @param ctx + * @param tag + * @return + */ +struct rspamd_monitored *rspamd_monitored_by_tag(struct rspamd_monitored_ctx *ctx, + guchar tag[RSPAMD_MONITORED_TAG_LEN]); + +/** + * Sets `tag_out` to the monitored tag + * @param m + * @param tag_out + */ +void rspamd_monitored_get_tag(struct rspamd_monitored *m, + guchar tag_out[RSPAMD_MONITORED_TAG_LEN]); + +/** + * Return TRUE if monitored object is alive + * @param m monitored object + * @return TRUE or FALSE + */ +gboolean rspamd_monitored_alive(struct rspamd_monitored *m); + +/** + * Force alive flag for a monitored object + * @param m monitored object + * @return TRUE or FALSE + */ +gboolean rspamd_monitored_set_alive(struct rspamd_monitored *m, gboolean alive); + +/** + * Returns the current offline time for a monitored object + * @param m + * @return + */ +gdouble rspamd_monitored_offline_time(struct rspamd_monitored *m); + +/** + * Returns the total offline time for a monitored object + * @param m + * @return + */ +gdouble rspamd_monitored_total_offline_time(struct rspamd_monitored *m); + +/** + * Returns the latency for monitored object (in seconds) + * @param m + * @return + */ +gdouble rspamd_monitored_latency(struct rspamd_monitored *m); + +/** + * Explicitly disable monitored object + * @param m + */ +void 
rspamd_monitored_stop(struct rspamd_monitored *m); + +/** + * Explicitly enable monitored object + * @param m + */ +void rspamd_monitored_start(struct rspamd_monitored *m); + +/** + * Destroy monitored context and all monitored objects inside + * @param ctx + */ +void rspamd_monitored_ctx_destroy(struct rspamd_monitored_ctx *ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_MONITORED_H_ */ diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c new file mode 100644 index 0000000..8674557 --- /dev/null +++ b/src/libserver/protocol.c @@ -0,0 +1,2185 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "message.h" +#include "utlist.h" +#include "libserver/http/http_private.h" +#include "worker_private.h" +#include "libserver/cfg_file_private.h" +#include "libmime/scan_result_private.h" +#include "lua/lua_common.h" +#include "unix-std.h" +#include "protocol_internal.h" +#include "libserver/mempool_vars_internal.h" +#include "contrib/fastutf8/fastutf8.h" +#include "task.h" +#include <math.h> + +#ifdef SYS_ZSTD +#include "zstd.h" +#else +#include "contrib/zstd/zstd.h" +#endif + +INIT_LOG_MODULE(protocol) + +#define msg_err_protocol(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "protocol", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_warn_protocol(...) 
rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "protocol", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_info_protocol(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "protocol", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_protocol(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_protocol_log_id, "protocol", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + +static GQuark +rspamd_protocol_quark(void) +{ + return g_quark_from_static_string("protocol-error"); +} + +/* + * Remove <> from the fixed string and copy it to the pool + */ +static gchar * +rspamd_protocol_escape_braces(struct rspamd_task *task, rspamd_ftok_t *in) +{ + guint nchars = 0; + const gchar *p; + rspamd_ftok_t tok; + gboolean has_obrace = FALSE; + + g_assert(in != NULL); + g_assert(in->len > 0); + + p = in->begin; + + while ((g_ascii_isspace(*p) || *p == '<') && nchars < in->len) { + if (*p == '<') { + has_obrace = TRUE; + } + + p++; + nchars++; + } + + tok.begin = p; + + p = in->begin + in->len - 1; + tok.len = in->len - nchars; + + while (g_ascii_isspace(*p) && tok.len > 0) { + p--; + tok.len--; + } + + if (has_obrace && *p == '>') { + tok.len--; + } + + return rspamd_mempool_ftokdup(task->task_pool, &tok); +} + +#define COMPARE_CMD(str, cmd, len) (sizeof(cmd) - 1 == (len) && rspamd_lc_cmp((str), (cmd), (len)) == 0) + +static gboolean +rspamd_protocol_handle_url(struct rspamd_task *task, + struct rspamd_http_message *msg) +{ + GHashTable *query_args; + GHashTableIter it; + struct http_parser_url u; + const gchar *p; + gsize pathlen; + rspamd_ftok_t *key, *value; + gpointer k, v; + + if (msg->url == NULL || msg->url->len == 0) { + g_set_error(&task->err, rspamd_protocol_quark(), 400, "missing command"); + return FALSE; + } + + if (http_parser_parse_url(msg->url->str, msg->url->len, 0, &u) != 0) { + g_set_error(&task->err, rspamd_protocol_quark(), 400, "bad request URL"); + + return FALSE; + } + + if 
(!(u.field_set & (1 << UF_PATH))) { + g_set_error(&task->err, rspamd_protocol_quark(), 400, + "bad request URL: missing path"); + + return FALSE; + } + + p = msg->url->str + u.field_data[UF_PATH].off; + pathlen = u.field_data[UF_PATH].len; + + if (*p == '/') { + p++; + pathlen--; + } + + switch (*p) { + case 'c': + case 'C': + /* check */ + if (COMPARE_CMD(p, MSG_CMD_CHECK_V2, pathlen)) { + task->cmd = CMD_CHECK_V2; + msg_debug_protocol("got checkv2 command"); + } + else if (COMPARE_CMD(p, MSG_CMD_CHECK, pathlen)) { + task->cmd = CMD_CHECK; + msg_debug_protocol("got check command"); + } + else { + goto err; + } + break; + case 's': + case 'S': + /* symbols, skip */ + if (COMPARE_CMD(p, MSG_CMD_SYMBOLS, pathlen)) { + task->cmd = CMD_CHECK; + msg_debug_protocol("got symbols -> old check command"); + } + else if (COMPARE_CMD(p, MSG_CMD_SCAN, pathlen)) { + task->cmd = CMD_CHECK; + msg_debug_protocol("got scan -> old check command"); + } + else if (COMPARE_CMD(p, MSG_CMD_SKIP, pathlen)) { + msg_debug_protocol("got skip command"); + task->cmd = CMD_SKIP; + } + else { + goto err; + } + break; + case 'p': + case 'P': + /* ping, process */ + if (COMPARE_CMD(p, MSG_CMD_PING, pathlen)) { + msg_debug_protocol("got ping command"); + task->cmd = CMD_PING; + task->flags |= RSPAMD_TASK_FLAG_SKIP; + task->processed_stages |= RSPAMD_TASK_STAGE_DONE; /* Skip all */ + } + else if (COMPARE_CMD(p, MSG_CMD_PROCESS, pathlen)) { + msg_debug_protocol("got process -> old check command"); + task->cmd = CMD_CHECK; + } + else { + goto err; + } + break; + case 'r': + case 'R': + /* report, report_ifspam */ + if (COMPARE_CMD(p, MSG_CMD_REPORT, pathlen)) { + msg_debug_protocol("got report -> old check command"); + task->cmd = CMD_CHECK; + } + else if (COMPARE_CMD(p, MSG_CMD_REPORT_IFSPAM, pathlen)) { + msg_debug_protocol("got reportifspam -> old check command"); + task->cmd = CMD_CHECK; + } + else { + goto err; + } + break; + default: + goto err; + } + + if (u.field_set & (1u << UF_QUERY)) { + /* 
In case if we have a query, we need to store it somewhere */ + query_args = rspamd_http_message_parse_query(msg); + + /* Insert the rest of query params as HTTP headers */ + g_hash_table_iter_init(&it, query_args); + + while (g_hash_table_iter_next(&it, &k, &v)) { + gchar *key_cpy; + key = k; + value = v; + + key_cpy = rspamd_mempool_ftokdup(task->task_pool, key); + + rspamd_http_message_add_header_len(msg, key_cpy, + value->begin, value->len); + msg_debug_protocol("added header \"%T\" -> \"%T\" from HTTP query", + key, value); + } + + g_hash_table_unref(query_args); + } + + return TRUE; + +err: + g_set_error(&task->err, rspamd_protocol_quark(), 400, "invalid command"); + + return FALSE; +} + +static void +rspamd_protocol_process_recipients(struct rspamd_task *task, + const rspamd_ftok_t *hdr) +{ + enum { + skip_spaces, + quoted_string, + normal_string, + } state = skip_spaces; + const gchar *p, *end, *start_addr; + struct rspamd_email_address *addr; + + p = hdr->begin; + end = hdr->begin + hdr->len; + start_addr = NULL; + + while (p < end) { + switch (state) { + case skip_spaces: + if (g_ascii_isspace(*p)) { + p++; + } + else if (*p == '"') { + start_addr = p; + p++; + state = quoted_string; + } + else { + state = normal_string; + start_addr = p; + } + break; + case quoted_string: + if (*p == '"') { + state = normal_string; + p++; + } + else if (*p == '\\') { + /* Quoted pair */ + p += 2; + } + else { + p++; + } + break; + case normal_string: + if (*p == '"') { + state = quoted_string; + p++; + } + else if (*p == ',' && start_addr != NULL && p > start_addr) { + /* We have finished address, check what we have */ + addr = rspamd_email_address_from_smtp(start_addr, + p - start_addr); + + if (addr) { + if (task->rcpt_envelope == NULL) { + task->rcpt_envelope = g_ptr_array_sized_new( + 2); + } + + g_ptr_array_add(task->rcpt_envelope, addr); + } + else { + msg_err_protocol("bad rcpt address: '%*s'", + (int) (p - start_addr), start_addr); + task->flags |= 
RSPAMD_TASK_FLAG_BROKEN_HEADERS; + } + start_addr = NULL; + p++; + state = skip_spaces; + } + else { + p++; + } + break; + } + } + + /* Check remainder */ + if (start_addr && p > start_addr) { + switch (state) { + case normal_string: + addr = rspamd_email_address_from_smtp(start_addr, end - start_addr); + + if (addr) { + if (task->rcpt_envelope == NULL) { + task->rcpt_envelope = g_ptr_array_sized_new( + 2); + } + + g_ptr_array_add(task->rcpt_envelope, addr); + } + else { + msg_err_protocol("bad rcpt address: '%*s'", + (int) (end - start_addr), start_addr); + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + } + break; + case skip_spaces: + /* Do nothing */ + break; + case quoted_string: + default: + msg_err_protocol("bad state when parsing rcpt address: '%*s'", + (int) (end - start_addr), start_addr); + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + } + } +} + +#define COMPARE_FLAG_LIT(lit) (len == sizeof(lit) - 1 && memcmp((lit), str, len) == 0) +#define CHECK_PROTOCOL_FLAG(lit, fl) \ + do { \ + if (!known && COMPARE_FLAG_LIT(lit)) { \ + task->protocol_flags |= (fl); \ + known = TRUE; \ + msg_debug_protocol("add protocol flag %s", lit); \ + } \ + } while (0) +#define CHECK_TASK_FLAG(lit, fl) \ + do { \ + if (!known && COMPARE_FLAG_LIT(lit)) { \ + task->flags |= (fl); \ + known = TRUE; \ + msg_debug_protocol("add task flag %s", lit); \ + } \ + } while (0) + +static void +rspamd_protocol_handle_flag(struct rspamd_task *task, const gchar *str, + gsize len) +{ + gboolean known = FALSE; + + CHECK_TASK_FLAG("pass_all", RSPAMD_TASK_FLAG_PASS_ALL); + CHECK_TASK_FLAG("no_log", RSPAMD_TASK_FLAG_NO_LOG); + CHECK_TASK_FLAG("skip", RSPAMD_TASK_FLAG_SKIP); + CHECK_TASK_FLAG("skip_process", RSPAMD_TASK_FLAG_SKIP_PROCESS); + CHECK_TASK_FLAG("no_stat", RSPAMD_TASK_FLAG_NO_STAT); + CHECK_TASK_FLAG("ssl", RSPAMD_TASK_FLAG_SSL); + CHECK_TASK_FLAG("profile", RSPAMD_TASK_FLAG_PROFILE); + + CHECK_PROTOCOL_FLAG("milter", RSPAMD_TASK_PROTOCOL_FLAG_MILTER); + CHECK_PROTOCOL_FLAG("zstd", 
RSPAMD_TASK_PROTOCOL_FLAG_COMPRESSED); + CHECK_PROTOCOL_FLAG("ext_urls", RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS); + CHECK_PROTOCOL_FLAG("body_block", RSPAMD_TASK_PROTOCOL_FLAG_BODY_BLOCK); + CHECK_PROTOCOL_FLAG("groups", RSPAMD_TASK_PROTOCOL_FLAG_GROUPS); + + if (!known) { + msg_warn_protocol("unknown flag: %*s", (gint) len, str); + } +} + +#undef COMPARE_FLAG +#undef CHECK_PROTOCOL_FLAG + +static void +rspamd_protocol_process_flags(struct rspamd_task *task, const rspamd_ftok_t *hdr) +{ + enum { + skip_spaces, + read_flag, + } state = skip_spaces; + const gchar *p, *end, *start; + + p = hdr->begin; + end = hdr->begin + hdr->len; + start = NULL; + + while (p < end) { + switch (state) { + case skip_spaces: + if (g_ascii_isspace(*p)) { + p++; + } + else { + state = read_flag; + start = p; + } + break; + case read_flag: + if (*p == ',') { + if (p > start) { + rspamd_protocol_handle_flag(task, start, p - start); + } + start = NULL; + state = skip_spaces; + p++; + } + else { + p++; + } + break; + } + } + + /* Check remainder */ + if (start && end > start && state == read_flag) { + rspamd_protocol_handle_flag(task, start, end - start); + } +} + +#define IF_HEADER(name) \ + srch.begin = (name); \ + srch.len = sizeof(name) - 1; \ + if (rspamd_ftok_casecmp(hn_tok, &srch) == 0) + +gboolean +rspamd_protocol_handle_headers(struct rspamd_task *task, + struct rspamd_http_message *msg) +{ + rspamd_ftok_t *hn_tok, *hv_tok, srch; + gboolean has_ip = FALSE, seen_settings_header = FALSE; + struct rspamd_http_header *header, *h; + gchar *ntok; + + kh_foreach_value (msg->headers, header, { + DL_FOREACH (header, h) { + ntok = rspamd_mempool_ftokdup (task->task_pool, &h->name); + hn_tok = rspamd_mempool_alloc (task->task_pool, sizeof (*hn_tok)); + hn_tok->begin = ntok; + hn_tok->len = h->name.len; + + + ntok = rspamd_mempool_ftokdup (task->task_pool, &h->value); + hv_tok = rspamd_mempool_alloc (task->task_pool, sizeof (*hv_tok)); + hv_tok->begin = ntok; + hv_tok->len = h->value.len; + + 
switch (*hn_tok->begin) { + case 'd': + case 'D': + IF_HEADER(DELIVER_TO_HEADER) + { + task->deliver_to = rspamd_protocol_escape_braces(task, hv_tok); + msg_debug_protocol("read deliver-to header, value: %s", + task->deliver_to); + } + else + { + msg_debug_protocol("wrong header: %T", hn_tok); + } + break; + case 'h': + case 'H': + IF_HEADER(HELO_HEADER) + { + task->helo = rspamd_mempool_ftokdup(task->task_pool, hv_tok); + msg_debug_protocol("read helo header, value: %s", task->helo); + } + IF_HEADER(HOSTNAME_HEADER) + { + task->hostname = rspamd_mempool_ftokdup(task->task_pool, + hv_tok); + msg_debug_protocol("read hostname header, value: %s", task->hostname); + } + break; + case 'f': + case 'F': + IF_HEADER(FROM_HEADER) + { + if (hv_tok->len == 0) { + /* Replace '' with '<>' to fix parsing issue */ + RSPAMD_FTOK_ASSIGN(hv_tok, "<>"); + } + task->from_envelope = rspamd_email_address_from_smtp( + hv_tok->begin, + hv_tok->len); + msg_debug_protocol("read from header, value: %T", hv_tok); + + if (!task->from_envelope) { + msg_err_protocol("bad from header: '%T'", hv_tok); + task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS; + } + } + IF_HEADER(FILENAME_HEADER) + { + task->msg.fpath = rspamd_mempool_ftokdup(task->task_pool, + hv_tok); + msg_debug_protocol("read filename header, value: %s", task->msg.fpath); + } + IF_HEADER(FLAGS_HEADER) + { + msg_debug_protocol("read flags header, value: %T", hv_tok); + rspamd_protocol_process_flags(task, hv_tok); + } + break; + case 'q': + case 'Q': + IF_HEADER(QUEUE_ID_HEADER) + { + task->queue_id = rspamd_mempool_ftokdup(task->task_pool, + hv_tok); + msg_debug_protocol("read queue_id header, value: %s", task->queue_id); + } + else + { + msg_debug_protocol("wrong header: %T", hn_tok); + } + break; + case 'r': + case 'R': + IF_HEADER(RCPT_HEADER) + { + rspamd_protocol_process_recipients(task, hv_tok); + msg_debug_protocol("read rcpt header, value: %T", hv_tok); + } + IF_HEADER(RAW_DATA_HEADER) + { + srch.begin = "yes"; + srch.len = 3; + 
+ msg_debug_protocol("read raw data header, value: %T", hv_tok); + + if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { + task->flags &= ~RSPAMD_TASK_FLAG_MIME; + msg_debug_protocol("disable mime parsing"); + } + } + break; + case 'i': + case 'I': + IF_HEADER(IP_ADDR_HEADER) + { + if (!rspamd_parse_inet_address(&task->from_addr, + hv_tok->begin, hv_tok->len, + RSPAMD_INET_ADDRESS_PARSE_DEFAULT)) { + msg_err_protocol("bad ip header: '%T'", hv_tok); + } + else { + msg_debug_protocol("read IP header, value: %T", hv_tok); + has_ip = TRUE; + } + } + else + { + msg_debug_protocol("wrong header: %T", hn_tok); + } + break; + case 'p': + case 'P': + IF_HEADER(PASS_HEADER) + { + srch.begin = "all"; + srch.len = 3; + + msg_debug_protocol("read pass header, value: %T", hv_tok); + + if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { + task->flags |= RSPAMD_TASK_FLAG_PASS_ALL; + msg_debug_protocol("pass all filters"); + } + } + IF_HEADER(PROFILE_HEADER) + { + msg_debug_protocol("read profile header, value: %T", hv_tok); + task->flags |= RSPAMD_TASK_FLAG_PROFILE; + } + break; + case 's': + case 'S': + IF_HEADER(SETTINGS_ID_HEADER) + { + msg_debug_protocol("read settings-id header, value: %T", hv_tok); + task->settings_elt = rspamd_config_find_settings_name_ref( + task->cfg, hv_tok->begin, hv_tok->len); + + if (task->settings_elt == NULL) { + GString *known_ids = g_string_new(NULL); + struct rspamd_config_settings_elt *cur; + + DL_FOREACH(task->cfg->setting_ids, cur) + { + rspamd_printf_gstring(known_ids, "%s(%ud);", + cur->name, cur->id); + } + + msg_warn_protocol("unknown settings id: %T(%d); known_ids: %v", + hv_tok, + rspamd_config_name_to_id(hv_tok->begin, hv_tok->len), + known_ids); + + g_string_free(known_ids, TRUE); + } + else { + msg_debug_protocol("applied settings id %T -> %ud", hv_tok, + task->settings_elt->id); + } + } + IF_HEADER(SETTINGS_HEADER) + { + msg_debug_protocol("read settings header, value: %T", hv_tok); + seen_settings_header = TRUE; + } + break; + case 'u': + 
case 'U': + IF_HEADER(USER_HEADER) + { + /* + * We must ignore User header in case of spamc, as SA has + * different meaning of this header + */ + msg_debug_protocol("read user header, value: %T", hv_tok); + if (!RSPAMD_TASK_IS_SPAMC(task)) { + task->auth_user = rspamd_mempool_ftokdup(task->task_pool, + hv_tok); + } + else { + msg_info_protocol("ignore user header: legacy SA protocol"); + } + } + IF_HEADER(URLS_HEADER) + { + msg_debug_protocol("read urls header, value: %T", hv_tok); + + srch.begin = "extended"; + srch.len = 8; + + if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS; + msg_debug_protocol("extended urls information"); + } + + /* TODO: add more formats there */ + } + IF_HEADER(USER_AGENT_HEADER) + { + msg_debug_protocol("read user-agent header, value: %T", hv_tok); + + if (hv_tok->len == 6 && + rspamd_lc_cmp(hv_tok->begin, "rspamc", 6) == 0) { + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_LOCAL_CLIENT; + } + } + break; + case 'l': + case 'L': + IF_HEADER(NO_LOG_HEADER) + { + msg_debug_protocol("read log header, value: %T", hv_tok); + srch.begin = "no"; + srch.len = 2; + + if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { + task->flags |= RSPAMD_TASK_FLAG_NO_LOG; + } + } + break; + case 'm': + case 'M': + IF_HEADER(MLEN_HEADER) + { + msg_debug_protocol("read message length header, value: %T", + hv_tok); + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_HAS_CONTROL; + } + IF_HEADER(MTA_TAG_HEADER) + { + gchar *mta_tag; + mta_tag = rspamd_mempool_ftokdup(task->task_pool, hv_tok); + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_MTA_TAG, + mta_tag, NULL); + msg_debug_protocol("read MTA-Tag header, value: %s", mta_tag); + } + IF_HEADER(MTA_NAME_HEADER) + { + gchar *mta_name; + mta_name = rspamd_mempool_ftokdup(task->task_pool, hv_tok); + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_MTA_NAME, + mta_name, NULL); + msg_debug_protocol("read MTA-Name header, value: %s", 
mta_name); + } + IF_HEADER(MILTER_HEADER) + { + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_MILTER; + msg_debug_protocol("read Milter header, value: %T", hv_tok); + } + break; + case 't': + case 'T': + IF_HEADER(TLS_CIPHER_HEADER) + { + task->flags |= RSPAMD_TASK_FLAG_SSL; + msg_debug_protocol("read TLS cipher header, value: %T", hv_tok); + } + break; + default: + msg_debug_protocol("generic header: %T", hn_tok); + break; + } + + rspamd_task_add_request_header (task, hn_tok, hv_tok); +} +}); /* End of kh_foreach_value */ + +if (seen_settings_header && task->settings_elt) { + msg_warn_task("ignore settings id %s as settings header is also presented", + task->settings_elt->name); + REF_RELEASE(task->settings_elt); + + task->settings_elt = NULL; +} + +if (!has_ip) { + task->flags |= RSPAMD_TASK_FLAG_NO_IP; +} + +return TRUE; +} + +#define BOOL_TO_FLAG(val, flags, flag) \ + do { \ + if ((val)) (flags) |= (flag); \ + else \ + (flags) &= ~(flag); \ + } while (0) + +gboolean +rspamd_protocol_parse_task_flags(rspamd_mempool_t *pool, + const ucl_object_t *obj, + gpointer ud, + struct rspamd_rcl_section *section, + GError **err) +{ + struct rspamd_rcl_struct_parser *pd = ud; + gint *target; + const gchar *key; + gboolean value; + + target = (gint *) (((gchar *) pd->user_struct) + pd->offset); + key = ucl_object_key(obj); + value = ucl_object_toboolean(obj); + + if (key != NULL) { + if (g_ascii_strcasecmp(key, "pass_all") == 0) { + BOOL_TO_FLAG(value, *target, RSPAMD_TASK_FLAG_PASS_ALL); + } + else if (g_ascii_strcasecmp(key, "no_log") == 0) { + BOOL_TO_FLAG(value, *target, RSPAMD_TASK_FLAG_NO_LOG); + } + } + + return TRUE; +} + +static struct rspamd_rcl_sections_map *control_parser = NULL; + +RSPAMD_CONSTRUCTOR(rspamd_protocol_control_parser_ctor) +{ + + struct rspamd_rcl_section *sub = rspamd_rcl_add_section(&control_parser, NULL, + "*", + NULL, + NULL, + UCL_OBJECT, + FALSE, + TRUE); + /* Default handlers */ + rspamd_rcl_add_default_handler(sub, + "ip", + 
rspamd_rcl_parse_struct_addr, + G_STRUCT_OFFSET(struct rspamd_task, from_addr), + 0, + NULL); + rspamd_rcl_add_default_handler(sub, + "from", + rspamd_rcl_parse_struct_mime_addr, + G_STRUCT_OFFSET(struct rspamd_task, from_envelope), + 0, + NULL); + rspamd_rcl_add_default_handler(sub, + "rcpt", + rspamd_rcl_parse_struct_mime_addr, + G_STRUCT_OFFSET(struct rspamd_task, rcpt_envelope), + 0, + NULL); + rspamd_rcl_add_default_handler(sub, + "helo", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_task, helo), + 0, + NULL); + rspamd_rcl_add_default_handler(sub, + "user", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET(struct rspamd_task, auth_user), + 0, + NULL); + rspamd_rcl_add_default_handler(sub, + "pass_all", + rspamd_protocol_parse_task_flags, + G_STRUCT_OFFSET(struct rspamd_task, flags), + 0, + NULL); + rspamd_rcl_add_default_handler(sub, + "json", + rspamd_protocol_parse_task_flags, + G_STRUCT_OFFSET(struct rspamd_task, flags), + 0, + NULL); +} + +RSPAMD_DESTRUCTOR(rspamd_protocol_control_parser_dtor) +{ + rspamd_rcl_sections_free(control_parser); +} + +gboolean +rspamd_protocol_handle_control(struct rspamd_task *task, + const ucl_object_t *control) +{ + GError *err = NULL; + + if (!rspamd_rcl_parse(control_parser, task->cfg, task, task->task_pool, + control, &err)) { + msg_warn_protocol("cannot parse control block: %e", err); + g_error_free(err); + + return FALSE; + } + + return TRUE; +} + +gboolean +rspamd_protocol_handle_request(struct rspamd_task *task, + struct rspamd_http_message *msg) +{ + gboolean ret = TRUE; + + if (msg->method == HTTP_SYMBOLS) { + msg_debug_protocol("got legacy SYMBOLS method, enable rspamc protocol workaround"); + task->cmd = CMD_CHECK_RSPAMC; + } + else if (msg->method == HTTP_CHECK) { + msg_debug_protocol("got legacy CHECK method, enable rspamc protocol workaround"); + task->cmd = CMD_CHECK_RSPAMC; + } + else { + ret = rspamd_protocol_handle_url(task, msg); + } + + if (msg->flags & RSPAMD_HTTP_FLAG_SPAMC) { + 
msg_debug_protocol("got legacy SA input, enable spamc protocol workaround"); + task->cmd = CMD_CHECK_SPAMC; + } + + return ret; +} + +/* Structure for writing tree data */ +struct tree_cb_data { + ucl_object_t *top; + khash_t(rspamd_url_host_hash) * seen; + struct rspamd_task *task; +}; + +static ucl_object_t * +rspamd_protocol_extended_url(struct rspamd_task *task, + struct rspamd_url *url, + const gchar *encoded, gsize enclen) +{ + ucl_object_t *obj, *elt; + + obj = ucl_object_typed_new(UCL_OBJECT); + + elt = ucl_object_fromstring_common(encoded, enclen, 0); + ucl_object_insert_key(obj, elt, "url", 0, false); + + if (url->tldlen > 0) { + elt = ucl_object_fromstring_common(rspamd_url_tld_unsafe(url), + url->tldlen, 0); + ucl_object_insert_key(obj, elt, "tld", 0, false); + } + if (url->hostlen > 0) { + elt = ucl_object_fromstring_common(rspamd_url_host_unsafe(url), + url->hostlen, 0); + ucl_object_insert_key(obj, elt, "host", 0, false); + } + + ucl_object_t *flags = ucl_object_typed_new(UCL_ARRAY); + + for (unsigned int i = 0; i < RSPAMD_URL_MAX_FLAG_SHIFT; i++) { + if (url->flags & (1u << i)) { + ucl_object_t *fl = ucl_object_fromstring(rspamd_url_flag_to_string(1u << i)); + ucl_array_append(flags, fl); + } + } + + ucl_object_insert_key(obj, flags, "flags", 0, false); + + if (url->ext && url->ext->linked_url) { + encoded = rspamd_url_encode(url->ext->linked_url, &enclen, task->task_pool); + elt = rspamd_protocol_extended_url(task, url->ext->linked_url, encoded, + enclen); + ucl_object_insert_key(obj, elt, "linked_url", 0, false); + } + + return obj; +} + +/* + * Callback for writing urls + */ +static void +urls_protocol_cb(struct rspamd_url *url, struct tree_cb_data *cb) +{ + ucl_object_t *obj; + struct rspamd_task *task = cb->task; + const gchar *user_field = "unknown", *encoded = NULL; + gboolean has_user = FALSE; + guint len = 0; + gsize enclen = 0; + + if (!(task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS)) { + if (url->hostlen > 0) { + if 
(rspamd_url_host_set_has(cb->seen, url)) { + return; + } + + goffset err_offset; + + if ((err_offset = rspamd_fast_utf8_validate(rspamd_url_host_unsafe(url), + url->hostlen)) == 0) { + obj = ucl_object_fromstring_common(rspamd_url_host_unsafe(url), + url->hostlen, 0); + } + else { + obj = ucl_object_fromstring_common(rspamd_url_host_unsafe(url), + err_offset - 1, 0); + } + } + else { + return; + } + + rspamd_url_host_set_add(cb->seen, url); + } + else { + encoded = rspamd_url_encode(url, &enclen, task->task_pool); + obj = rspamd_protocol_extended_url(task, url, encoded, enclen); + } + + ucl_array_append(cb->top, obj); + + if (cb->task->cfg->log_urls) { + if (task->auth_user) { + user_field = task->auth_user; + len = strlen(task->auth_user); + has_user = TRUE; + } + else if (task->from_envelope) { + user_field = task->from_envelope->addr; + len = task->from_envelope->addr_len; + } + + if (!encoded) { + encoded = rspamd_url_encode(url, &enclen, task->task_pool); + } + + msg_notice_task_encrypted("<%s> %s: %*s; ip: %s; URL: %*s", + MESSAGE_FIELD_CHECK(task, message_id), + has_user ? 
"user" : "from", + len, user_field, + rspamd_inet_address_to_string(task->from_addr), + (gint) enclen, encoded); + } +} + +static ucl_object_t * +rspamd_urls_tree_ucl(khash_t(rspamd_url_hash) * set, + struct rspamd_task *task) +{ + struct tree_cb_data cb; + ucl_object_t *obj; + struct rspamd_url *u; + + obj = ucl_object_typed_new(UCL_ARRAY); + cb.top = obj; + cb.task = task; + cb.seen = kh_init(rspamd_url_host_hash); + + kh_foreach_key(set, u, { + if (!(u->protocol & PROTOCOL_MAILTO)) { + urls_protocol_cb(u, &cb); + } + }); + + kh_destroy(rspamd_url_host_hash, cb.seen); + + return obj; +} + +static void +emails_protocol_cb(struct rspamd_url *url, struct tree_cb_data *cb) +{ + ucl_object_t *obj; + + if (url->userlen > 0 && url->hostlen > 0) { + obj = ucl_object_fromlstring(rspamd_url_user_unsafe(url), + url->userlen + url->hostlen + 1); + ucl_array_append(cb->top, obj); + } +} + +static ucl_object_t * +rspamd_emails_tree_ucl(khash_t(rspamd_url_hash) * set, + struct rspamd_task *task) +{ + struct tree_cb_data cb; + ucl_object_t *obj; + struct rspamd_url *u; + + obj = ucl_object_typed_new(UCL_ARRAY); + cb.top = obj; + cb.task = task; + + kh_foreach_key(set, u, { + if ((u->protocol & PROTOCOL_MAILTO)) { + emails_protocol_cb(u, &cb); + } + }); + + + return obj; +} + + +/* Write new subject */ +static const gchar * +rspamd_protocol_rewrite_subject(struct rspamd_task *task) +{ + GString *subj_buf; + gchar *res; + const gchar *s, *c, *p; + gsize slen = 0; + + c = rspamd_mempool_get_variable(task->task_pool, "metric_subject"); + + if (c == NULL) { + c = task->cfg->subject; + } + + if (c == NULL) { + c = SPAM_SUBJECT; + } + + p = c; + s = MESSAGE_FIELD_CHECK(task, subject); + + if (s) { + slen = strlen(s); + } + + subj_buf = g_string_sized_new(strlen(c) + slen); + + while (*p) { + if (*p == '%') { + switch (p[1]) { + case 's': + g_string_append_len(subj_buf, c, p - c); + + if (s) { + g_string_append_len(subj_buf, s, slen); + } + c = p + 2; + p += 2; + break; + case 'd': + 
g_string_append_len(subj_buf, c, p - c); + rspamd_printf_gstring(subj_buf, "%.2f", task->result->score); + c = p + 2; + p += 2; + break; + case '%': + g_string_append_len(subj_buf, c, p - c); + g_string_append_c(subj_buf, '%'); + c = p + 2; + p += 2; + break; + default: + p++; /* Just % something unknown */ + break; + } + } + else { + p++; + } + } + + if (p > c) { + g_string_append_len(subj_buf, c, p - c); + } + + res = rspamd_mime_header_encode(subj_buf->str, subj_buf->len); + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_free, + res); + g_string_free(subj_buf, TRUE); + + return res; +} + +static ucl_object_t * +rspamd_metric_symbol_ucl(struct rspamd_task *task, struct rspamd_symbol_result *sym) +{ + ucl_object_t *obj = NULL, *ar; + const gchar *description = NULL; + struct rspamd_symbol_option *opt; + + if (sym->sym != NULL) { + description = sym->sym->description; + } + + obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(obj, ucl_object_fromstring(sym->name), "name", 0, false); + ucl_object_insert_key(obj, ucl_object_fromdouble(sym->score), "score", 0, false); + + if (task->cmd == CMD_CHECK_V2) { + if (sym->sym) { + ucl_object_insert_key(obj, ucl_object_fromdouble(sym->sym->score), "metric_score", 0, false); + } + else { + ucl_object_insert_key(obj, ucl_object_fromdouble(0.0), + "metric_score", 0, false); + } + } + + if (description) { + ucl_object_insert_key(obj, ucl_object_fromstring(description), + "description", 0, false); + } + + if (sym->options != NULL) { + ar = ucl_object_typed_new(UCL_ARRAY); + + DL_FOREACH(sym->opts_head, opt) + { + ucl_array_append(ar, ucl_object_fromstring_common(opt->option, + opt->optlen, 0)); + } + + ucl_object_insert_key(obj, ar, "options", 0, false); + } + + return obj; +} + +static ucl_object_t * +rspamd_metric_group_ucl(struct rspamd_task *task, + struct rspamd_symbols_group *gr, gdouble score) +{ + ucl_object_t *obj = NULL; + + obj = ucl_object_typed_new(UCL_OBJECT); + 
ucl_object_insert_key(obj, ucl_object_fromdouble(score), + "score", 0, false); + + if (gr->description) { + ucl_object_insert_key(obj, ucl_object_fromstring(gr->description), + "description", 0, false); + } + + return obj; +} + +static ucl_object_t * +rspamd_scan_result_ucl(struct rspamd_task *task, + struct rspamd_scan_result *mres, ucl_object_t *top) +{ + struct rspamd_symbol_result *sym; + gboolean is_spam; + struct rspamd_action *action; + ucl_object_t *obj = NULL, *sobj; + const gchar *subject; + struct rspamd_passthrough_result *pr = NULL; + + action = rspamd_check_action_metric(task, &pr, NULL); + is_spam = !(action->flags & RSPAMD_ACTION_HAM); + + if (task->cmd == CMD_CHECK) { + obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(obj, + ucl_object_frombool(is_spam), + "is_spam", 0, false); + } + else { + obj = top; + } + + if (pr) { + if (pr->message && !(pr->flags & RSPAMD_PASSTHROUGH_NO_SMTP_MESSAGE)) { + /* Add smtp message if it does not exist: see #3269 for details */ + if (ucl_object_lookup(task->messages, "smtp_message") == NULL) { + ucl_object_insert_key(task->messages, + ucl_object_fromstring_common(pr->message, 0, UCL_STRING_RAW), + "smtp_message", 0, + false); + } + } + + ucl_object_insert_key(obj, + ucl_object_fromstring(pr->module), + "passthrough_module", 0, false); + } + + ucl_object_insert_key(obj, + ucl_object_frombool(RSPAMD_TASK_IS_SKIPPED(task)), + "is_skipped", 0, false); + + if (!isnan(mres->score)) { + ucl_object_insert_key(obj, ucl_object_fromdouble(mres->score), + "score", 0, false); + } + else { + ucl_object_insert_key(obj, + ucl_object_fromdouble(0.0), "score", 0, false); + } + + ucl_object_insert_key(obj, + ucl_object_fromdouble(rspamd_task_get_required_score(task, mres)), + "required_score", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromstring(action->name), + "action", 0, false); + + if (action->action_type == METRIC_ACTION_REWRITE_SUBJECT) { + subject = rspamd_protocol_rewrite_subject(task); + + if 
(subject) { + ucl_object_insert_key(obj, ucl_object_fromstring(subject), + "subject", 0, false); + } + } + if (action->flags & RSPAMD_ACTION_MILTER) { + /* Treat milter action specially */ + if (action->action_type == METRIC_ACTION_DISCARD) { + ucl_object_insert_key(obj, ucl_object_fromstring("discard"), + "reject", 0, false); + } + else if (action->action_type == METRIC_ACTION_QUARANTINE) { + ucl_object_insert_key(obj, ucl_object_fromstring("quarantine"), + "reject", 0, false); + } + } + + /* Now handle symbols */ + if (task->cmd != CMD_CHECK) { + /* Insert actions thresholds */ + ucl_object_t *actions_obj = ucl_object_typed_new(UCL_OBJECT); + + for (int i = task->result->nactions - 1; i >= 0; i--) { + struct rspamd_action_config *action_lim = &task->result->actions_config[i]; + + if (!isnan(action_lim->cur_limit) && + !(action_lim->action->flags & (RSPAMD_ACTION_NO_THRESHOLD | RSPAMD_ACTION_HAM))) { + ucl_object_insert_key(actions_obj, ucl_object_fromdouble(action_lim->cur_limit), + action_lim->action->name, 0, true); + } + } + + ucl_object_insert_key(obj, actions_obj, "thresholds", 0, false); + + /* For checkv2 we insert symbols as a separate object */ + obj = ucl_object_typed_new(UCL_OBJECT); + } + + kh_foreach_value(mres->symbols, sym, { + if (!(sym->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) { + sobj = rspamd_metric_symbol_ucl(task, sym); + ucl_object_insert_key(obj, sobj, sym->name, 0, false); + } + }) + + if (task->cmd != CMD_CHECK) + { + /* For checkv2 we insert symbols as a separate object */ + ucl_object_insert_key(top, obj, "symbols", 0, false); + } + else + { + /* For legacy check we just insert it as "default" all together */ + ucl_object_insert_key(top, obj, DEFAULT_METRIC, 0, false); + } + + /* Handle groups if needed */ + if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_GROUPS) { + struct rspamd_symbols_group *gr; + gdouble gr_score; + + obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_reserve(obj, kh_size(mres->sym_groups)); + + 
kh_foreach(mres->sym_groups, gr, gr_score, { + if (task->cfg->public_groups_only && + !(gr->flags & RSPAMD_SYMBOL_GROUP_PUBLIC)) { + continue; + } + sobj = rspamd_metric_group_ucl(task, gr, gr_score); + ucl_object_insert_key(obj, sobj, gr->name, 0, false); + }); + + ucl_object_insert_key(top, obj, "groups", 0, false); + } + + return obj; +} + +void rspamd_ucl_torspamc_output(const ucl_object_t *top, + rspamd_fstring_t **out) +{ + const ucl_object_t *symbols, *score, + *required_score, *is_spam, *elt, *cur; + ucl_object_iter_t iter = NULL; + + score = ucl_object_lookup(top, "score"); + required_score = ucl_object_lookup(top, "required_score"); + is_spam = ucl_object_lookup(top, "is_spam"); + rspamd_printf_fstring(out, + "Metric: default; %s; %.2f / %.2f / 0.0\r\n", + ucl_object_toboolean(is_spam) ? "True" : "False", + ucl_object_todouble(score), + ucl_object_todouble(required_score)); + elt = ucl_object_lookup(top, "action"); + if (elt != NULL) { + rspamd_printf_fstring(out, "Action: %s\r\n", + ucl_object_tostring(elt)); + } + + elt = ucl_object_lookup(top, "subject"); + if (elt != NULL) { + rspamd_printf_fstring(out, "Subject: %s\r\n", + ucl_object_tostring(elt)); + } + + symbols = ucl_object_lookup(top, "symbols"); + + if (symbols != NULL) { + iter = NULL; + while ((elt = ucl_object_iterate(symbols, &iter, true)) != NULL) { + if (elt->type == UCL_OBJECT) { + const ucl_object_t *sym_score; + sym_score = ucl_object_lookup(elt, "score"); + rspamd_printf_fstring(out, "Symbol: %s(%.2f)\r\n", + ucl_object_key(elt), + ucl_object_todouble(sym_score)); + } + } + } + + elt = ucl_object_lookup(top, "messages"); + if (elt != NULL) { + iter = NULL; + while ((cur = ucl_object_iterate(elt, &iter, true)) != NULL) { + if (cur->type == UCL_STRING) { + rspamd_printf_fstring(out, "Message: %s\r\n", + ucl_object_tostring(cur)); + } + } + } + + elt = ucl_object_lookup(top, "message-id"); + if (elt != NULL) { + rspamd_printf_fstring(out, "Message-ID: %s\r\n", + 
ucl_object_tostring(elt)); + } +} + +void rspamd_ucl_tospamc_output(const ucl_object_t *top, + rspamd_fstring_t **out) +{ + const ucl_object_t *symbols, *score, + *required_score, *is_spam, *elt; + ucl_object_iter_t iter = NULL; + rspamd_fstring_t *f; + + score = ucl_object_lookup(top, "score"); + required_score = ucl_object_lookup(top, "required_score"); + is_spam = ucl_object_lookup(top, "is_spam"); + rspamd_printf_fstring(out, + "Spam: %s ; %.2f / %.2f\r\n\r\n", + ucl_object_toboolean(is_spam) ? "True" : "False", + ucl_object_todouble(score), + ucl_object_todouble(required_score)); + + symbols = ucl_object_lookup(top, "symbols"); + + if (symbols != NULL) { + while ((elt = ucl_object_iterate(symbols, &iter, true)) != NULL) { + if (elt->type == UCL_OBJECT) { + rspamd_printf_fstring(out, "%s,", + ucl_object_key(elt)); + } + } + /* Ugly hack, but the whole spamc is ugly */ + f = *out; + if (f->str[f->len - 1] == ',') { + f->len--; + + *out = rspamd_fstring_append(*out, CRLF, 2); + } + } +} + +static void +rspamd_protocol_output_profiling(struct rspamd_task *task, + ucl_object_t *top) +{ + GHashTable *tbl; + GHashTableIter it; + gpointer k, v; + ucl_object_t *prof; + gdouble val; + + prof = ucl_object_typed_new(UCL_OBJECT); + tbl = rspamd_mempool_get_variable(task->task_pool, "profile"); + + if (tbl) { + g_hash_table_iter_init(&it, tbl); + + while (g_hash_table_iter_next(&it, &k, &v)) { + val = *(gdouble *) v; + ucl_object_insert_key(prof, ucl_object_fromdouble(val), + (const char *) k, 0, false); + } + } + + ucl_object_insert_key(top, prof, "profile", 0, false); +} + +ucl_object_t * +rspamd_protocol_write_ucl(struct rspamd_task *task, + enum rspamd_protocol_flags flags) +{ + ucl_object_t *top = NULL; + GString *dkim_sig; + GList *dkim_sigs; + const ucl_object_t *milter_reply; + + rspamd_task_set_finish_time(task); + top = ucl_object_typed_new(UCL_OBJECT); + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) ucl_object_unref, top); + + if 
(flags & RSPAMD_PROTOCOL_METRICS) { + rspamd_scan_result_ucl(task, task->result, top); + } + + if (flags & RSPAMD_PROTOCOL_MESSAGES) { + if (G_UNLIKELY(task->cfg->compat_messages)) { + const ucl_object_t *cur; + ucl_object_t *msg_object; + ucl_object_iter_t iter = NULL; + + msg_object = ucl_object_typed_new(UCL_ARRAY); + + while ((cur = ucl_object_iterate(task->messages, &iter, true)) != NULL) { + if (cur->type == UCL_STRING) { + ucl_array_append(msg_object, ucl_object_ref(cur)); + } + } + + ucl_object_insert_key(top, msg_object, "messages", 0, false); + } + else { + ucl_object_insert_key(top, ucl_object_ref(task->messages), + "messages", 0, false); + } + } + + if (flags & RSPAMD_PROTOCOL_URLS && task->message) { + if (kh_size(MESSAGE_FIELD(task, urls)) > 0) { + ucl_object_insert_key(top, + rspamd_urls_tree_ucl(MESSAGE_FIELD(task, urls), task), + "urls", 0, false); + ucl_object_insert_key(top, + rspamd_emails_tree_ucl(MESSAGE_FIELD(task, urls), task), + "emails", 0, false); + } + } + + if (flags & RSPAMD_PROTOCOL_EXTRA) { + if (G_UNLIKELY(RSPAMD_TASK_IS_PROFILING(task))) { + rspamd_protocol_output_profiling(task, top); + } + } + + if (flags & RSPAMD_PROTOCOL_BASIC) { + ucl_object_insert_key(top, + ucl_object_fromstring(MESSAGE_FIELD_CHECK(task, message_id)), + "message-id", 0, false); + ucl_object_insert_key(top, + ucl_object_fromdouble(task->time_real_finish - task->task_timestamp), + "time_real", 0, false); + } + + if (flags & RSPAMD_PROTOCOL_DKIM) { + dkim_sigs = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_DKIM_SIGNATURE); + + if (dkim_sigs) { + if (dkim_sigs->next) { + /* Multiple DKIM signatures */ + ucl_object_t *ar = ucl_object_typed_new(UCL_ARRAY); + + for (; dkim_sigs != NULL; dkim_sigs = dkim_sigs->next) { + GString *folded_header; + dkim_sig = (GString *) dkim_sigs->data; + + if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_MILTER || + !task->message) { + + folded_header = rspamd_header_value_fold( + "DKIM-Signature", 
strlen("DKIM-Signature"), + dkim_sig->str, dkim_sig->len, + 80, RSPAMD_TASK_NEWLINES_LF, NULL); + } + else { + folded_header = rspamd_header_value_fold( + "DKIM-Signature", strlen("DKIM-Signature"), + dkim_sig->str, dkim_sig->len, + 80, + MESSAGE_FIELD(task, nlines_type), + NULL); + } + + ucl_array_append(ar, + ucl_object_fromstring_common(folded_header->str, + folded_header->len, UCL_STRING_RAW)); + g_string_free(folded_header, TRUE); + } + + ucl_object_insert_key(top, + ar, + "dkim-signature", 0, + false); + } + else { + /* Single DKIM signature */ + GString *folded_header; + dkim_sig = (GString *) dkim_sigs->data; + + if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_MILTER) { + folded_header = rspamd_header_value_fold( + "DKIM-Signature", strlen("DKIM-Signature"), + dkim_sig->str, dkim_sig->len, + 80, RSPAMD_TASK_NEWLINES_LF, NULL); + } + else { + folded_header = rspamd_header_value_fold( + "DKIM-Signature", strlen("DKIM-Signature"), + dkim_sig->str, dkim_sig->len, + 80, MESSAGE_FIELD(task, nlines_type), + NULL); + } + + ucl_object_insert_key(top, + ucl_object_fromstring_common(folded_header->str, + folded_header->len, UCL_STRING_RAW), + "dkim-signature", 0, false); + g_string_free(folded_header, TRUE); + } + } + } + + if (flags & RSPAMD_PROTOCOL_RMILTER) { + milter_reply = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_MILTER_REPLY); + + if (milter_reply) { + if (task->cmd != CMD_CHECK) { + ucl_object_insert_key(top, ucl_object_ref(milter_reply), + "milter", 0, false); + } + else { + ucl_object_insert_key(top, ucl_object_ref(milter_reply), + "rmilter", 0, false); + } + } + } + + return top; +}

/**
 * Write task results to an HTTP message.
 *
 * Builds the reply object (default protocol parts plus URLs), updates the
 * rolling history and the task log, serialises the reply as compact JSON or
 * as one of the legacy text formats, optionally appends the message body
 * block, optionally compresses the result with zstd, and finally updates the
 * server statistics unless the task carries the no-stat flag.
 *
 * @param msg HTTP message that receives the body and the extra headers
 * @param task task whose results are written
 * @param pobj if not NULL, receives the generated reply object
 */
void rspamd_protocol_http_reply(struct rspamd_http_message *msg,
								struct rspamd_task *task, ucl_object_t **pobj)
{
	struct rspamd_scan_result *metric_res;
	const struct rspamd_re_cache_stat *restat;

	ucl_object_t *top = NULL;
	rspamd_fstring_t *reply;
	gint flags = RSPAMD_PROTOCOL_DEFAULT;
	struct rspamd_action *action;

	flags |= RSPAMD_PROTOCOL_URLS;

	top = rspamd_protocol_write_ucl(task, flags);

	if (pobj) {
		*pobj = top;
	}

	if (!(task->flags & RSPAMD_TASK_FLAG_NO_LOG)) {
		rspamd_roll_history_update(task->worker->srv->history, task);
	}
	else {
		msg_debug_protocol("skip history update due to no log flag");
	}

	rspamd_task_write_log(task);

	if (task->cfg->log_flags & RSPAMD_LOG_FLAG_RE_CACHE) {
		restat = rspamd_re_cache_get_stat(task->re_rt);
		g_assert(restat != NULL);
		msg_notice_task(
			"regexp statistics: %ud pcre regexps scanned, %ud regexps matched,"
			" %ud regexps total, %ud regexps cached,"
			" %HL scanned using pcre, %HL scanned total",
			restat->regexp_checked,
			restat->regexp_matched,
			restat->regexp_total,
			restat->regexp_fast_cached,
			restat->bytes_scanned_pcre,
			restat->bytes_scanned);
	}

	reply = rspamd_fstring_sized_new(1000);

	if (msg->method < HTTP_SYMBOLS && !RSPAMD_TASK_IS_SPAMC(task)) {
		msg_debug_protocol("writing json reply");
		rspamd_ucl_emit_fstring(top, UCL_EMIT_JSON_COMPACT, &reply);
	}
	else {
		if (RSPAMD_TASK_IS_SPAMC(task)) {
			msg_debug_protocol("writing spamc legacy reply to client");
			rspamd_ucl_tospamc_output(top, &reply);
		}
		else {
			msg_debug_protocol("writing rspamc legacy reply to client");
			rspamd_ucl_torspamc_output(top, &reply);
		}
	}

	if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_BODY_BLOCK) {
		/* Check if we need to insert a body block */
		if (task->flags & RSPAMD_TASK_FLAG_MESSAGE_REWRITE) {
			GString *hdr_offset = g_string_sized_new(30);

			/* The header tells the client where the body block starts */
			rspamd_printf_gstring(hdr_offset, "%z", RSPAMD_FSTRING_LEN(reply));
			rspamd_http_message_add_header(msg, MESSAGE_OFFSET_HEADER,
										   hdr_offset->str);
			msg_debug_protocol("write body block at position %s",
							   hdr_offset->str);
			g_string_free(hdr_offset, TRUE);

			/* In case of milter, we append just body, otherwise - full message */
			if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_MILTER) {
				const gchar *start;
				goffset len, hdr_off;

				start = task->msg.begin;
				len = task->msg.len;

				hdr_off = MESSAGE_FIELD(task, raw_headers_content).len;

				if (hdr_off < len) {
					start += hdr_off;
					len -= hdr_off;

					/* The problem here is that we need not end of headers, we need
					 * start of body.
					 *
					 * Hence, we need to skip one \r\n till there is anything else in
					 * a line.
					 *
					 * The remaining length is tested before `start` is
					 * dereferenced, so nothing past the end of the message
					 * is read once the last byte has been consumed.
					 */

					if (len > 0 && *start == '\r') {
						start++;
						len--;
					}

					if (len > 0 && *start == '\n') {
						start++;
						len--;
					}

					msg_debug_protocol("milter version of body block size %d",
									   (int) len);
					reply = rspamd_fstring_append(reply, start, len);
				}
			}
			else {
				msg_debug_protocol("general version of body block size %d",
								   (int) task->msg.len);
				reply = rspamd_fstring_append(reply,
											  task->msg.begin, task->msg.len);
			}
		}
	}

	if ((task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_COMPRESSED) &&
		rspamd_libs_reset_compression(task->cfg->libs_ctx)) {
		/* We can compress output */
		ZSTD_inBuffer zin;
		ZSTD_outBuffer zout;
		ZSTD_CStream *zstream;
		rspamd_fstring_t *compressed_reply;
		gsize r;

		zstream = task->cfg->libs_ctx->out_zstream;
		compressed_reply = rspamd_fstring_sized_new(ZSTD_compressBound(reply->len));
		zin.pos = 0;
		zin.src = reply->str;
		zin.size = reply->len;
		zout.pos = 0;
		zout.dst = compressed_reply->str;
		zout.size = compressed_reply->allocated;

		while (zin.pos < zin.size) {
			r = ZSTD_compressStream(zstream, &zout, &zin);

			if (ZSTD_isError(r)) {
				/* Fall back to the uncompressed reply */
				msg_err_protocol("cannot compress: %s", ZSTD_getErrorName(r));
				rspamd_fstring_free(compressed_reply);
				rspamd_http_message_set_body_from_fstring_steal(msg, reply);

				goto end;
			}
		}

		ZSTD_flushStream(zstream, &zout);
		r = ZSTD_endStream(zstream, &zout);

		if (ZSTD_isError(r)) {
			/* Fall back to the uncompressed reply */
			msg_err_protocol("cannot finalize compress: %s", ZSTD_getErrorName(r));
			rspamd_fstring_free(compressed_reply);
			rspamd_http_message_set_body_from_fstring_steal(msg, reply);

			goto end;
		}

		msg_info_protocol("writing compressed results: %z bytes before "
						  "%z bytes after",
						  zin.pos, zout.pos);
		compressed_reply->len = zout.pos;
		rspamd_fstring_free(reply);
		rspamd_http_message_set_body_from_fstring_steal(msg, compressed_reply);
		rspamd_http_message_add_header(msg, COMPRESSION_HEADER, "zstd");

		if (task->cfg->libs_ctx->out_dict &&
			task->cfg->libs_ctx->out_dict->id != 0) {
			gchar dict_str[32];

			rspamd_snprintf(dict_str, sizeof(dict_str), "%ud",
							task->cfg->libs_ctx->out_dict->id);
			rspamd_http_message_add_header(msg, "Dictionary", dict_str);
		}
	}
	else {
		rspamd_http_message_set_body_from_fstring_steal(msg, reply);
	}

end:
	if (!(task->flags & RSPAMD_TASK_FLAG_NO_STAT)) {
		/* Update stat for default metric */
		metric_res = task->result;

		if (metric_res != NULL) {

			action = rspamd_check_action_metric(task, NULL, NULL);
			/* TODO: handle custom actions in stats */
			if (action->action_type == METRIC_ACTION_SOFT_REJECT &&
				(task->flags & RSPAMD_TASK_FLAG_GREYLISTED)) {
				/* Set stat action to greylist to display greylisted messages */
#ifndef HAVE_ATOMIC_BUILTINS
				task->worker->srv->stat->actions_stat[METRIC_ACTION_GREYLIST]++;
#else
				__atomic_add_fetch(&task->worker->srv->stat->actions_stat[METRIC_ACTION_GREYLIST],
								   1, __ATOMIC_RELEASE);
#endif
			}
			else if (action->action_type < METRIC_ACTION_MAX) {
#ifndef HAVE_ATOMIC_BUILTINS
				task->worker->srv->stat->actions_stat[action->action_type]++;
#else
				__atomic_add_fetch(&task->worker->srv->stat->actions_stat[action->action_type],
								   1, __ATOMIC_RELEASE);
#endif
			}
		}

		/* Increase counters */
#ifndef HAVE_ATOMIC_BUILTINS
		task->worker->srv->stat->messages_scanned++;
#else
		__atomic_add_fetch(&task->worker->srv->stat->messages_scanned,
						   1, __ATOMIC_RELEASE);
#endif

		/* Set average processing time */
		guint32 slot;
		float processing_time = task->time_real_finish - task->task_timestamp;

#ifndef HAVE_ATOMIC_BUILTINS
		slot = task->worker->srv->stat->avg_time.cur_slot++;
#else
		slot = __atomic_fetch_add(&task->worker->srv->stat->avg_time.cur_slot,
								  1, __ATOMIC_RELEASE);
#endif
		slot = slot % MAX_AVG_TIME_SLOTS;
		/* TODO: this should be atomic but it is not supported in C */
		task->worker->srv->stat->avg_time.avg_time[slot] = processing_time;
	}
	else {
		msg_debug_protocol("skip stats update due to no_stat flag");
	}
}

+void rspamd_protocol_write_log_pipe(struct rspamd_task *task) +{ + struct rspamd_worker_log_pipe *lp; + struct rspamd_protocol_log_message_sum *ls; + lua_State *L = task->cfg->lua_state; + struct rspamd_scan_result *mres; + struct rspamd_symbol_result *sym; + gint id, i; + guint32 n = 0, nextra = 0; + gsize sz; + GArray *extra; + struct rspamd_protocol_log_symbol_result er; + struct rspamd_task **ptask; + + /* Get extra results from lua plugins */ + extra = g_array_new(FALSE, FALSE, sizeof(er)); + + lua_getglobal(L, "rspamd_plugins"); + if (lua_istable(L, -1)) { + lua_pushnil(L); + + while (lua_next(L, -2)) { + if (lua_istable(L, -1)) { + lua_pushvalue(L, -2); + /* stack: + * -1: copy of key + * -2: value (module table) + * -3: key (module name) + * -4: global + */ + lua_pushstring(L, "log_callback"); + lua_gettable(L, -3); + /* stack: + * -1: func + * -2: copy of key + * -3: value (module table) + * -3: key (module name) + * -4: global + */ + if (lua_isfunction(L, -1)) { + ptask = lua_newuserdata(L, sizeof(*ptask)); + *ptask = task; + rspamd_lua_setclass(L, "rspamd{task}", -1); + /* stack: + * -1: task + * -2: func + * -3: key copy + * -4: value (module table) + * -5: key (module name) + * -6: global + */ + msg_debug_protocol("calling for %s", lua_tostring(L, -3)); + if (lua_pcall(L, 1, 1, 0) != 0) { +
msg_info_protocol("call to log callback %s failed: %s", + lua_tostring(L, -2), lua_tostring(L, -1)); + lua_pop(L, 1); + /* stack: + * -1: key copy + * -2: value + * -3: key + */ + } + else { + /* stack: + * -1: result + * -2: key copy + * -3: value + * -4: key + */ + if (lua_istable(L, -1)) { + /* Another iteration */ + lua_pushnil(L); + + while (lua_next(L, -2)) { + /* stack: + * -1: value + * -2: key + * -3: result table (pcall) + * -4: key copy (parent) + * -5: value (parent) + * -6: key (parent) + */ + if (lua_istable(L, -1)) { + er.id = 0; + er.score = 0.0; + + lua_rawgeti(L, -1, 1); + if (lua_isnumber(L, -1)) { + er.id = lua_tonumber(L, -1); + } + lua_rawgeti(L, -2, 2); + if (lua_isnumber(L, -1)) { + er.score = lua_tonumber(L, -1); + } + /* stack: + * -1: value[2] + * -2: value[1] + * -3: values + * -4: key + * -5: result table (pcall) + * -6: key copy (parent) + * -7: value (parent) + * -8: key (parent) + */ + lua_pop(L, 2); /* Values */ + g_array_append_val(extra, er); + } + + lua_pop(L, 1); /* Value for lua_next */ + } + + lua_pop(L, 1); /* Table result of pcall */ + } + else { + msg_info_protocol("call to log callback %s returned " + "wrong type: %s", + lua_tostring(L, -2), + lua_typename(L, lua_type(L, -1))); + lua_pop(L, 1); /* Returned error */ + } + } + } + else { + lua_pop(L, 1); + /* stack: + * -1: key copy + * -2: value + * -3: key + */ + } + } + + lua_pop(L, 2); /* Top table + key copy */ + } + + lua_pop(L, 1); /* rspamd_plugins global */ + } + else { + lua_pop(L, 1); + } + + nextra = extra->len; + + LL_FOREACH(task->cfg->log_pipes, lp) + { + if (lp->fd != -1) { + switch (lp->type) { + case RSPAMD_LOG_PIPE_SYMBOLS: + mres = task->result; + + if (mres) { + n = kh_size(mres->symbols); + sz = sizeof(*ls) + + sizeof(struct rspamd_protocol_log_symbol_result) * + (n + nextra); + ls = g_malloc0(sz); + + /* Handle settings id */ + + if (task->settings_elt) { + ls->settings_id = task->settings_elt->id; + } + else { + ls->settings_id = 0; + } + + ls->score 
= mres->score; + ls->required_score = rspamd_task_get_required_score(task, + mres); + ls->nresults = n; + ls->nextra = nextra; + + i = 0; + + kh_foreach_value(mres->symbols, sym, { + id = rspamd_symcache_find_symbol(task->cfg->cache, + sym->name); + + if (id >= 0) { + ls->results[i].id = id; + ls->results[i].score = sym->score; + } + else { + ls->results[i].id = -1; + ls->results[i].score = 0.0; + } + + i++; + }); + + memcpy(&ls->results[n], extra->data, nextra * sizeof(er)); + } + else { + sz = sizeof(*ls); + ls = g_malloc0(sz); + ls->nresults = 0; + } + + /* We don't really care about return value here */ + if (write(lp->fd, ls, sz) == -1) { + msg_info_protocol("cannot write to log pipe: %s", + strerror(errno)); + } + + g_free(ls); + break; + default: + msg_err_protocol("unknown log format %d", lp->type); + break; + } + } + } + + g_array_free(extra, TRUE); +} + +void rspamd_protocol_write_reply(struct rspamd_task *task, ev_tstamp timeout) +{ + struct rspamd_http_message *msg; + const gchar *ctype = "application/json"; + rspamd_fstring_t *reply; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + + if (rspamd_http_connection_is_encrypted(task->http_conn)) { + msg_info_protocol("<%s> writing encrypted reply", + MESSAGE_FIELD_CHECK(task, message_id)); + } + + /* Compatibility */ + if (task->cmd == CMD_CHECK_RSPAMC) { + msg->method = HTTP_SYMBOLS; + } + else if (task->cmd == CMD_CHECK_SPAMC) { + msg->method = HTTP_SYMBOLS; + msg->flags |= RSPAMD_HTTP_FLAG_SPAMC; + } + + if (task->err != NULL) { + msg_debug_protocol("writing error reply to client"); + ucl_object_t *top = NULL; + + top = ucl_object_typed_new(UCL_OBJECT); + msg->code = 500 + task->err->code % 100; + msg->status = rspamd_fstring_new_init(task->err->message, + strlen(task->err->message)); + ucl_object_insert_key(top, ucl_object_fromstring(task->err->message), + "error", 0, false); + ucl_object_insert_key(top, + ucl_object_fromstring(g_quark_to_string(task->err->domain)), + "error_domain", 0, false); + 
reply = rspamd_fstring_sized_new(256); + rspamd_ucl_emit_fstring(top, UCL_EMIT_JSON_COMPACT, &reply); + ucl_object_unref(top); + + /* We also need to validate utf8 */ + if (rspamd_fast_utf8_validate(reply->str, reply->len) != 0) { + gsize valid_len; + gchar *validated; + + /* We copy reply several times here but it should be a rare case */ + validated = rspamd_str_make_utf_valid(reply->str, reply->len, + &valid_len, task->task_pool); + rspamd_http_message_set_body(msg, validated, valid_len); + rspamd_fstring_free(reply); + } + else { + rspamd_http_message_set_body_from_fstring_steal(msg, reply); + } + } + else { + msg->status = rspamd_fstring_new_init("OK", 2); + + switch (task->cmd) { + case CMD_CHECK: + case CMD_CHECK_RSPAMC: + case CMD_CHECK_SPAMC: + case CMD_SKIP: + case CMD_CHECK_V2: + rspamd_protocol_http_reply(msg, task, NULL); + rspamd_protocol_write_log_pipe(task); + break; + case CMD_PING: + msg_debug_protocol("writing pong to client"); + rspamd_http_message_set_body(msg, "pong" CRLF, 6); + ctype = "text/plain"; + break; + default: + msg_err_protocol("BROKEN"); + break; + } + } + + ev_now_update(task->event_loop); + msg->date = ev_time(); + + rspamd_http_connection_reset(task->http_conn); + rspamd_http_connection_write_message(task->http_conn, msg, NULL, + ctype, task, timeout); + + task->processed_stages |= RSPAMD_TASK_STAGE_REPLIED; +} diff --git a/src/libserver/protocol.h b/src/libserver/protocol.h new file mode 100644 index 0000000..0e3c187 --- /dev/null +++ b/src/libserver/protocol.h @@ -0,0 +1,130 @@
/**
 * @file protocol.h
 * Rspamd protocol definition
 */

#ifndef RSPAMD_PROTOCOL_H
#define RSPAMD_PROTOCOL_H

#include "config.h"
#include "scan_result.h"
#include "libserver/http/http_connection.h"
#include "task.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Error codes. The replacement lists are parenthesized so that the macros
 * expand to a single operand inside any larger expression.
 */
#define RSPAMD_BASE_ERROR 500
#define RSPAMD_FILTER_ERROR (RSPAMD_BASE_ERROR + 1)
#define RSPAMD_NETWORK_ERROR (RSPAMD_BASE_ERROR + 2)
#define RSPAMD_PROTOCOL_ERROR (RSPAMD_BASE_ERROR + 3)
#define RSPAMD_LENGTH_ERROR (RSPAMD_BASE_ERROR + 4)
#define RSPAMD_STATFILE_ERROR (RSPAMD_BASE_ERROR + 5)

/* One symbol entry of a log pipe record */
struct rspamd_protocol_log_symbol_result {
	guint32 id;
	float score;
};

/*
 * Log pipe record: a fixed header followed by `nresults` symbol entries and
 * then `nextra` additional entries in `results`
 */
struct rspamd_protocol_log_message_sum {
	guint32 nresults;
	guint32 nextra;
	guint32 settings_id;
	gdouble score;
	gdouble required_score;
	struct rspamd_protocol_log_symbol_result results[];
};

struct rspamd_metric;

/**
 * Process headers into HTTP message and set appropriate task fields
 * @param task
 * @param msg
 * @return
 */
gboolean rspamd_protocol_handle_headers(struct rspamd_task *task,
										struct rspamd_http_message *msg);

/**
 * Process control chunk and update task structure accordingly
 * @param task
 * @param control
 * @return
 */
gboolean rspamd_protocol_handle_control(struct rspamd_task *task,
										const ucl_object_t *control);

/**
 * Process HTTP request to the task structure
 * @param task
 * @param msg
 * @return
 */
gboolean rspamd_protocol_handle_request(struct rspamd_task *task,
										struct rspamd_http_message *msg);

/**
 * Write task results to http message
 * @param msg
 * @param task
 * @param pobj if not NULL, receives the reply object that has been written
 */
void rspamd_protocol_http_reply(struct rspamd_http_message *msg,
								struct rspamd_task *task, ucl_object_t **pobj);

/**
 * Write data to log pipes
 * @param task
 */
void rspamd_protocol_write_log_pipe(struct rspamd_task *task);

/* Parts of the reply that rspamd_protocol_write_ucl() can produce */
enum rspamd_protocol_flags {
	RSPAMD_PROTOCOL_BASIC = 1 << 0,
	RSPAMD_PROTOCOL_METRICS = 1 << 1,
	RSPAMD_PROTOCOL_MESSAGES = 1 << 2,
	RSPAMD_PROTOCOL_RMILTER = 1 << 3,
	RSPAMD_PROTOCOL_DKIM = 1 << 4,
	RSPAMD_PROTOCOL_URLS = 1 << 5,
	RSPAMD_PROTOCOL_EXTRA = 1 << 6,
};

#define RSPAMD_PROTOCOL_DEFAULT (RSPAMD_PROTOCOL_BASIC |    \
								 RSPAMD_PROTOCOL_METRICS |  \
								 RSPAMD_PROTOCOL_MESSAGES | \
								 RSPAMD_PROTOCOL_RMILTER |  \
								 RSPAMD_PROTOCOL_DKIM |     \
								 RSPAMD_PROTOCOL_EXTRA)

/**
 * Write reply to ucl object
 * @param task
 * @param flags combination of rspamd_protocol_flags selecting the parts to write
 * @return reply object
 */
ucl_object_t *rspamd_protocol_write_ucl(struct rspamd_task *task,
										enum rspamd_protocol_flags flags);

/**
 * Write reply for specified task command
 * @param task task object
 * @param timeout timeout used for writing the reply to the connection
 */
void rspamd_protocol_write_reply(struct rspamd_task *task, ev_tstamp timeout);

/**
 * Convert rspamd output to legacy rspamc protocol reply
 * @param top
 * @param out
 */
void rspamd_ucl_torspamc_output(const ucl_object_t *top,
								rspamd_fstring_t **out);

/**
 * Convert rspamd output to legacy spamc protocol reply
 * @param top
 * @param out
 */
void rspamd_ucl_tospamc_output(const ucl_object_t *top,
							   rspamd_fstring_t **out);

#ifdef __cplusplus
}
#endif

#endif
diff --git a/src/libserver/protocol_internal.h b/src/libserver/protocol_internal.h new file mode 100644 index 0000000..c604e96 --- /dev/null +++ b/src/libserver/protocol_internal.h @@ -0,0 +1,99 @@ +/*- + * Copyright 2017 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef RSPAMD_PROTOCOL_INTERNAL_H +#define RSPAMD_PROTOCOL_INTERNAL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Just check if the passed message is spam or not and reply as + * described below + */ +#define MSG_CMD_CHECK "check" + +/* + * Modern check version + */ +#define MSG_CMD_CHECK_V2 "checkv2" +#define MSG_CMD_SCAN "scan" + +/* + * Check if message is spam or not, and return score plus list + * of symbols hit + */ +#define MSG_CMD_SYMBOLS "symbols" +/* + * Check if message is spam or not, and return score plus report + */ +#define MSG_CMD_REPORT "report" +/* + * Check if message is spam or not, and return score plus report + * if the message is spam + */ +#define MSG_CMD_REPORT_IFSPAM "report_ifspam" +/* + * Ignore this message -- client opened connection then changed + */ +#define MSG_CMD_SKIP "skip" +/* + * Return a confirmation that spamd is alive + */ +#define MSG_CMD_PING "ping" +/* + * Process this message as described above and return modified message + */ +#define MSG_CMD_PROCESS "process" +/* + * Headers + */ +#define HELO_HEADER "Helo" +#define FROM_HEADER "From" +#define IP_ADDR_HEADER "IP" +#define RCPT_HEADER "Rcpt" +#define SUBJECT_HEADER "Subject" +#define SETTINGS_ID_HEADER "Settings-ID" +#define SETTINGS_HEADER "Settings" +#define QUEUE_ID_HEADER "Queue-ID" +#define USER_HEADER "User" +#define URLS_HEADER "URL-Format" +#define PASS_HEADER "Pass" +#define HOSTNAME_HEADER "Hostname" +#define DELIVER_TO_HEADER "Deliver-To" +#define NO_LOG_HEADER "Log" +#define MLEN_HEADER "Message-Length" +#define USER_AGENT_HEADER "User-Agent" +#define MTA_TAG_HEADER "MTA-Tag" +#define PROFILE_HEADER "Profile" +#define TLS_CIPHER_HEADER "TLS-Cipher" +#define TLS_VERSION_HEADER "TLS-Version" +#define MTA_NAME_HEADER "MTA-Name" +#define MILTER_HEADER "Milter" +#define FILENAME_HEADER "Filename" +#define FLAGS_HEADER "Flags" +#define CERT_ISSUER_HEADER "TLS-Cert-Issuer" +#define MAILER_HEADER "Mailer" +#define RAW_DATA_HEADER "Raw" +#define 
COMPRESSION_HEADER "Compression" +#define MESSAGE_OFFSET_HEADER "Message-Offset" + +#ifdef __cplusplus +} +#endif + +#endif//RSPAMD_PROTOCOL_INTERNAL_H diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c new file mode 100644 index 0000000..d51dba6 --- /dev/null +++ b/src/libserver/re_cache.c @@ -0,0 +1,2712 @@ +/* + * Copyright 2024 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "libmime/message.h" +#include "re_cache.h" +#include "cryptobox.h" +#include "ref.h" +#include "libserver/url.h" +#include "libserver/task.h" +#include "libserver/cfg_file.h" +#include "libutil/util.h" +#include "libutil/regexp.h" +#include "lua/lua_common.h" +#include "libstat/stat_api.h" +#include "contrib/uthash/utlist.h" + +#include "khash.h" + +#ifdef WITH_HYPERSCAN +#include "hs.h" +#include "hyperscan_tools.h" +#endif + +#include "unix-std.h" +#include <signal.h> +#include <stdalign.h> +#include <math.h> +#include "contrib/libev/ev.h" + +#ifndef WITH_PCRE2 +#include <pcre.h> +#else +#include <pcre2.h> +#endif + +#include "contrib/fastutf8/fastutf8.h" + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#endif + +#define msg_err_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_re_cache(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_re_cache(...) 
rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#define msg_debug_re_task(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_re_cache_log_id, "re_cache", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_re_cache(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_re_cache_log_id, "re_cache", cache->hash, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(re_cache) + +#ifdef WITH_HYPERSCAN +#define RSPAMD_HS_MAGIC_LEN (sizeof(rspamd_hs_magic)) +static const guchar rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'}, + rspamd_hs_magic_vector[] = {'r', 's', 'h', 's', 'r', 'v', '1', '1'}; +#endif + + +struct rspamd_re_class { + guint64 id; + enum rspamd_re_type type; + gboolean has_utf8; /* if there are any utf8 regexps */ + gpointer type_data; + gsize type_len; + GHashTable *re; + rspamd_cryptobox_hash_state_t *st; + + gchar hash[rspamd_cryptobox_HASHBYTES + 1]; + +#ifdef WITH_HYPERSCAN + rspamd_hyperscan_t *hs_db; + hs_scratch_t *hs_scratch; + gint *hs_ids; + guint nhs; +#endif +}; + +enum rspamd_re_cache_elt_match_type { + RSPAMD_RE_CACHE_PCRE = 0, + RSPAMD_RE_CACHE_HYPERSCAN, + RSPAMD_RE_CACHE_HYPERSCAN_PRE +}; + +struct rspamd_re_cache_elt { + rspamd_regexp_t *re; + gint lua_cbref; + enum rspamd_re_cache_elt_match_type match_type; +}; + +KHASH_INIT(lua_selectors_hash, gchar *, int, 1, kh_str_hash_func, kh_str_hash_equal); + +struct rspamd_re_cache { + GHashTable *re_classes; + + GPtrArray *re; + khash_t(lua_selectors_hash) * selectors; + ref_entry_t ref; + guint nre; + guint max_re_data; + gchar hash[rspamd_cryptobox_HASHBYTES + 1]; + lua_State *L; +#ifdef WITH_HYPERSCAN + enum rspamd_hyperscan_status hyperscan_loaded; + gboolean disable_hyperscan; + hs_platform_info_t plt; +#endif +}; + +struct rspamd_re_selector_result { + guchar **scvec; + guint *lenvec; + guint cnt; +}; + +KHASH_INIT(selectors_results_hash, int, struct 
rspamd_re_selector_result, 1, + kh_int_hash_func, kh_int_hash_equal); + +struct rspamd_re_runtime { + guchar *checked; + guchar *results; + khash_t(selectors_results_hash) * sel_cache; + struct rspamd_re_cache *cache; + struct rspamd_re_cache_stat stat; + gboolean has_hs; +}; + +static GQuark +rspamd_re_cache_quark(void) +{ + return g_quark_from_static_string("re_cache"); +} + +static guint64 +rspamd_re_cache_class_id(enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen) +{ + rspamd_cryptobox_fast_hash_state_t st; + + rspamd_cryptobox_fast_hash_init(&st, 0xdeadbabe); + rspamd_cryptobox_fast_hash_update(&st, &type, sizeof(type)); + + if (datalen > 0) { + rspamd_cryptobox_fast_hash_update(&st, type_data, datalen); + } + + return rspamd_cryptobox_fast_hash_final(&st); +} + +static void +rspamd_re_cache_destroy(struct rspamd_re_cache *cache) +{ + GHashTableIter it; + gpointer k, v; + struct rspamd_re_class *re_class; + gchar *skey; + gint sref; + + g_assert(cache != NULL); + g_hash_table_iter_init(&it, cache->re_classes); + + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + g_hash_table_iter_steal(&it); + g_hash_table_unref(re_class->re); + + if (re_class->type_data) { + g_free(re_class->type_data); + } + +#ifdef WITH_HYPERSCAN + if (re_class->hs_db) { + rspamd_hyperscan_free(re_class->hs_db, false); + } + if (re_class->hs_scratch) { + hs_free_scratch(re_class->hs_scratch); + } + if (re_class->hs_ids) { + g_free(re_class->hs_ids); + } +#endif + g_free(re_class); + } + + if (cache->L) { + kh_foreach(cache->selectors, skey, sref, { + luaL_unref(cache->L, LUA_REGISTRYINDEX, sref); + g_free(skey); + }); + + struct rspamd_re_cache_elt *elt; + guint i; + + PTR_ARRAY_FOREACH(cache->re, i, elt) + { + if (elt->lua_cbref != -1) { + luaL_unref(cache->L, LUA_REGISTRYINDEX, elt->lua_cbref); + } + } + } + + kh_destroy(lua_selectors_hash, cache->selectors); + + g_hash_table_unref(cache->re_classes); + g_ptr_array_free(cache->re, TRUE); + g_free(cache); 
+} + +static void +rspamd_re_cache_elt_dtor(gpointer e) +{ + struct rspamd_re_cache_elt *elt = e; + + rspamd_regexp_unref(elt->re); + g_free(elt); +} + +struct rspamd_re_cache * +rspamd_re_cache_new(void) +{ + struct rspamd_re_cache *cache; + + cache = g_malloc0(sizeof(*cache)); + cache->re_classes = g_hash_table_new(g_int64_hash, g_int64_equal); + cache->nre = 0; + cache->re = g_ptr_array_new_full(256, rspamd_re_cache_elt_dtor); + cache->selectors = kh_init(lua_selectors_hash); +#ifdef WITH_HYPERSCAN + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_UNKNOWN; +#endif + REF_INIT_RETAIN(cache, rspamd_re_cache_destroy); + + return cache; +} + +enum rspamd_hyperscan_status +rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache *cache) +{ + g_assert(cache != NULL); + +#ifdef WITH_HYPERSCAN + return cache->hyperscan_loaded; +#else + return RSPAMD_HYPERSCAN_UNSUPPORTED; +#endif +} + +rspamd_regexp_t * +rspamd_re_cache_add(struct rspamd_re_cache *cache, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, gsize datalen, + gint lua_cbref) +{ + guint64 class_id; + struct rspamd_re_class *re_class; + rspamd_regexp_t *nre; + struct rspamd_re_cache_elt *elt; + + g_assert(cache != NULL); + g_assert(re != NULL); + + class_id = rspamd_re_cache_class_id(type, type_data, datalen); + re_class = g_hash_table_lookup(cache->re_classes, &class_id); + + if (re_class == NULL) { + re_class = g_malloc0(sizeof(*re_class)); + re_class->id = class_id; + re_class->type_len = datalen; + re_class->type = type; + re_class->re = g_hash_table_new_full(rspamd_regexp_hash, + rspamd_regexp_equal, NULL, (GDestroyNotify) rspamd_regexp_unref); + + if (datalen > 0) { + re_class->type_data = g_malloc0(datalen); + memcpy(re_class->type_data, type_data, datalen); + } + + g_hash_table_insert(cache->re_classes, &re_class->id, re_class); + } + + if ((nre = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(re))) == NULL) { + /* + * We set re id based on the global position in the cache + */ 
+ elt = g_malloc0(sizeof(*elt)); + /* One ref for re_class */ + nre = rspamd_regexp_ref(re); + rspamd_regexp_set_cache_id(re, cache->nre++); + /* One ref for cache */ + elt->re = rspamd_regexp_ref(re); + g_ptr_array_add(cache->re, elt); + rspamd_regexp_set_class(re, re_class); + elt->lua_cbref = lua_cbref; + + g_hash_table_insert(re_class->re, rspamd_regexp_get_id(nre), nre); + } + + if (rspamd_regexp_get_flags(re) & RSPAMD_REGEXP_FLAG_UTF) { + re_class->has_utf8 = TRUE; + } + + return nre; +} + +void rspamd_re_cache_replace(struct rspamd_re_cache *cache, + rspamd_regexp_t *what, + rspamd_regexp_t *with) +{ + guint64 re_id; + struct rspamd_re_class *re_class; + rspamd_regexp_t *src; + struct rspamd_re_cache_elt *elt; + + g_assert(cache != NULL); + g_assert(what != NULL); + g_assert(with != NULL); + + re_class = rspamd_regexp_get_class(what); + + if (re_class != NULL) { + re_id = rspamd_regexp_get_cache_id(what); + + g_assert(re_id != RSPAMD_INVALID_ID); + src = g_hash_table_lookup(re_class->re, rspamd_regexp_get_id(what)); + elt = g_ptr_array_index(cache->re, re_id); + g_assert(elt != NULL); + g_assert(src != NULL); + + rspamd_regexp_set_cache_id(what, RSPAMD_INVALID_ID); + rspamd_regexp_set_class(what, NULL); + rspamd_regexp_set_cache_id(with, re_id); + rspamd_regexp_set_class(with, re_class); + /* + * On calling of this function, we actually unref old re (what) + */ + g_hash_table_insert(re_class->re, + rspamd_regexp_get_id(what), + rspamd_regexp_ref(with)); + + rspamd_regexp_unref(elt->re); + elt->re = rspamd_regexp_ref(with); + /* XXX: do not touch match type here */ + } +} + +static gint +rspamd_re_cache_sort_func(gconstpointer a, gconstpointer b) +{ + struct rspamd_re_cache_elt *const *re1 = a, *const *re2 = b; + + return rspamd_regexp_cmp(rspamd_regexp_get_id((*re1)->re), + rspamd_regexp_get_id((*re2)->re)); +} + +void rspamd_re_cache_init(struct rspamd_re_cache *cache, struct rspamd_config *cfg) +{ + guint i, fl; + GHashTableIter it; + gpointer k, v; + 
struct rspamd_re_class *re_class; + rspamd_cryptobox_hash_state_t st_global; + rspamd_regexp_t *re; + struct rspamd_re_cache_elt *elt; + guchar hash_out[rspamd_cryptobox_HASHBYTES]; + + g_assert(cache != NULL); + + rspamd_cryptobox_hash_init(&st_global, NULL, 0); + /* Resort all regexps */ + g_ptr_array_sort(cache->re, rspamd_re_cache_sort_func); + + for (i = 0; i < cache->re->len; i++) { + elt = g_ptr_array_index(cache->re, i); + re = elt->re; + re_class = rspamd_regexp_get_class(re); + g_assert(re_class != NULL); + rspamd_regexp_set_cache_id(re, i); + + if (re_class->st == NULL) { + (void) !posix_memalign((void **) &re_class->st, RSPAMD_ALIGNOF(rspamd_cryptobox_hash_state_t), + sizeof(*re_class->st)); + g_assert(re_class->st != NULL); + rspamd_cryptobox_hash_init(re_class->st, NULL, 0); + } + + /* Update hashes */ + /* Id of re class */ + rspamd_cryptobox_hash_update(re_class->st, (gpointer) &re_class->id, + sizeof(re_class->id)); + rspamd_cryptobox_hash_update(&st_global, (gpointer) &re_class->id, + sizeof(re_class->id)); + /* Id of re expression */ + rspamd_cryptobox_hash_update(re_class->st, rspamd_regexp_get_id(re), + rspamd_cryptobox_HASHBYTES); + rspamd_cryptobox_hash_update(&st_global, rspamd_regexp_get_id(re), + rspamd_cryptobox_HASHBYTES); + /* PCRE flags */ + fl = rspamd_regexp_get_pcre_flags(re); + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl, + sizeof(fl)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl, + sizeof(fl)); + /* Rspamd flags */ + fl = rspamd_regexp_get_flags(re); + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl, + sizeof(fl)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl, + sizeof(fl)); + /* Limit of hits */ + fl = rspamd_regexp_get_maxhits(re); + rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &fl, + sizeof(fl)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &fl, + sizeof(fl)); + /* Numeric order */ + 
rspamd_cryptobox_hash_update(re_class->st, (const guchar *) &i, + sizeof(i)); + rspamd_cryptobox_hash_update(&st_global, (const guchar *) &i, + sizeof(i)); + } + + rspamd_cryptobox_hash_final(&st_global, hash_out); + rspamd_snprintf(cache->hash, sizeof(cache->hash), "%*xs", + (gint) rspamd_cryptobox_HASHBYTES, hash_out); + + /* Now finalize all classes */ + g_hash_table_iter_init(&it, cache->re_classes); + + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + + if (re_class->st) { + /* + * We finally update all classes with the number of expressions + * in the cache to ensure that if even a single re has been changed + * we won't be broken due to id mismatch + */ + rspamd_cryptobox_hash_update(re_class->st, + (gpointer) &cache->re->len, + sizeof(cache->re->len)); + rspamd_cryptobox_hash_final(re_class->st, hash_out); + rspamd_snprintf(re_class->hash, sizeof(re_class->hash), "%*xs", + (gint) rspamd_cryptobox_HASHBYTES, hash_out); + free(re_class->st); /* Due to posix_memalign */ + re_class->st = NULL; + } + } + + cache->L = cfg->lua_state; + +#ifdef WITH_HYPERSCAN + const gchar *platform = "generic"; + rspamd_fstring_t *features = rspamd_fstring_new(); + + cache->disable_hyperscan = cfg->disable_hyperscan; + + g_assert(hs_populate_platform(&cache->plt) == HS_SUCCESS); + + /* Now decode what we do have */ + switch (cache->plt.tune) { + case HS_TUNE_FAMILY_HSW: + platform = "haswell"; + break; + case HS_TUNE_FAMILY_SNB: + platform = "sandy"; + break; + case HS_TUNE_FAMILY_BDW: + platform = "broadwell"; + break; + case HS_TUNE_FAMILY_IVB: + platform = "ivy"; + break; + default: + break; + } + + if (cache->plt.cpu_features & HS_CPU_FEATURES_AVX2) { + features = rspamd_fstring_append(features, "AVX2", 4); + } + + hs_set_allocator(g_malloc, g_free); + + msg_info_re_cache("loaded hyperscan engine with cpu tune '%s' and features '%V'", + platform, features); + + rspamd_fstring_free(features); +#endif +} + +struct rspamd_re_runtime * 
+rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache) +{ + struct rspamd_re_runtime *rt; + g_assert(cache != NULL); + + rt = g_malloc0(sizeof(*rt) + NBYTES(cache->nre) + cache->nre); + rt->cache = cache; + REF_RETAIN(cache); + rt->checked = ((guchar *) rt) + sizeof(*rt); + rt->results = rt->checked + NBYTES(cache->nre); + rt->stat.regexp_total = cache->nre; +#ifdef WITH_HYPERSCAN + rt->has_hs = cache->hyperscan_loaded; +#endif + + return rt; +} + +const struct rspamd_re_cache_stat * +rspamd_re_cache_get_stat(struct rspamd_re_runtime *rt) +{ + g_assert(rt != NULL); + + return &rt->stat; +} + +static gboolean +rspamd_re_cache_check_lua_condition(struct rspamd_task *task, + rspamd_regexp_t *re, + const guchar *in, gsize len, + goffset start, goffset end, + gint lua_cbref) +{ + lua_State *L = (lua_State *) task->cfg->lua_state; + GError *err = NULL; + struct rspamd_lua_text __attribute__((unused)) * t; + gint text_pos; + + if (G_LIKELY(lua_cbref == -1)) { + return TRUE; + } + + t = lua_new_text(L, in, len, FALSE); + text_pos = lua_gettop(L); + + if (!rspamd_lua_universal_pcall(L, lua_cbref, + G_STRLOC, 1, "utii", &err, + "rspamd{task}", task, + text_pos, start, end)) { + msg_warn_task("cannot call for re_cache_check_lua_condition for re %s: %e", + rspamd_regexp_get_pattern(re), err); + g_error_free(err); + lua_settop(L, text_pos - 1); + + return TRUE; + } + + gboolean res = lua_toboolean(L, -1); + + lua_settop(L, text_pos - 1); + + return res; +} + +static guint +rspamd_re_cache_process_pcre(struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, struct rspamd_task *task, + const guchar *in, gsize len, + gboolean is_raw, + gint lua_cbref) +{ + guint r = 0; + const gchar *start = NULL, *end = NULL; + guint max_hits = rspamd_regexp_get_maxhits(re); + guint64 id = rspamd_regexp_get_cache_id(re); + gdouble t1 = NAN, t2, pr; + const gdouble slow_time = 1e8; + + if (in == NULL) { + return rt->results[id]; + } + + if (len == 0) { + return rt->results[id]; + } + + if 
(rt->cache->max_re_data > 0 && len > rt->cache->max_re_data) { + len = rt->cache->max_re_data; + } + + r = rt->results[id]; + + if (max_hits == 0 || r < max_hits) { + pr = rspamd_random_double_fast(); + + if (pr > 0.9) { + t1 = rspamd_get_ticks(TRUE); + } + + while (rspamd_regexp_search(re, + in, + len, + &start, + &end, + is_raw, + NULL)) { + if (rspamd_re_cache_check_lua_condition(task, re, in, len, + start - (const gchar *) in, end - (const gchar *) in, lua_cbref)) { + r++; + msg_debug_re_task("found regexp /%s/, total hits: %d", + rspamd_regexp_get_pattern(re), r); + } + + if (max_hits > 0 && r >= max_hits) { + break; + } + } + + rt->results[id] += r; + rt->stat.regexp_checked++; + rt->stat.bytes_scanned_pcre += len; + rt->stat.bytes_scanned += len; + + if (r > 0) { + rt->stat.regexp_matched += r; + } + + if (!isnan(t1)) { + t2 = rspamd_get_ticks(TRUE); + + if (t2 - t1 > slow_time) { + rspamd_symcache_enable_profile(task); + msg_info_task("regexp '%16s' took %.0f ticks to execute", + rspamd_regexp_get_pattern(re), t2 - t1); + } + } + } + + return r; +} + +#ifdef WITH_HYPERSCAN +struct rspamd_re_hyperscan_cbdata { + struct rspamd_re_runtime *rt; + const guchar **ins; + const guint *lens; + guint count; + rspamd_regexp_t *re; + struct rspamd_task *task; +}; + +static gint +rspamd_re_cache_hyperscan_cb(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *ud) +{ + struct rspamd_re_hyperscan_cbdata *cbdata = ud; + struct rspamd_re_runtime *rt; + struct rspamd_re_cache_elt *cache_elt; + guint ret, maxhits, i, processed; + struct rspamd_task *task; + + rt = cbdata->rt; + task = cbdata->task; + cache_elt = g_ptr_array_index(rt->cache->re, id); + maxhits = rspamd_regexp_get_maxhits(cache_elt->re); + + if (cache_elt->match_type == RSPAMD_RE_CACHE_HYPERSCAN) { + if (rspamd_re_cache_check_lua_condition(task, cache_elt->re, + cbdata->ins[0], cbdata->lens[0], from, to, cache_elt->lua_cbref)) { + ret = 1; + setbit(rt->checked, 
id); + + if (maxhits == 0 || rt->results[id] < maxhits) { + rt->results[id] += ret; + rt->stat.regexp_matched++; + } + msg_debug_re_task("found regexp /%s/ using hyperscan only, total hits: %d", + rspamd_regexp_get_pattern(cache_elt->re), rt->results[id]); + } + } + else { + if (!isset(rt->checked, id)) { + + processed = 0; + + for (i = 0; i < cbdata->count; i++) { + rspamd_re_cache_process_pcre(rt, + cache_elt->re, + cbdata->task, + cbdata->ins[i], + cbdata->lens[i], + FALSE, + cache_elt->lua_cbref); + setbit(rt->checked, id); + + processed += cbdata->lens[i]; + + if (processed >= to) { + break; + } + } + } + } + + return 0; +} +#endif + +static guint +rspamd_re_cache_process_regexp_data(struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, struct rspamd_task *task, + const guchar **in, guint *lens, + guint count, + gboolean is_raw, + gboolean *processed_hyperscan) +{ + + guint64 re_id; + guint ret = 0; + guint i; + struct rspamd_re_cache_elt *cache_elt; + + re_id = rspamd_regexp_get_cache_id(re); + + if (count == 0 || in == NULL) { + /* We assume this as absence of the specified data */ + setbit(rt->checked, re_id); + rt->results[re_id] = ret; + return ret; + } + + cache_elt = (struct rspamd_re_cache_elt *) g_ptr_array_index(rt->cache->re, re_id); + +#ifndef WITH_HYPERSCAN + for (i = 0; i < count; i++) { + ret = rspamd_re_cache_process_pcre(rt, + re, + task, + in[i], + lens[i], + is_raw, + cache_elt->lua_cbref); + rt->results[re_id] = ret; + } + + setbit(rt->checked, re_id); +#else + struct rspamd_re_class *re_class; + struct rspamd_re_hyperscan_cbdata cbdata; + + cache_elt = g_ptr_array_index(rt->cache->re, re_id); + re_class = rspamd_regexp_get_class(re); + + if (rt->cache->disable_hyperscan || cache_elt->match_type == RSPAMD_RE_CACHE_PCRE || + !rt->has_hs || (is_raw && re_class->has_utf8)) { + for (i = 0; i < count; i++) { + ret = rspamd_re_cache_process_pcre(rt, + re, + task, + in[i], + lens[i], + is_raw, + cache_elt->lua_cbref); + } + + setbit(rt->checked, 
re_id); + } + else { + for (i = 0; i < count; i++) { + /* For Hyperscan we can probably safely disable all those limits */ +#if 0 + if (rt->cache->max_re_data > 0 && lens[i] > rt->cache->max_re_data) { + lens[i] = rt->cache->max_re_data; + } +#endif + rt->stat.bytes_scanned += lens[i]; + } + + g_assert(re_class->hs_scratch != NULL); + g_assert(re_class->hs_db != NULL); + + /* Go through hyperscan API */ + for (i = 0; i < count; i++) { + cbdata.ins = &in[i]; + cbdata.re = re; + cbdata.rt = rt; + cbdata.lens = &lens[i]; + cbdata.count = 1; + cbdata.task = task; + + if ((hs_scan(rspamd_hyperscan_get_database(re_class->hs_db), + in[i], lens[i], 0, + re_class->hs_scratch, + rspamd_re_cache_hyperscan_cb, &cbdata)) != HS_SUCCESS) { + ret = 0; + } + else { + ret = rt->results[re_id]; + *processed_hyperscan = TRUE; + } + } + } +#endif + + return ret; +} + +static void +rspamd_re_cache_finish_class(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + struct rspamd_re_class *re_class, + const gchar *class_name) +{ +#ifdef WITH_HYPERSCAN + guint i; + guint64 re_id; + guint found = 0; + + /* Set all bits that are not checked and included in hyperscan to 1 */ + for (i = 0; i < re_class->nhs; i++) { + re_id = re_class->hs_ids[i]; + + if (!isset(rt->checked, re_id)) { + g_assert(rt->results[re_id] == 0); + rt->results[re_id] = 0; + setbit(rt->checked, re_id); + } + else { + found++; + } + } + + msg_debug_re_task("finished hyperscan for class %s; %d " + "matches found; %d hyperscan supported regexps; %d total regexps", + class_name, found, re_class->nhs, (gint) g_hash_table_size(re_class->re)); +#endif +} + +static gboolean +rspamd_re_cache_process_selector(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + const gchar *name, + guchar ***svec, + guint **lenvec, + guint *n) +{ + gint ref; + khiter_t k; + lua_State *L; + gint err_idx, ret; + struct rspamd_task **ptask; + gboolean result = FALSE; + struct rspamd_re_cache *cache = rt->cache; + struct 
rspamd_re_selector_result *sr; + + L = cache->L; + k = kh_get(lua_selectors_hash, cache->selectors, (gchar *) name); + + if (k == kh_end(cache->selectors)) { + msg_err_task("cannot find selector %s, not registered", name); + + return FALSE; + } + + ref = kh_value(cache->selectors, k); + + /* First, search for the cached result */ + if (rt->sel_cache) { + k = kh_get(selectors_results_hash, rt->sel_cache, ref); + + if (k != kh_end(rt->sel_cache)) { + sr = &kh_value(rt->sel_cache, k); + + *svec = sr->scvec; + *lenvec = sr->lenvec; + *n = sr->cnt; + + return TRUE; + } + } + else { + rt->sel_cache = kh_init(selectors_results_hash); + } + + lua_pushcfunction(L, &rspamd_lua_traceback); + err_idx = lua_gettop(L); + + lua_rawgeti(L, LUA_REGISTRYINDEX, ref); + ptask = lua_newuserdata(L, sizeof(*ptask)); + *ptask = task; + rspamd_lua_setclass(L, "rspamd{task}", -1); + + if ((ret = lua_pcall(L, 1, 1, err_idx)) != 0) { + msg_err_task("call to selector %s " + "failed (%d): %s", + name, ret, + lua_tostring(L, -1)); + } + else { + struct rspamd_lua_text *txt; + gsize slen; + const gchar *sel_data; + + if (lua_type(L, -1) != LUA_TTABLE) { + txt = lua_check_text_or_string(L, -1); + + + if (txt) { + msg_debug_re_cache("re selector %s returned 1 element", name); + sel_data = txt->start; + slen = txt->len; + *n = 1; + *svec = g_malloc(sizeof(guchar *)); + *lenvec = g_malloc(sizeof(guint)); + (*svec)[0] = g_malloc(slen); + memcpy((*svec)[0], sel_data, slen); + (*lenvec)[0] = slen; + result = TRUE; + } + else { + msg_debug_re_cache("re selector %s returned NULL", name); + } + } + else { + *n = rspamd_lua_table_size(L, -1); + + msg_debug_re_cache("re selector %s returned %d elements", name, *n); + + if (*n > 0) { + *svec = g_malloc(sizeof(guchar *) * (*n)); + *lenvec = g_malloc(sizeof(guint) * (*n)); + + for (int i = 0; i < *n; i++) { + lua_rawgeti(L, -1, i + 1); + + txt = lua_check_text_or_string(L, -1); + if (txt && txt->len > 0) { + sel_data = txt->start; + slen = txt->len; + 
(*svec)[i] = g_malloc(slen); + memcpy((*svec)[i], sel_data, slen); + } + else { + /* A hack to avoid malloc(0) */ + sel_data = ""; + slen = 0; + (*svec)[i] = g_malloc(1); + memcpy((*svec)[i], sel_data, 1); + } + + (*lenvec)[i] = slen; + lua_pop(L, 1); + } + } + + /* Empty table is also a valid result */ + result = TRUE; + } + } + + lua_settop(L, err_idx - 1); + + if (result) { + k = kh_put(selectors_results_hash, rt->sel_cache, ref, &ret); + sr = &kh_value(rt->sel_cache, k); + + sr->cnt = *n; + sr->scvec = *svec; + sr->lenvec = *lenvec; + } + + return result; +} + +static inline guint +rspamd_process_words_vector(GArray *words, + const guchar **scvec, + guint *lenvec, + struct rspamd_re_class *re_class, + guint cnt, + gboolean *raw) +{ + guint j; + rspamd_stat_token_t *tok; + + if (words) { + for (j = 0; j < words->len; j++) { + tok = &g_array_index(words, rspamd_stat_token_t, j); + + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { + if (!re_class->has_utf8) { + *raw = TRUE; + } + else { + continue; /* Skip */ + } + } + } + else { + continue; /* Skip non text */ + } + + if (re_class->type == RSPAMD_RE_RAWWORDS) { + if (tok->original.len > 0) { + scvec[cnt] = tok->original.begin; + lenvec[cnt++] = tok->original.len; + } + } + else if (re_class->type == RSPAMD_RE_WORDS) { + if (tok->normalized.len > 0) { + scvec[cnt] = tok->normalized.begin; + lenvec[cnt++] = tok->normalized.len; + } + } + else { + /* Stemmed words */ + if (tok->stemmed.len > 0) { + scvec[cnt] = tok->stemmed.begin; + lenvec[cnt++] = tok->stemmed.len; + } + } + } + } + + return cnt; +} + +static guint +rspamd_re_cache_process_headers_list(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, + struct rspamd_re_class *re_class, + struct rspamd_mime_header *rh, + gboolean is_strong, + gboolean *processed_hyperscan) +{ + const guchar **scvec, *in; + gboolean raw = FALSE; + guint *lenvec; + struct rspamd_mime_header *cur; + 
guint cnt = 0, i = 0, ret = 0; + + DL_COUNT(rh, cur, cnt); + + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + DL_FOREACH(rh, cur) + { + + if (is_strong && strcmp(cur->name, re_class->type_data) != 0) { + /* Skip a different case */ + continue; + } + + if (re_class->type == RSPAMD_RE_RAWHEADER) { + in = (const guchar *) cur->value; + lenvec[i] = strlen(cur->value); + + if (rspamd_fast_utf8_validate(in, lenvec[i]) != 0) { + raw = TRUE; + } + } + else { + in = (const guchar *) cur->decoded; + /* Validate input^W^WNo need to validate as it is already valid */ + if (!in) { + lenvec[i] = 0; + scvec[i] = (guchar *) ""; + continue; + } + + lenvec[i] = strlen(in); + } + + scvec[i] = in; + + i++; + } + + if (i > 0) { + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, i, raw, processed_hyperscan); + msg_debug_re_task("checking header %s regexp: %s=%*s -> %d", + re_class->type_data, + rspamd_regexp_get_pattern(re), + (int) lenvec[0], scvec[0], ret); + } + + g_free(scvec); + g_free(lenvec); + + return ret; +} + +/* + * Calculates the specified regexp for the specified class if it's not calculated + */ +static guint +rspamd_re_cache_exec_re(struct rspamd_task *task, + struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, + struct rspamd_re_class *re_class, + gboolean is_strong) +{ + guint ret = 0, i, re_id; + struct rspamd_mime_header *rh; + const gchar *in; + const guchar **scvec = NULL; + guint *lenvec = NULL; + gboolean raw = FALSE, processed_hyperscan = FALSE; + struct rspamd_mime_text_part *text_part; + struct rspamd_mime_part *mime_part; + struct rspamd_url *url; + guint len = 0, cnt = 0; + const gchar *class_name; + + class_name = rspamd_re_cache_type_to_string(re_class->type); + msg_debug_re_task("start check re type: %s: /%s/", + class_name, + rspamd_regexp_get_pattern(re)); + re_id = rspamd_regexp_get_cache_id(re); + + switch (re_class->type) { + case RSPAMD_RE_HEADER: + case RSPAMD_RE_RAWHEADER: + /* Get 
list of specified headers */ + rh = rspamd_message_get_header_array(task, + re_class->type_data, FALSE); + + if (rh) { + ret = rspamd_re_cache_process_headers_list(task, rt, re, + re_class, rh, is_strong, &processed_hyperscan); + msg_debug_re_task("checked header(%s) regexp: %s -> %d", + (const char *) re_class->type_data, + rspamd_regexp_get_pattern(re), + ret); + } + break; + case RSPAMD_RE_ALLHEADER: + raw = TRUE; + in = MESSAGE_FIELD(task, raw_headers_content).begin; + len = MESSAGE_FIELD(task, raw_headers_content).len; + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, (const guchar **) &in, &len, 1, raw, &processed_hyperscan); + msg_debug_re_task("checked allheader regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + break; + case RSPAMD_RE_MIMEHEADER: + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, mime_part) + { + if (mime_part->parent_part == NULL || + !IS_PART_MULTIPART(mime_part->parent_part) || + IS_PART_MESSAGE(mime_part)) { + /* We filter parts that have no multipart parent or are a messages here */ + continue; + } + rh = rspamd_message_get_header_from_hash(mime_part->raw_headers, + re_class->type_data, FALSE); + + if (rh) { + ret += rspamd_re_cache_process_headers_list(task, rt, re, + re_class, rh, is_strong, &processed_hyperscan); + } + msg_debug_re_task("checked mime header(%s) regexp: %s -> %d", + (const char *) re_class->type_data, + rspamd_regexp_get_pattern(re), + ret); + } + break; + case RSPAMD_RE_MIME: + case RSPAMD_RE_RAWMIME: + /* Iterate through text parts */ + if (MESSAGE_FIELD(task, text_parts)->len > 0) { + cnt = MESSAGE_FIELD(task, text_parts)->len; + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + /* Select data for regexp */ + if (re_class->type == RSPAMD_RE_RAWMIME) { + if (text_part->raw.len == 0) { + len = 0; + in = ""; + } + else { + in = text_part->raw.begin; + len = text_part->raw.len; + } + + raw = 
TRUE; + } + else { + /* Skip empty parts */ + if (IS_TEXT_PART_EMPTY(text_part)) { + len = 0; + in = ""; + } + else { + /* Check raw flags */ + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; + } + + in = text_part->utf_content.begin; + len = text_part->utf_content.len; + } + } + + scvec[i] = (guchar *) in; + lenvec[i] = len; + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked mime regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_URL: + cnt = kh_size(MESSAGE_FIELD(task, urls)); + + if (cnt > 0) { + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + i = 0; + raw = FALSE; + + kh_foreach_key(MESSAGE_FIELD(task, urls), url, { + if ((url->protocol & PROTOCOL_MAILTO)) { + continue; + } + in = url->string; + len = url->urllen; + + if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) { + scvec[i] = (guchar *) in; + lenvec[i++] = len; + } + }); + + /* URL regexps do not include emails, that's why the code below is commented */ +#if 0 + g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails)); + + while (g_hash_table_iter_next (&it, &k, &v)) { + url = v; + in = url->string; + len = url->urllen; + + if (len > 0 && !(url->flags & RSPAMD_URL_FLAG_IMAGE)) { + scvec[i] = (guchar *) in; + lenvec[i++] = len; + } + } +#endif + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, i, raw, &processed_hyperscan); + msg_debug_re_task("checked url regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_EMAIL: + cnt = kh_size(MESSAGE_FIELD(task, urls)); + + if (cnt > 0) { + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + i = 0; + raw = FALSE; + + kh_foreach_key(MESSAGE_FIELD(task, urls), url, { + if (!(url->protocol & PROTOCOL_MAILTO)) { + continue; + } + if 
(url->userlen == 0 || url->hostlen == 0) { + continue; + } + + in = rspamd_url_user_unsafe(url); + len = url->userlen + 1 + url->hostlen; + scvec[i] = (guchar *) in; + lenvec[i++] = len; + }); + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, i, raw, &processed_hyperscan); + msg_debug_re_task("checked email regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_BODY: + raw = TRUE; + in = task->msg.begin; + len = task->msg.len; + + ret = rspamd_re_cache_process_regexp_data(rt, re, task, + (const guchar **) &in, &len, 1, raw, &processed_hyperscan); + msg_debug_re_task("checked rawbody regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + break; + case RSPAMD_RE_SABODY: + /* According to SA docs: + * The 'body' in this case is the textual parts of the message body; + * any non-text MIME parts are stripped, and the message decoded from + * Quoted-Printable or Base-64-encoded format if necessary. The message + * Subject header is considered part of the body and becomes the first + * paragraph when running the rules. All HTML tags and line breaks will + * be removed before matching. + */ + cnt = MESSAGE_FIELD(task, text_parts)->len + 1; + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + /* + * Body rules also include the Subject as the first line + * of the body content. 
+ */ + + rh = rspamd_message_get_header_array(task, "Subject", FALSE); + + if (rh) { + scvec[0] = (guchar *) rh->decoded; + lenvec[0] = strlen(rh->decoded); + } + else { + scvec[0] = (guchar *) ""; + lenvec[0] = 0; + } + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (text_part->utf_stripped_content) { + scvec[i + 1] = (guchar *) text_part->utf_stripped_content->data; + lenvec[i + 1] = text_part->utf_stripped_content->len; + + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; + } + } + else { + scvec[i + 1] = (guchar *) ""; + lenvec[i + 1] = 0; + } + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked sa body regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + break; + case RSPAMD_RE_SARAWBODY: + /* According to SA docs: + * The 'raw body' of a message is the raw data inside all textual + * parts. The text will be decoded from base64 or quoted-printable + * encoding, but HTML tags and line breaks will still be present. + * Multiline expressions will need to be used to match strings that are + * broken by line breaks. 
+ */ + if (MESSAGE_FIELD(task, text_parts)->len > 0) { + cnt = MESSAGE_FIELD(task, text_parts)->len; + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + for (i = 0; i < cnt; i++) { + text_part = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), i); + + if (text_part->parsed.len > 0) { + scvec[i] = (guchar *) text_part->parsed.begin; + lenvec[i] = text_part->parsed.len; + + if (!IS_TEXT_PART_UTF(text_part)) { + raw = TRUE; + } + } + else { + scvec[i] = (guchar *) ""; + lenvec[i] = 0; + } + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked sa rawbody regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + break; + case RSPAMD_RE_WORDS: + case RSPAMD_RE_STEMWORDS: + case RSPAMD_RE_RAWWORDS: + if (MESSAGE_FIELD(task, text_parts)->len > 0) { + cnt = 0; + raw = FALSE; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (text_part->utf_words) { + cnt += text_part->utf_words->len; + } + } + + if (task->meta_words && task->meta_words->len > 0) { + cnt += task->meta_words->len; + } + + if (cnt > 0) { + scvec = g_malloc(sizeof(*scvec) * cnt); + lenvec = g_malloc(sizeof(*lenvec) * cnt); + + cnt = 0; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part) + { + if (text_part->utf_words) { + cnt = rspamd_process_words_vector(text_part->utf_words, + scvec, lenvec, re_class, cnt, &raw); + } + } + + if (task->meta_words) { + cnt = rspamd_process_words_vector(task->meta_words, + scvec, lenvec, re_class, cnt, &raw); + } + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + + msg_debug_re_task("checked sa words regexp: %s -> %d", + rspamd_regexp_get_pattern(re), ret); + g_free(scvec); + g_free(lenvec); + } + } + break; + case RSPAMD_RE_SELECTOR: + if (rspamd_re_cache_process_selector(task, rt, + re_class->type_data, + 
(guchar ***) &scvec, + &lenvec, &cnt)) { + + ret = rspamd_re_cache_process_regexp_data(rt, re, + task, scvec, lenvec, cnt, raw, &processed_hyperscan); + msg_debug_re_task("checked selector(%s) regexp: %s -> %d", + re_class->type_data, + rspamd_regexp_get_pattern(re), ret); + + /* Do not free vectors as they are managed by rt->sel_cache */ + } + break; + case RSPAMD_RE_MAX: + msg_err_task("regexp of class invalid has been called: %s", + rspamd_regexp_get_pattern(re)); + break; + } + +#if WITH_HYPERSCAN + if (processed_hyperscan) { + rspamd_re_cache_finish_class(task, rt, re_class, class_name); + } +#endif + + setbit(rt->checked, re_id); + + return rt->results[re_id]; +} + +gint rspamd_re_cache_process(struct rspamd_task *task, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen, + gboolean is_strong) +{ + guint64 re_id; + struct rspamd_re_class *re_class; + struct rspamd_re_cache *cache; + struct rspamd_re_runtime *rt; + + g_assert(task != NULL); + rt = task->re_rt; + g_assert(rt != NULL); + g_assert(re != NULL); + + cache = rt->cache; + re_id = rspamd_regexp_get_cache_id(re); + + if (re_id == RSPAMD_INVALID_ID || re_id > cache->nre) { + msg_err_task("re '%s' has no valid id for the cache", + rspamd_regexp_get_pattern(re)); + return 0; + } + + if (isset(rt->checked, re_id)) { + /* Fast path */ + rt->stat.regexp_fast_cached++; + return rt->results[re_id]; + } + else { + /* Slow path */ + re_class = rspamd_regexp_get_class(re); + + if (re_class == NULL) { + msg_err_task("cannot find re class for regexp '%s'", + rspamd_regexp_get_pattern(re)); + return 0; + } + + return rspamd_re_cache_exec_re(task, rt, re, re_class, + is_strong); + } + + return 0; +} + +int rspamd_re_cache_process_ffi(void *ptask, + void *pre, + int type, + void *type_data, + int is_strong) +{ + struct rspamd_lua_regexp **lua_re = pre; + struct rspamd_task **real_task = ptask; + gsize typelen = 0; + + if (type_data) { + typelen = strlen(type_data); + } + + 
return rspamd_re_cache_process(*real_task, (*lua_re)->re, + type, type_data, typelen, is_strong); +} + +void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt) +{ + g_assert(rt != NULL); + + if (rt->sel_cache) { + struct rspamd_re_selector_result sr; + + kh_foreach_value(rt->sel_cache, sr, { + for (guint i = 0; i < sr.cnt; i++) { + g_free((gpointer) sr.scvec[i]); + } + + g_free(sr.scvec); + g_free(sr.lenvec); + }); + kh_destroy(selectors_results_hash, rt->sel_cache); + } + + REF_RELEASE(rt->cache); + g_free(rt); +} + +void rspamd_re_cache_unref(struct rspamd_re_cache *cache) +{ + if (cache) { + REF_RELEASE(cache); + } +} + +struct rspamd_re_cache * +rspamd_re_cache_ref(struct rspamd_re_cache *cache) +{ + if (cache) { + REF_RETAIN(cache); + } + + return cache; +} + +guint rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, guint limit) +{ + guint old; + + g_assert(cache != NULL); + + old = cache->max_re_data; + cache->max_re_data = limit; + + return old; +} + +const gchar * +rspamd_re_cache_type_to_string(enum rspamd_re_type type) +{ + const gchar *ret = "unknown"; + + switch (type) { + case RSPAMD_RE_HEADER: + ret = "header"; + break; + case RSPAMD_RE_RAWHEADER: + ret = "raw header"; + break; + case RSPAMD_RE_MIMEHEADER: + ret = "mime header"; + break; + case RSPAMD_RE_ALLHEADER: + ret = "all headers"; + break; + case RSPAMD_RE_MIME: + ret = "part"; + break; + case RSPAMD_RE_RAWMIME: + ret = "raw part"; + break; + case RSPAMD_RE_BODY: + ret = "rawbody"; + break; + case RSPAMD_RE_URL: + ret = "url"; + break; + case RSPAMD_RE_EMAIL: + ret = "email"; + break; + case RSPAMD_RE_SABODY: + ret = "sa body"; + break; + case RSPAMD_RE_SARAWBODY: + ret = "sa raw body"; + break; + case RSPAMD_RE_SELECTOR: + ret = "selector"; + break; + case RSPAMD_RE_WORDS: + ret = "words"; + break; + case RSPAMD_RE_RAWWORDS: + ret = "raw_words"; + break; + case RSPAMD_RE_STEMWORDS: + ret = "stem_words"; + break; + case RSPAMD_RE_MAX: + default: + ret = "invalid class"; + 
break; + } + + return ret; +} + +enum rspamd_re_type +rspamd_re_cache_type_from_string(const char *str) +{ + enum rspamd_re_type ret; + guint64 h; + + /* + * To optimize this function, we apply hash to input string and + * pre-select it from the values + */ + + if (str != NULL) { + h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_XXHASH64, + str, strlen(str), 0xdeadbabe); + + switch (h) { + case G_GUINT64_CONSTANT(0x298b9c8a58887d44): /* header */ + ret = RSPAMD_RE_HEADER; + break; + case G_GUINT64_CONSTANT(0x467bfb5cd7ddf890): /* rawheader */ + ret = RSPAMD_RE_RAWHEADER; + break; + case G_GUINT64_CONSTANT(0xda081341fb600389): /* mime */ + ret = RSPAMD_RE_MIME; + break; + case G_GUINT64_CONSTANT(0xc35831e067a8221d): /* rawmime */ + ret = RSPAMD_RE_RAWMIME; + break; + case G_GUINT64_CONSTANT(0xc625e13dbe636de2): /* body */ + case G_GUINT64_CONSTANT(0xCCDEBA43518F721C): /* message */ + ret = RSPAMD_RE_BODY; + break; + case G_GUINT64_CONSTANT(0x286edbe164c791d2): /* url */ + case G_GUINT64_CONSTANT(0x7D9ACDF6685661A1): /* uri */ + ret = RSPAMD_RE_URL; + break; + case G_GUINT64_CONSTANT(0x7e232b0f60b571be): /* email */ + ret = RSPAMD_RE_EMAIL; + break; + case G_GUINT64_CONSTANT(0x796d62205a8778c7): /* allheader */ + ret = RSPAMD_RE_ALLHEADER; + break; + case G_GUINT64_CONSTANT(0xa3c6c153b3b00a5e): /* mimeheader */ + ret = RSPAMD_RE_MIMEHEADER; + break; + case G_GUINT64_CONSTANT(0x7794501506e604e9): /* sabody */ + ret = RSPAMD_RE_SABODY; + break; + case G_GUINT64_CONSTANT(0x28828962E7D2A05F): /* sarawbody */ + ret = RSPAMD_RE_SARAWBODY; + break; + default: + ret = RSPAMD_RE_MAX; + break; + } + } + else { + ret = RSPAMD_RE_MAX; + } + + return ret; +} + +#ifdef WITH_HYPERSCAN +static gchar * +rspamd_re_cache_hs_pattern_from_pcre(rspamd_regexp_t *re) +{ + /* + * Workaround for bug in ragel 7.0.0.11 + * https://github.com/intel/hyperscan/issues/133 + */ + const gchar *pat = rspamd_regexp_get_pattern(re); + guint flags = rspamd_regexp_get_flags(re), esc_flags = 
RSPAMD_REGEXP_ESCAPE_RE; + gchar *escaped; + gsize esc_len; + + if (flags & RSPAMD_REGEXP_FLAG_UTF) { + esc_flags |= RSPAMD_REGEXP_ESCAPE_UTF; + } + + escaped = rspamd_str_regexp_escape(pat, strlen(pat), &esc_len, esc_flags); + + return escaped; +} + +static gboolean +rspamd_re_cache_is_finite(struct rspamd_re_cache *cache, + rspamd_regexp_t *re, gint flags, gdouble max_time) +{ + pid_t cld; + gint status; + struct timespec ts; + hs_compile_error_t *hs_errors; + hs_database_t *test_db; + gdouble wait_time; + const gint max_tries = 10; + gint tries = 0, rc; + void (*old_hdl)(int); + + wait_time = max_time / max_tries; + /* We need to restore SIGCHLD processing */ + old_hdl = signal(SIGCHLD, SIG_DFL); + cld = fork(); + + if (cld == 0) { + /* Try to compile pattern */ + + gchar *pat = rspamd_re_cache_hs_pattern_from_pcre(re); + + if (hs_compile(pat, + flags | HS_FLAG_PREFILTER, + HS_MODE_BLOCK, + &cache->plt, + &test_db, + &hs_errors) != HS_SUCCESS) { + + msg_info_re_cache("cannot compile (prefilter mode) '%s' to hyperscan: '%s'", + pat, + hs_errors != NULL ? 
hs_errors->message : "unknown error"); + + hs_free_compile_error(hs_errors); + g_free(pat); + + exit(EXIT_FAILURE); + } + + g_free(pat); + exit(EXIT_SUCCESS); + } + else if (cld > 0) { + double_to_ts(wait_time, &ts); + + while ((rc = waitpid(cld, &status, WNOHANG)) == 0 && tries++ < max_tries) { + (void) nanosleep(&ts, NULL); + } + + /* Child has been terminated */ + if (rc > 0) { + /* Forget about SIGCHLD after this point */ + signal(SIGCHLD, old_hdl); + + if (WIFEXITED(status) && WEXITSTATUS(status) == EXIT_SUCCESS) { + return TRUE; + } + else { + msg_err_re_cache( + "cannot approximate %s to hyperscan", + rspamd_regexp_get_pattern(re)); + + return FALSE; + } + } + else { + /* We consider that as timeout */ + kill(cld, SIGKILL); + g_assert(waitpid(cld, &status, 0) != -1); + msg_err_re_cache( + "cannot approximate %s to hyperscan: timeout waiting", + rspamd_regexp_get_pattern(re)); + signal(SIGCHLD, old_hdl); + } + } + else { + msg_err_re_cache( + "cannot approximate %s to hyperscan: fork failed: %s", + rspamd_regexp_get_pattern(re), strerror(errno)); + signal(SIGCHLD, old_hdl); + } + + return FALSE; +} +#endif + +#ifdef WITH_HYPERSCAN +struct rspamd_re_cache_hs_compile_cbdata { + GHashTableIter it; + struct rspamd_re_cache *cache; + const char *cache_dir; + gdouble max_time; + gboolean silent; + guint total; + void (*cb)(guint ncompiled, GError *err, void *cbd); + void *cbd; +}; + +static void +rspamd_re_cache_compile_err(EV_P_ ev_timer *w, GError *err, + struct rspamd_re_cache_hs_compile_cbdata *cbdata, bool is_fatal) +{ + cbdata->cb(cbdata->total, err, cbdata->cbd); + + if (is_fatal) { + ev_timer_stop(EV_A_ w); + g_free(w); + g_free(cbdata); + } + else { + /* Continue compilation */ + ev_timer_again(EV_A_ w); + } + g_error_free(err); +} + +static void +rspamd_re_cache_compile_timer_cb(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_re_cache_hs_compile_cbdata *cbdata = + (struct rspamd_re_cache_hs_compile_cbdata *) w->data; + GHashTableIter cit; + gpointer k, 
v; + struct rspamd_re_class *re_class; + gchar path[PATH_MAX], npath[PATH_MAX]; + hs_database_t *test_db; + gint fd, i, n, *hs_ids = NULL, pcre_flags, re_flags; + rspamd_cryptobox_fast_hash_state_t crc_st; + guint64 crc; + rspamd_regexp_t *re; + hs_compile_error_t *hs_errors = NULL; + guint *hs_flags = NULL; + const hs_expr_ext_t **hs_exts = NULL; + gchar **hs_pats = NULL; + gchar *hs_serialized = NULL; + gsize serialized_len; + struct iovec iov[7]; + struct rspamd_re_cache *cache; + GError *err; + pid_t our_pid = getpid(); + + cache = cbdata->cache; + + if (!g_hash_table_iter_next(&cbdata->it, &k, &v)) { + /* All done */ + ev_timer_stop(EV_A_ w); + cbdata->cb(cbdata->total, NULL, cbdata->cbd); + g_free(w); + g_free(cbdata); + + return; + } + + re_class = v; + rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cbdata->cache_dir, + G_DIR_SEPARATOR, re_class->hash); + + if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, TRUE, TRUE, NULL)) { + + fd = open(path, O_RDONLY, 00600); + + /* Read number of regexps */ + g_assert(fd != -1); + g_assert(lseek(fd, RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt), SEEK_SET) != -1); + g_assert(read(fd, &n, sizeof(n)) == sizeof(n)); + close(fd); + + if (re_class->type_len > 0) { + if (!cbdata->silent) { + msg_info_re_cache( + "skip already valid class %s(%*s) to cache %6s, %d regexps", + rspamd_re_cache_type_to_string(re_class->type), + (gint) re_class->type_len - 1, + re_class->type_data, + re_class->hash, + n); + } + } + else { + if (!cbdata->silent) { + msg_info_re_cache( + "skip already valid class %s to cache %6s, %d regexps", + rspamd_re_cache_type_to_string(re_class->type), + re_class->hash, + n); + } + } + + ev_timer_again(EV_A_ w); + return; + } + + rspamd_snprintf(path, sizeof(path), "%s%c%s%P-XXXXXXXXXX", cbdata->cache_dir, + G_DIR_SEPARATOR, re_class->hash, our_pid); + fd = g_mkstemp_full(path, O_CREAT | O_TRUNC | O_EXCL | O_WRONLY, 00600); + + if (fd == -1) { + err = g_error_new(rspamd_re_cache_quark(), errno, + "cannot 
open file %s: %s", path, strerror(errno)); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + g_hash_table_iter_init(&cit, re_class->re); + n = g_hash_table_size(re_class->re); + hs_flags = g_new0(guint, n); + hs_ids = g_new0(guint, n); + hs_pats = g_new0(char *, n); + hs_exts = g_new0(const hs_expr_ext_t *, n); + i = 0; + + while (g_hash_table_iter_next(&cit, &k, &v)) { + re = v; + + pcre_flags = rspamd_regexp_get_pcre_flags(re); + re_flags = rspamd_regexp_get_flags(re); + + if (re_flags & RSPAMD_REGEXP_FLAG_PCRE_ONLY) { + /* Do not try to compile bad regexp */ + msg_info_re_cache( + "do not try compile %s to hyperscan as it is PCRE only", + rspamd_regexp_get_pattern(re)); + continue; + } + + hs_flags[i] = 0; + hs_exts[i] = NULL; +#ifndef WITH_PCRE2 + if (pcre_flags & PCRE_FLAG(UTF8)) { + hs_flags[i] |= HS_FLAG_UTF8; + } +#else + if (pcre_flags & PCRE_FLAG(UTF)) { + hs_flags[i] |= HS_FLAG_UTF8; + } +#endif + if (pcre_flags & PCRE_FLAG(CASELESS)) { + hs_flags[i] |= HS_FLAG_CASELESS; + } + if (pcre_flags & PCRE_FLAG(MULTILINE)) { + hs_flags[i] |= HS_FLAG_MULTILINE; + } + if (pcre_flags & PCRE_FLAG(DOTALL)) { + hs_flags[i] |= HS_FLAG_DOTALL; + } + + + if (re_flags & RSPAMD_REGEXP_FLAG_LEFTMOST) { + hs_flags[i] |= HS_FLAG_SOM_LEFTMOST; + } + else if (rspamd_regexp_get_maxhits(re) == 1) { + hs_flags[i] |= HS_FLAG_SINGLEMATCH; + } + + gchar *pat = rspamd_re_cache_hs_pattern_from_pcre(re); + + if (hs_compile(pat, + hs_flags[i], + HS_MODE_BLOCK, + &cache->plt, + &test_db, + &hs_errors) != HS_SUCCESS) { + msg_info_re_cache("cannot compile '%s' to hyperscan: '%s', try prefilter match", + pat, + hs_errors != NULL ? 
hs_errors->message : "unknown error"); + hs_free_compile_error(hs_errors); + + /* The approximation operation might take a significant + * amount of time, so we need to check if it's finite + */ + if (rspamd_re_cache_is_finite(cache, re, hs_flags[i], cbdata->max_time)) { + hs_flags[i] |= HS_FLAG_PREFILTER; + hs_ids[i] = rspamd_regexp_get_cache_id(re); + hs_pats[i] = pat; + i++; + } + else { + g_free(pat); /* Avoid leak */ + } + } + else { + hs_ids[i] = rspamd_regexp_get_cache_id(re); + hs_pats[i] = pat; + i++; + hs_free_database(test_db); + } + } + /* Adjust real re number */ + n = i; + +#define CLEANUP_ALLOCATED(is_err) \ + do { \ + g_free(hs_flags); \ + g_free(hs_ids); \ + for (guint j = 0; j < i; j++) { \ + g_free(hs_pats[j]); \ + } \ + g_free(hs_pats); \ + g_free(hs_exts); \ + if (is_err) { \ + close(fd); \ + unlink(path); \ + if (hs_errors) hs_free_compile_error(hs_errors); \ + } \ + } while (0) + + if (n > 0) { + /* Create the hs tree */ + hs_errors = NULL; + if (hs_compile_ext_multi((const char **) hs_pats, + hs_flags, + hs_ids, + hs_exts, + n, + HS_MODE_BLOCK, + &cache->plt, + &test_db, + &hs_errors) != HS_SUCCESS) { + + err = g_error_new(rspamd_re_cache_quark(), EINVAL, + "cannot create tree of regexp when processing '%s': %s", + hs_pats[hs_errors->expression], hs_errors->message); + CLEANUP_ALLOCATED(true); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + + return; + } + + if (hs_serialize_database(test_db, &hs_serialized, + &serialized_len) != HS_SUCCESS) { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "cannot serialize tree of regexp for %s", + re_class->hash); + + CLEANUP_ALLOCATED(true); + hs_free_database(test_db); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + hs_free_database(test_db); + + /* + * Magic - 8 bytes + * Platform - sizeof (platform) + * n - number of regexps + * n * <regexp ids> + * n * <regexp flags> + * crc - 8 bytes checksum + * <hyperscan blob> + */ + 
rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe); + /* IDs -> Flags -> Hs blob */ + rspamd_cryptobox_fast_hash_update(&crc_st, + hs_ids, sizeof(*hs_ids) * n); + rspamd_cryptobox_fast_hash_update(&crc_st, + hs_flags, sizeof(*hs_flags) * n); + rspamd_cryptobox_fast_hash_update(&crc_st, + hs_serialized, serialized_len); + crc = rspamd_cryptobox_fast_hash_final(&crc_st); + + + iov[0].iov_base = (void *) rspamd_hs_magic; + iov[0].iov_len = RSPAMD_HS_MAGIC_LEN; + iov[1].iov_base = &cache->plt; + iov[1].iov_len = sizeof(cache->plt); + iov[2].iov_base = &n; + iov[2].iov_len = sizeof(n); + iov[3].iov_base = hs_ids; + iov[3].iov_len = sizeof(*hs_ids) * n; + iov[4].iov_base = hs_flags; + iov[4].iov_len = sizeof(*hs_flags) * n; + iov[5].iov_base = &crc; + iov[5].iov_len = sizeof(crc); + iov[6].iov_base = hs_serialized; + iov[6].iov_len = serialized_len; + + if (writev(fd, iov, G_N_ELEMENTS(iov)) == -1) { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "cannot serialize tree of regexp to %s: %s", + path, strerror(errno)); + + CLEANUP_ALLOCATED(true); + g_free(hs_serialized); + + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + if (re_class->type_len > 0) { + msg_info_re_cache( + "compiled class %s(%*s) to cache %6s, %d/%d regexps", + rspamd_re_cache_type_to_string(re_class->type), + (gint) re_class->type_len - 1, + re_class->type_data, + re_class->hash, + n, + (gint) g_hash_table_size(re_class->re)); + } + else { + msg_info_re_cache( + "compiled class %s to cache %6s, %d/%d regexps", + rspamd_re_cache_type_to_string(re_class->type), + re_class->hash, + n, + (gint) g_hash_table_size(re_class->re)); + } + + cbdata->total += n; + CLEANUP_ALLOCATED(false); + /* The serialized blob has been written to fd and is not used again; free it here so neither the success path nor the rename failure path leaks it */ + g_free(hs_serialized); + + /* Now rename temporary file to the new .hs file */ + rspamd_snprintf(npath, sizeof(npath), "%s%c%s.hs", cbdata->cache_dir, + G_DIR_SEPARATOR, re_class->hash); + + if (rename(path, npath) == -1) { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "cannot rename %s to %s: %s", + 
path, npath, strerror(errno)); + unlink(path); + close(fd); + + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + return; + } + + close(fd); + } + else { + err = g_error_new(rspamd_re_cache_quark(), + errno, + "no suitable regular expressions %s (%d original): " + "remove temporary file %s", + rspamd_re_cache_type_to_string(re_class->type), + (gint) g_hash_table_size(re_class->re), + path); + + CLEANUP_ALLOCATED(true); + rspamd_re_cache_compile_err(EV_A_ w, err, cbdata, false); + + return; + } + + /* Continue process */ + ev_timer_again(EV_A_ w); +} + +#endif + +gint rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache *cache, + const char *cache_dir, + gdouble max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(guint ncompiled, GError *err, void *cbd), + void *cbd) +{ + g_assert(cache != NULL); + g_assert(cache_dir != NULL); + +#ifndef WITH_HYPERSCAN + return -1; +#else + static ev_timer *timer; + static const ev_tstamp timer_interval = 0.1; + struct rspamd_re_cache_hs_compile_cbdata *cbdata; + + cbdata = g_malloc0(sizeof(*cbdata)); + g_hash_table_iter_init(&cbdata->it, cache->re_classes); + cbdata->cache = cache; + cbdata->cache_dir = cache_dir; + cbdata->cb = cb; + cbdata->cbd = cbd; + cbdata->max_time = max_time; + cbdata->silent = silent; + cbdata->total = 0; + timer = g_malloc0(sizeof(*timer)); + timer->data = (void *) cbdata; /* static */ + + ev_timer_init(timer, rspamd_re_cache_compile_timer_cb, + timer_interval, timer_interval); + ev_timer_start(event_loop, timer); + + return 0; +#endif +} + +gboolean +rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, + const char *path, gboolean silent, gboolean try_load, GError **err) +{ + g_assert(cache != NULL); + g_assert(path != NULL); + +#ifndef WITH_HYPERSCAN + return FALSE; +#else + gint fd, n, ret; + guchar magicbuf[RSPAMD_HS_MAGIC_LEN]; + const guchar *mb; + GHashTableIter it; + gpointer k, v; + struct rspamd_re_class *re_class; + gsize len; + const gchar 
*hash_pos; + hs_platform_info_t test_plt; + hs_database_t *test_db = NULL; + guchar *map, *p, *end; + rspamd_cryptobox_fast_hash_state_t crc_st; + guint64 crc, valid_crc; + + len = strlen(path); + + /* The name must hold the class hash plus the ".hs" suffix, otherwise hash_pos computed below would point before path */ + if (len < sizeof(re_class->hash) - 1 + 3) { + if (!silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: too short filename", + path); + } + g_set_error(err, rspamd_re_cache_quark(), 0, + "too short filename"); + + return FALSE; + } + + if (memcmp(path + len - 3, ".hs", 3) != 0) { + if (!silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: not ending with .hs", + path); + } + g_set_error(err, rspamd_re_cache_quark(), 0, + "not ending with .hs"); + return FALSE; + } + + hash_pos = path + len - 3 - (sizeof(re_class->hash) - 1); + g_hash_table_iter_init(&it, cache->re_classes); + + while (g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + + if (memcmp(hash_pos, re_class->hash, sizeof(re_class->hash) - 1) == 0) { + /* Open file and check magic */ + gssize r; + + fd = open(path, O_RDONLY); + + if (fd == -1) { + if (errno != ENOENT || !silent) { + msg_err_re_cache("cannot open hyperscan cache file %s: %s", + path, strerror(errno)); + } + g_set_error(err, rspamd_re_cache_quark(), 0, + "%s", + strerror(errno)); + return FALSE; + } + + if ((r = read(fd, magicbuf, sizeof(magicbuf))) != sizeof(magicbuf)) { + if (r == -1) { + msg_err_re_cache("cannot read magic from hyperscan " + "cache file %s: %s", + path, strerror(errno)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "cannot read magic: %s", + strerror(errno)); + } + else { + msg_err_re_cache("truncated read magic from hyperscan " + "cache file %s: %z, %z wanted", + path, r, (gsize) sizeof(magicbuf)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "truncated read magic %zd, %zd wanted", + r, (gsize) sizeof(magicbuf)); + } + + close(fd); + return FALSE; + } + + mb = rspamd_hs_magic; + + if (memcmp(magicbuf, mb, sizeof(magicbuf)) != 0) { + msg_err_re_cache("cannot open hyperscan cache 
file %s: " + "bad magic ('%*xs', '%*xs' expected)", + path, (int) RSPAMD_HS_MAGIC_LEN, magicbuf, + (int) RSPAMD_HS_MAGIC_LEN, mb); + + close(fd); + g_set_error(err, rspamd_re_cache_quark(), 0, "invalid magic"); + return FALSE; + } + + if ((r = read(fd, &test_plt, sizeof(test_plt))) != sizeof(test_plt)) { + if (r == -1) { + msg_err_re_cache("cannot read platform data from hyperscan " + "cache file %s: %s", + path, strerror(errno)); + } + else { + msg_err_re_cache("truncated read platform data from hyperscan " + "cache file %s: %z, %z wanted", + path, r, (gsize) sizeof(magicbuf)); + } + + g_set_error(err, rspamd_re_cache_quark(), 0, + "cannot read platform data: %s", strerror(errno)); + + close(fd); + return FALSE; + } + + if (test_plt.cpu_features != cache->plt.cpu_features) { + msg_err_re_cache("cannot open hyperscan cache file %s: " + "compiled for a different platform", + path); + g_set_error(err, rspamd_re_cache_quark(), 0, + "compiled for a different platform"); + + close(fd); + return FALSE; + } + + close(fd); + + if (try_load) { + map = rspamd_file_xmap(path, PROT_READ, &len, TRUE); + + if (map == NULL) { + msg_err_re_cache("cannot mmap hyperscan cache file %s: " + "%s", + path, strerror(errno)); + g_set_error(err, rspamd_re_cache_quark(), 0, + "mmap error: %s", strerror(errno)); + return FALSE; + } + + p = map + RSPAMD_HS_MAGIC_LEN + sizeof(test_plt); + end = map + len; + memcpy(&n, p, sizeof(n)); + p += sizeof(gint); + + if (n <= 0 || 2 * n * sizeof(gint) + /* IDs + flags */ + sizeof(guint64) + /* crc */ + RSPAMD_HS_MAGIC_LEN + /* header */ + sizeof(cache->plt) > + len) { + /* Some wrong amount of regexps */ + msg_err_re_cache("bad number of expressions in %s: %d", + path, n); + g_set_error(err, rspamd_re_cache_quark(), 0, + "bad number of expressions: %d", n); + munmap(map, len); + return FALSE; + } + + /* + * Magic - 8 bytes + * Platform - sizeof (platform) + * n - number of regexps + * n * <regexp ids> + * n * <regexp flags> + * crc - 8 bytes checksum + 
* <hyperscan blob> + */ + + memcpy(&crc, p + n * 2 * sizeof(gint), sizeof(crc)); + rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe); + /* IDs */ + rspamd_cryptobox_fast_hash_update(&crc_st, p, n * sizeof(gint)); + /* Flags */ + rspamd_cryptobox_fast_hash_update(&crc_st, p + n * sizeof(gint), + n * sizeof(gint)); + /* HS database */ + p += n * sizeof(gint) * 2 + sizeof(guint64); + rspamd_cryptobox_fast_hash_update(&crc_st, p, end - p); + valid_crc = rspamd_cryptobox_fast_hash_final(&crc_st); + + if (crc != valid_crc) { + msg_warn_re_cache("outdated or invalid hs database in %s: " + "crc read %xL, crc expected %xL", + path, crc, valid_crc); + g_set_error(err, rspamd_re_cache_quark(), 0, + "outdated or invalid hs database, crc check failure"); + munmap(map, len); + + return FALSE; + } + + if ((ret = hs_deserialize_database(p, end - p, &test_db)) != HS_SUCCESS) { + msg_err_re_cache("bad hs database in %s: %d", path, ret); + g_set_error(err, rspamd_re_cache_quark(), 0, + "deserialize error: %d", ret); + munmap(map, len); + + return FALSE; + } + + hs_free_database(test_db); + munmap(map, len); + } + /* XXX: add crc check */ + + return TRUE; + } + } + + if (!silent) { + msg_warn_re_cache("unknown hyperscan cache file %s", path); + } + + g_set_error(err, rspamd_re_cache_quark(), 0, + "unknown hyperscan file"); + + return FALSE; +#endif +} + + +enum rspamd_hyperscan_status +rspamd_re_cache_load_hyperscan(struct rspamd_re_cache *cache, + const char *cache_dir, bool try_load) +{ + g_assert(cache != NULL); + g_assert(cache_dir != NULL); + +#ifndef WITH_HYPERSCAN + return RSPAMD_HYPERSCAN_UNSUPPORTED; +#else + gchar path[PATH_MAX]; + gint fd, i, n, *hs_ids = NULL, *hs_flags = NULL, total = 0, ret; + GHashTableIter it; + gpointer k, v; + guint8 *map, *p; + struct rspamd_re_class *re_class; + struct rspamd_re_cache_elt *elt; + struct stat st; + gboolean has_valid = FALSE, all_valid = FALSE; + + g_hash_table_iter_init(&it, cache->re_classes); + + while 
(g_hash_table_iter_next(&it, &k, &v)) { + re_class = v; + rspamd_snprintf(path, sizeof(path), "%s%c%s.hs", cache_dir, + G_DIR_SEPARATOR, re_class->hash); + + if (rspamd_re_cache_is_valid_hyperscan_file(cache, path, try_load, FALSE, NULL)) { + msg_debug_re_cache("load hyperscan database from '%s'", + re_class->hash); + + fd = open(path, O_RDONLY); + + /* Read number of regexps */ + g_assert(fd != -1); + fstat(fd, &st); + + map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + + if (map == MAP_FAILED) { + if (!try_load) { + msg_err_re_cache("cannot mmap %s: %s", path, strerror(errno)); + } + else { + msg_debug_re_cache("cannot mmap %s: %s", path, strerror(errno)); + } + + close(fd); + all_valid = FALSE; + continue; + } + + close(fd); + p = map + RSPAMD_HS_MAGIC_LEN + sizeof(cache->plt); + n = *(gint *) p; + + if (n <= 0 || 2 * n * sizeof(gint) + /* IDs + flags */ + sizeof(guint64) + /* crc */ + RSPAMD_HS_MAGIC_LEN + /* header */ + sizeof(cache->plt) > + (gsize) st.st_size) { + /* Some wrong amount of regexps */ + if (!try_load) { + msg_err_re_cache("bad number of expressions in %s: %d", + path, n); + } + else { + msg_debug_re_cache("bad number of expressions in %s: %d", + path, n); + } + + munmap(map, st.st_size); + all_valid = FALSE; + continue; + } + + total += n; + p += sizeof(n); + hs_ids = g_malloc(n * sizeof(*hs_ids)); + memcpy(hs_ids, p, n * sizeof(*hs_ids)); + p += n * sizeof(*hs_ids); + hs_flags = g_malloc(n * sizeof(*hs_flags)); + memcpy(hs_flags, p, n * sizeof(*hs_flags)); + + /* Skip crc */ + p += n * sizeof(*hs_ids) + sizeof(guint64); + + /* Cleanup */ + if (re_class->hs_scratch != NULL) { + hs_free_scratch(re_class->hs_scratch); + } + + if (re_class->hs_db != NULL) { + rspamd_hyperscan_free(re_class->hs_db, false); + } + + if (re_class->hs_ids) { + g_free(re_class->hs_ids); + } + + re_class->hs_ids = NULL; + re_class->hs_scratch = NULL; + re_class->hs_db = NULL; + munmap(map, st.st_size); + + re_class->hs_db = rspamd_hyperscan_maybe_load(path, p 
- map); + if (re_class->hs_db == NULL) { + if (!try_load) { + msg_err_re_cache("bad hs database in %s", path); + } + else { + msg_debug_re_cache("bad hs database in %s", path); + } + g_free(hs_ids); + g_free(hs_flags); + + re_class->hs_ids = NULL; + re_class->hs_scratch = NULL; + re_class->hs_db = NULL; + all_valid = FALSE; + + continue; + } + + if ((ret = hs_alloc_scratch(rspamd_hyperscan_get_database(re_class->hs_db), + &re_class->hs_scratch)) != HS_SUCCESS) { + if (!try_load) { + msg_err_re_cache("bad hs database in %s; error code: %d", path, ret); + } + else { + msg_debug_re_cache("bad hs database in %s; error code: %d", path, ret); + } + g_free(hs_ids); + g_free(hs_flags); + + rspamd_hyperscan_free(re_class->hs_db, true); + re_class->hs_ids = NULL; + re_class->hs_scratch = NULL; + re_class->hs_db = NULL; + all_valid = FALSE; + + continue; + } + + /* + * Now find hyperscan elts that are successfully compiled and + * specify that they should be matched using hyperscan + */ + for (i = 0; i < n; i++) { + g_assert((gint) cache->re->len > hs_ids[i] && hs_ids[i] >= 0); + elt = g_ptr_array_index(cache->re, hs_ids[i]); + + if (hs_flags[i] & HS_FLAG_PREFILTER) { + elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN_PRE; + } + else { + elt->match_type = RSPAMD_RE_CACHE_HYPERSCAN; + } + } + + re_class->hs_ids = hs_ids; + g_free(hs_flags); + re_class->nhs = n; + + if (!has_valid) { + has_valid = TRUE; + all_valid = TRUE; + } + } + else { + if (!try_load) { + msg_err_re_cache("invalid hyperscan hash file '%s'", + path); + } + else { + msg_debug_re_cache("invalid hyperscan hash file '%s'", + path); + } + all_valid = FALSE; + continue; + } + } + + if (has_valid) { + if (all_valid) { + msg_info_re_cache("full hyperscan database of %d regexps has been loaded", total); + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_FULL; + } + else { + msg_info_re_cache("partial hyperscan database of %d regexps has been loaded", total); + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOADED_PARTIAL; + 
} + } + else { + msg_info_re_cache("hyperscan database has NOT been loaded; no valid expressions"); + cache->hyperscan_loaded = RSPAMD_HYPERSCAN_LOAD_ERROR; + } + + + return cache->hyperscan_loaded; +#endif +} + +void rspamd_re_cache_add_selector(struct rspamd_re_cache *cache, + const gchar *sname, + gint ref) +{ + khiter_t k; + + k = kh_get(lua_selectors_hash, cache->selectors, (gchar *) sname); + + if (k == kh_end(cache->selectors)) { + gchar *cpy = g_strdup(sname); + gint res; + + k = kh_put(lua_selectors_hash, cache->selectors, cpy, &res); + + kh_value(cache->selectors, k) = ref; + } + else { + msg_warn_re_cache("replacing selector with name %s", sname); + + if (cache->L) { + luaL_unref(cache->L, LUA_REGISTRYINDEX, kh_value(cache->selectors, k)); + } + + kh_value(cache->selectors, k) = ref; + } +} diff --git a/src/libserver/re_cache.h b/src/libserver/re_cache.h new file mode 100644 index 0000000..d6449a9 --- /dev/null +++ b/src/libserver/re_cache.h @@ -0,0 +1,212 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RSPAMD_RE_CACHE_H +#define RSPAMD_RE_CACHE_H + +#include "config.h" +#include "libutil/regexp.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_re_cache; +struct rspamd_re_runtime; +struct rspamd_task; +struct rspamd_config; + +enum rspamd_re_type { + RSPAMD_RE_HEADER, + RSPAMD_RE_RAWHEADER, + RSPAMD_RE_ALLHEADER, + RSPAMD_RE_MIMEHEADER, + RSPAMD_RE_MIME, + RSPAMD_RE_RAWMIME, + RSPAMD_RE_URL, + RSPAMD_RE_EMAIL, + RSPAMD_RE_BODY, /* full in SA */ + RSPAMD_RE_SABODY, /* body in SA */ + RSPAMD_RE_SARAWBODY, /* rawbody in SA */ + RSPAMD_RE_WORDS, /* normalized words */ + RSPAMD_RE_RAWWORDS, /* raw words */ + RSPAMD_RE_STEMWORDS, /* stemmed words */ + RSPAMD_RE_SELECTOR, /* use lua selector to process regexp */ + RSPAMD_RE_MAX +}; + +struct rspamd_re_cache_stat { + guint64 bytes_scanned; + guint64 bytes_scanned_pcre; + guint regexp_checked; + guint regexp_matched; + guint regexp_total; + guint regexp_fast_cached; +}; + +/** + * Initialize re_cache persistent structure + */ +struct rspamd_re_cache *rspamd_re_cache_new(void); + +/** + * Add the existing regexp to the cache + * @param cache cache object + * @param re regexp object + * @param type type of object + * @param type_data associated data with the type (e.g. 
header name) + * @param datalen associated data length + * @param lua_cbref optional lua callback reference for matching purposes + */ +rspamd_regexp_t * +rspamd_re_cache_add(struct rspamd_re_cache *cache, rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, gsize datalen, + gint lua_cbref); + +/** + * Replace regexp in the cache with another regexp + * @param cache cache object + * @param what re to replace + * @param with regexp object to replace the origin + */ +void rspamd_re_cache_replace(struct rspamd_re_cache *cache, + rspamd_regexp_t *what, + rspamd_regexp_t *with); + +/** + * Initialize and optimize re cache structure + */ +void rspamd_re_cache_init(struct rspamd_re_cache *cache, + struct rspamd_config *cfg); + +enum rspamd_hyperscan_status { + RSPAMD_HYPERSCAN_UNKNOWN = 0, + RSPAMD_HYPERSCAN_UNSUPPORTED, + RSPAMD_HYPERSCAN_LOADED_PARTIAL, + RSPAMD_HYPERSCAN_LOADED_FULL, + RSPAMD_HYPERSCAN_LOAD_ERROR, +}; + +/** + * Returns true when hyperscan is loaded + * @param cache + * @return + */ +enum rspamd_hyperscan_status rspamd_re_cache_is_hs_loaded(struct rspamd_re_cache *cache); + +/** + * Get runtime data for a cache + */ +struct rspamd_re_runtime *rspamd_re_cache_runtime_new(struct rspamd_re_cache *cache); + +/** + * Get runtime statistics + */ +const struct rspamd_re_cache_stat * +rspamd_re_cache_get_stat(struct rspamd_re_runtime *rt); + +/** + * Process regexp runtime and return the result for a specific regexp + * @param task task object + * @param rt cache runtime object + * @param re regexp object + * @param type type of object + * @param type_data associated data with the type (e.g. 
header name) + * @param datalen associated data length + * @param is_strong use case sensitive match when looking for headers + */ +gint rspamd_re_cache_process(struct rspamd_task *task, + rspamd_regexp_t *re, + enum rspamd_re_type type, + gconstpointer type_data, + gsize datalen, + gboolean is_strong); + +int rspamd_re_cache_process_ffi(void *ptask, + void *pre, + int type, + void *type_data, + int is_strong); + +/** + * Destroy runtime data + */ +void rspamd_re_cache_runtime_destroy(struct rspamd_re_runtime *rt); + +/** + * Unref re cache + */ +void rspamd_re_cache_unref(struct rspamd_re_cache *cache); + +/** + * Retain reference to re cache + */ +struct rspamd_re_cache *rspamd_re_cache_ref(struct rspamd_re_cache *cache); + +/** + * Set limit for all regular expressions in the cache, returns previous limit + */ +guint rspamd_re_cache_set_limit(struct rspamd_re_cache *cache, guint limit); + +/** + * Convert re type to a human readable string (constant one) + */ +const gchar *rspamd_re_cache_type_to_string(enum rspamd_re_type type); + +/** + * Convert re type string to the type enum + */ +enum rspamd_re_type rspamd_re_cache_type_from_string(const char *str); + +struct ev_loop; +/** + * Compile expressions to the hyperscan tree and store in the `cache_dir` + */ +gint rspamd_re_cache_compile_hyperscan(struct rspamd_re_cache *cache, + const char *cache_dir, + gdouble max_time, + gboolean silent, + struct ev_loop *event_loop, + void (*cb)(guint ncompiled, GError *err, void *cbd), + void *cbd); + +/** + * Returns TRUE if the specified file is valid hyperscan cache + */ +gboolean rspamd_re_cache_is_valid_hyperscan_file(struct rspamd_re_cache *cache, + const char *path, + gboolean silent, + gboolean try_load, + GError **err); + +/** + * Loads all hyperscan regexps precompiled + */ +enum rspamd_hyperscan_status rspamd_re_cache_load_hyperscan( + struct rspamd_re_cache *cache, + const char *cache_dir, bool try_load); + +/** + * Registers lua selector in the cache + */ +void 
rspamd_re_cache_add_selector(struct rspamd_re_cache *cache, + const gchar *sname, gint ref); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/redis_pool.cxx b/src/libserver/redis_pool.cxx new file mode 100644 index 0000000..9c2d6cf --- /dev/null +++ b/src/libserver/redis_pool.cxx @@ -0,0 +1,663 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "contrib/libev/ev.h" +#include "redis_pool.h" +#include "cfg_file.h" +#include "contrib/hiredis/hiredis.h" +#include "contrib/hiredis/async.h" +#include "contrib/hiredis/adapters/libev.h" +#include "cryptobox.h" +#include "logger.h" +#include "contrib/ankerl/unordered_dense.h" + +#include <list> +#include <unordered_map> + +namespace rspamd { +class redis_pool_elt; +class redis_pool; + +#define msg_debug_rpool(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_redis_pool_log_id, "redis_pool", conn->tag, \ + __FUNCTION__, \ + __VA_ARGS__) + +INIT_LOG_MODULE(redis_pool) + +enum class rspamd_redis_pool_connection_state : std::uint8_t { + RSPAMD_REDIS_POOL_CONN_INACTIVE = 0, + RSPAMD_REDIS_POOL_CONN_ACTIVE, + RSPAMD_REDIS_POOL_CONN_FINALISING +}; + +struct redis_pool_connection { + using redis_pool_connection_ptr = std::unique_ptr<redis_pool_connection>; + using conn_iter_t = std::list<redis_pool_connection_ptr>::iterator; + struct redisAsyncContext *ctx; + redis_pool_elt *elt; + redis_pool *pool; + conn_iter_t elt_pos; + ev_timer timeout; + gchar tag[MEMPOOL_UID_LEN]; + rspamd_redis_pool_connection_state state; + + auto schedule_timeout() -> void; + ~redis_pool_connection(); + + explicit redis_pool_connection(redis_pool *_pool, + redis_pool_elt *_elt, + const std::string &db, + const std::string &username, + const std::string &password, + struct redisAsyncContext *_ctx); + +private: + static auto redis_conn_timeout_cb(EV_P_ ev_timer *w, int revents) -> void; + static auto redis_quit_cb(redisAsyncContext *c, void *r, void *priv) -> void; + static auto redis_on_disconnect(const struct redisAsyncContext *ac, int status) -> auto; +}; + + +using redis_pool_key_t = std::uint64_t; +class redis_pool; + +class redis_pool_elt { + using redis_pool_connection_ptr = std::unique_ptr<redis_pool_connection>; + redis_pool *pool; + /* + * These lists owns connections, so if an element is removed from both + * lists, it is destructed + */ + std::list<redis_pool_connection_ptr> active; + std::list<redis_pool_connection_ptr> inactive; + std::list<redis_pool_connection_ptr> terminating; + std::string ip; + std::string db; + std::string username; + std::string password; + int port; + redis_pool_key_t key; + bool is_unix; + +public: + /* Disable copy */ + redis_pool_elt() = delete; + redis_pool_elt(const redis_pool_elt &) = delete; + /* Enable move */ + redis_pool_elt(redis_pool_elt &&other) = 
default; + + explicit redis_pool_elt(redis_pool *_pool, + const gchar *_db, const gchar *_username, + const gchar *_password, + const char *_ip, int _port) + : pool(_pool), ip(_ip), port(_port), + key(redis_pool_elt::make_key(_db, _username, _password, _ip, _port)) + { + is_unix = ip[0] == '.' || ip[0] == '/'; + + if (_db) { + db = _db; + } + if (_username) { + username = _username; + } + if (_password) { + password = _password; + } + } + + auto new_connection() -> redisAsyncContext *; + + auto release_connection(const redis_pool_connection *conn) -> void + { + switch (conn->state) { + case rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE: + active.erase(conn->elt_pos); + break; + case rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_INACTIVE: + inactive.erase(conn->elt_pos); + break; + case rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_FINALISING: + terminating.erase(conn->elt_pos); + break; + } + } + + auto move_to_inactive(redis_pool_connection *conn) -> void + { + inactive.splice(std::end(inactive), active, conn->elt_pos); + conn->elt_pos = std::prev(std::end(inactive)); + } + + auto move_to_terminating(redis_pool_connection *conn) -> void + { + terminating.splice(std::end(terminating), inactive, conn->elt_pos); + conn->elt_pos = std::prev(std::end(terminating)); + } + + inline static auto make_key(const gchar *db, const gchar *username, + const gchar *password, const char *ip, int port) -> redis_pool_key_t + { + rspamd_cryptobox_fast_hash_state_t st; + + rspamd_cryptobox_fast_hash_init(&st, rspamd_hash_seed()); + + if (db) { + rspamd_cryptobox_fast_hash_update(&st, db, strlen(db)); + } + if (username) { + rspamd_cryptobox_fast_hash_update(&st, username, strlen(username)); + } + if (password) { + rspamd_cryptobox_fast_hash_update(&st, password, strlen(password)); + } + + rspamd_cryptobox_fast_hash_update(&st, ip, strlen(ip)); + rspamd_cryptobox_fast_hash_update(&st, &port, sizeof(port)); + + return 
rspamd_cryptobox_fast_hash_final(&st); + } + + auto num_active() const -> auto + { + return active.size(); + } + + ~redis_pool_elt() + { + rspamd_explicit_memzero(password.data(), password.size()); + } + +private: + auto redis_async_new() -> redisAsyncContext * + { + struct redisAsyncContext *ctx; + + if (is_unix) { + ctx = redisAsyncConnectUnix(ip.c_str()); + } + else { + ctx = redisAsyncConnect(ip.c_str(), port); + } + + if (ctx && ctx->err != REDIS_OK) { + msg_err("cannot connect to redis %s (port %d): %s", ip.c_str(), port, + ctx->errstr); + redisAsyncFree(ctx); + + return nullptr; + } + + return ctx; + } +}; + +class redis_pool final { + static constexpr const double default_timeout = 10.0; + static constexpr const unsigned default_max_conns = 100; + + /* We want to have references integrity */ + ankerl::unordered_dense::map<redisAsyncContext *, + redis_pool_connection *> + conns_by_ctx; + /* + * We store a pointer to the element in each connection, so this has to be + * a buckets map with pointers/references stability guarantees. 
+ */ + std::unordered_map<redis_pool_key_t, redis_pool_elt> elts_by_key; + bool wanna_die = false; /* Hiredis is 'clever' so we can call ourselves from destructor */ +public: + double timeout = default_timeout; + unsigned max_conns = default_max_conns; + struct ev_loop *event_loop; + struct rspamd_config *cfg; + +public: + explicit redis_pool() + : event_loop(nullptr), cfg(nullptr) + { + conns_by_ctx.reserve(max_conns); + } + + /* Legacy stuff */ + auto do_config(struct ev_loop *_loop, struct rspamd_config *_cfg) -> void + { + event_loop = _loop; + cfg = _cfg; + } + + auto new_connection(const gchar *db, const gchar *username, + const gchar *password, const char *ip, int port) -> redisAsyncContext *; + + auto release_connection(redisAsyncContext *ctx, + enum rspamd_redis_pool_release_type how) -> void; + + auto unregister_context(redisAsyncContext *ctx) -> void + { + conns_by_ctx.erase(ctx); + } + + auto register_context(redisAsyncContext *ctx, redis_pool_connection *conn) + { + conns_by_ctx.emplace(ctx, conn); + } + + /* Hack to prevent Redis callbacks to be executed */ + auto prepare_to_die() -> void + { + wanna_die = true; + } + + ~redis_pool() + { + } +}; + + +redis_pool_connection::~redis_pool_connection() +{ + const auto *conn = this; /* For debug */ + + if (state == rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE) { + msg_debug_rpool("active connection destructed: %p", ctx); + + if (ctx) { + pool->unregister_context(ctx); + + if (!(ctx->c.flags & REDIS_FREEING)) { + auto *ac = ctx; + ctx = nullptr; + ac->onDisconnect = nullptr; + redisAsyncFree(ac); + } + } + } + else { + msg_debug_rpool("inactive connection destructed: %p", ctx); + + ev_timer_stop(pool->event_loop, &timeout); + if (ctx) { + pool->unregister_context(ctx); + + if (!(ctx->c.flags & REDIS_FREEING)) { + auto *ac = ctx; + /* To prevent on_disconnect here */ + ctx = nullptr; + ac->onDisconnect = nullptr; + redisAsyncFree(ac); + } + } + } +} + +auto 
redis_pool_connection::redis_quit_cb(redisAsyncContext *c, void *r, void *priv) -> void +{ + struct redis_pool_connection *conn = + (struct redis_pool_connection *) priv; + + msg_debug_rpool("quit command reply for the connection %p", + conn->ctx); + /* + * The connection will be freed by hiredis itself as we are here merely after + * quit command has succeeded and we have timer being set already. + * The problem is that when this callback is called, our connection is likely + * dead, so probably even on_disconnect callback has been already called... + * + * Hence, the connection might already be freed, so even (conn) pointer may be + * inaccessible. + * + * TODO: Use refcounts to prevent this stuff to happen, the problem is how + * to handle Redis timeout on `quit` command in fact... The good thing is that + * it will not likely happen. + */ +} + +/* + * Called for inactive connections that due to be removed + */ +auto redis_pool_connection::redis_conn_timeout_cb(EV_P_ ev_timer *w, int revents) -> void +{ + auto *conn = (struct redis_pool_connection *) w->data; + + g_assert(conn->state != rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE); + + if (conn->state == rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_INACTIVE) { + msg_debug_rpool("scheduled soft removal of connection %p", + conn->ctx); + conn->state = rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_FINALISING; + ev_timer_again(EV_A_ w); + redisAsyncCommand(conn->ctx, redis_pool_connection::redis_quit_cb, conn, "QUIT"); + conn->elt->move_to_terminating(conn); + } + else { + /* Finalising by timeout */ + ev_timer_stop(EV_A_ w); + msg_debug_rpool("final removal of connection %p, refcount: %d", + conn->ctx); + + /* Erasure of shared pointer will cause it to be removed */ + conn->elt->release_connection(conn); + } +} + +auto redis_pool_connection::redis_on_disconnect(const struct redisAsyncContext *ac, int status) -> auto +{ + auto *conn = (struct redis_pool_connection *) 
ac->data; + + /* + * Here, we know that redis itself will free this connection + * so, we need to do something very clever about it + */ + if (conn->state != rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE) { + /* Do nothing for active connections as it is already handled somewhere */ + if (conn->ctx) { + msg_debug_rpool("inactive connection terminated: %s", + conn->ctx->errstr); + } + + /* Erasure of shared pointer will cause it to be removed */ + conn->elt->release_connection(conn); + } +} + +auto redis_pool_connection::schedule_timeout() -> void +{ + const auto *conn = this; /* For debug */ + double real_timeout; + auto active_elts = elt->num_active(); + + if (active_elts > pool->max_conns) { + real_timeout = pool->timeout / 2.0; + real_timeout = rspamd_time_jitter(real_timeout, real_timeout / 4.0); + } + else { + real_timeout = pool->timeout; + real_timeout = rspamd_time_jitter(real_timeout, real_timeout / 2.0); + } + + msg_debug_rpool("scheduled connection %p cleanup in %.1f seconds", + ctx, real_timeout); + + timeout.data = this; + /* Restore in case if these fields have been modified externally */ + ctx->data = this; + redisAsyncSetDisconnectCallback(ctx, redis_pool_connection::redis_on_disconnect); + ev_timer_init(&timeout, + redis_pool_connection::redis_conn_timeout_cb, + real_timeout, real_timeout / 2.0); + ev_timer_start(pool->event_loop, &timeout); +} + + +redis_pool_connection::redis_pool_connection(redis_pool *_pool, + redis_pool_elt *_elt, + const std::string &db, + const std::string &username, + const std::string &password, + struct redisAsyncContext *_ctx) + : ctx(_ctx), elt(_elt), pool(_pool) +{ + + state = rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE; + + pool->register_context(ctx, this); + ctx->data = this; + memset(tag, 0, sizeof(tag)); + rspamd_random_hex(tag, sizeof(tag) - 1); + + redisLibevAttach(pool->event_loop, ctx); + redisAsyncSetDisconnectCallback(ctx, redis_pool_connection::redis_on_disconnect); 
+ + if (!username.empty()) { + if (!password.empty()) { + redisAsyncCommand(ctx, nullptr, nullptr, + "AUTH %s %s", username.c_str(), password.c_str()); + } + else { + msg_warn("Redis requires a password when username is supplied"); + } + } + else if (!password.empty()) { + redisAsyncCommand(ctx, nullptr, nullptr, + "AUTH %s", password.c_str()); + } + if (!db.empty()) { + redisAsyncCommand(ctx, nullptr, nullptr, + "SELECT %s", db.c_str()); + } +} + +auto redis_pool_elt::new_connection() -> redisAsyncContext * +{ + if (!inactive.empty()) { + decltype(inactive)::value_type conn; + conn.swap(inactive.back()); + inactive.pop_back(); + + g_assert(conn->state != rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE); + if (conn->ctx->err == REDIS_OK) { + /* Also check SO_ERROR */ + gint err; + socklen_t len = sizeof(gint); + + if (getsockopt(conn->ctx->c.fd, SOL_SOCKET, SO_ERROR, + (void *) &err, &len) == -1) { + err = errno; + } + + if (err != 0) { + /* + * We cannot reuse connection, so we just recursively call + * this function one more time + */ + return new_connection(); + } + else { + /* Reuse connection */ + ev_timer_stop(pool->event_loop, &conn->timeout); + conn->state = rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE; + msg_debug_rpool("reused existing connection to %s:%d: %p", + ip.c_str(), port, conn->ctx); + active.emplace_front(std::move(conn)); + active.front()->elt_pos = active.begin(); + + return active.front()->ctx; + } + } + else { + auto *nctx = redis_async_new(); + if (nctx) { + active.emplace_front(std::make_unique<redis_pool_connection>(pool, this, + db.c_str(), username.c_str(), password.c_str(), nctx)); + active.front()->elt_pos = active.begin(); + } + + return nctx; + } + } + else { + auto *nctx = redis_async_new(); + if (nctx) { + active.emplace_front(std::make_unique<redis_pool_connection>(pool, this, + db.c_str(), username.c_str(), password.c_str(), nctx)); + active.front()->elt_pos = active.begin(); + } + + return 
nctx; + } + + RSPAMD_UNREACHABLE; +} + +auto redis_pool::new_connection(const gchar *db, const gchar *username, + const gchar *password, const char *ip, int port) -> redisAsyncContext * +{ + + if (!wanna_die) { + auto key = redis_pool_elt::make_key(db, username, password, ip, port); + auto found_elt = elts_by_key.find(key); + + if (found_elt != elts_by_key.end()) { + auto &elt = found_elt->second; + + return elt.new_connection(); + } + else { + /* Need to create a pool */ + auto nelt = elts_by_key.try_emplace(key, + this, db, username, password, ip, port); + + return nelt.first->second.new_connection(); + } + } + + return nullptr; +} + +auto redis_pool::release_connection(redisAsyncContext *ctx, + enum rspamd_redis_pool_release_type how) -> void +{ + if (!wanna_die) { + auto conn_it = conns_by_ctx.find(ctx); + if (conn_it != conns_by_ctx.end()) { + auto *conn = conn_it->second; + g_assert(conn->state == rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_ACTIVE); + + if (ctx->err != REDIS_OK) { + /* We need to terminate connection forcefully */ + msg_debug_rpool("closed connection %p due to an error", conn->ctx); + } + else { + if (how == RSPAMD_REDIS_RELEASE_DEFAULT) { + /* Ensure that there are no callbacks attached to this conn */ + if (ctx->replies.head == nullptr && (ctx->c.flags & REDIS_CONNECTED)) { + /* Just move it to the inactive queue */ + conn->state = rspamd_redis_pool_connection_state::RSPAMD_REDIS_POOL_CONN_INACTIVE; + conn->elt->move_to_inactive(conn); + conn->schedule_timeout(); + msg_debug_rpool("mark connection %p inactive", conn->ctx); + + return; + } + else { + msg_debug_rpool("closed connection %p due to callbacks left", + conn->ctx); + } + } + else { + if (how == RSPAMD_REDIS_RELEASE_FATAL) { + msg_debug_rpool("closed connection %p due to an fatal termination", + conn->ctx); + } + else { + msg_debug_rpool("closed connection %p due to explicit termination", + conn->ctx); + } + } + } + + conn->elt->release_connection(conn); + } + else { 
+ msg_err("fatal internal error, connection with ctx %p is not found in the Redis pool", + ctx); + RSPAMD_UNREACHABLE; + } + } +} + +}// namespace rspamd + +void * +rspamd_redis_pool_init(void) +{ + return new rspamd::redis_pool{}; +} + +void rspamd_redis_pool_config(void *p, + struct rspamd_config *cfg, + struct ev_loop *ev_base) +{ + g_assert(p != NULL); + auto *pool = reinterpret_cast<class rspamd::redis_pool *>(p); + + pool->do_config(ev_base, cfg); +} + + +struct redisAsyncContext * +rspamd_redis_pool_connect(void *p, + const gchar *db, const gchar *username, + const gchar *password, const char *ip, int port) +{ + g_assert(p != NULL); + auto *pool = reinterpret_cast<class rspamd::redis_pool *>(p); + + return pool->new_connection(db, username, password, ip, port); +} + + +void rspamd_redis_pool_release_connection(void *p, + struct redisAsyncContext *ctx, enum rspamd_redis_pool_release_type how) +{ + g_assert(p != NULL); + g_assert(ctx != NULL); + auto *pool = reinterpret_cast<class rspamd::redis_pool *>(p); + + pool->release_connection(ctx, how); +} + + +void rspamd_redis_pool_destroy(void *p) +{ + auto *pool = reinterpret_cast<class rspamd::redis_pool *>(p); + + pool->prepare_to_die(); + delete pool; +} + +const gchar * +rspamd_redis_type_to_string(int type) +{ + const gchar *ret = "unknown"; + + switch (type) { + case REDIS_REPLY_STRING: + ret = "string"; + break; + case REDIS_REPLY_ARRAY: + ret = "array"; + break; + case REDIS_REPLY_INTEGER: + ret = "int"; + break; + case REDIS_REPLY_STATUS: + ret = "status"; + break; + case REDIS_REPLY_NIL: + ret = "nil"; + break; + case REDIS_REPLY_ERROR: + ret = "error"; + break; + default: + break; + } + + return ret; +} diff --git a/src/libserver/redis_pool.h b/src/libserver/redis_pool.h new file mode 100644 index 0000000..ecdaa0f --- /dev/null +++ b/src/libserver/redis_pool.h @@ -0,0 +1,91 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBSERVER_REDIS_POOL_H_ +#define SRC_LIBSERVER_REDIS_POOL_H_ + +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif +struct rspamd_config; +struct redisAsyncContext; +struct ev_loop; + +/** + * Creates new redis pool + * @return + */ +void *rspamd_redis_pool_init(void); + +/** + * Configure redis pool and binds it to a specific event base + * @param cfg + * @param ev_base + */ +void rspamd_redis_pool_config(void *pool, + struct rspamd_config *cfg, + struct ev_loop *ev_base); + + +/** + * Create or reuse the specific redis connection + * @param pool + * @param db + * @param username + * @param password + * @param ip + * @param port + * @return + */ +struct redisAsyncContext *rspamd_redis_pool_connect( + void *pool, + const gchar *db, const gchar *username, const gchar *password, + const char *ip, int port); + +enum rspamd_redis_pool_release_type { + RSPAMD_REDIS_RELEASE_DEFAULT = 0, + RSPAMD_REDIS_RELEASE_FATAL = 1, + RSPAMD_REDIS_RELEASE_ENFORCE +}; + +/** + * Release a connection to the pool + * @param pool + * @param ctx + */ +void rspamd_redis_pool_release_connection(void *pool, + struct redisAsyncContext *ctx, + enum rspamd_redis_pool_release_type how); + +/** + * Stops redis pool and destroys it + * @param pool + */ +void rspamd_redis_pool_destroy(void *pool); + +/** + * Missing in hiredis + * @param type + * @return + */ +const gchar *rspamd_redis_type_to_string(int type); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBSERVER_REDIS_POOL_H_ */ diff --git 
a/src/libserver/roll_history.c b/src/libserver/roll_history.c new file mode 100644 index 0000000..f567b0b --- /dev/null +++ b/src/libserver/roll_history.c @@ -0,0 +1,432 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "libmime/message.h" +#include "lua/lua_common.h" +#include "unix-std.h" +#include "cfg_file_private.h" + +static const gchar rspamd_history_magic_old[] = {'r', 's', 'h', '1'}; + +/** + * Returns new roll history + * @param pool pool for shared memory + * @return new structure + */ +struct roll_history * +rspamd_roll_history_new(rspamd_mempool_t *pool, guint max_rows, + struct rspamd_config *cfg) +{ + struct roll_history *history; + lua_State *L = cfg->lua_state; + + if (pool == NULL || max_rows == 0) { + return NULL; + } + + history = rspamd_mempool_alloc0_shared(pool, sizeof(struct roll_history)); + + /* + * Here, we check if there is any plugin that handles history, + * in this case, we disable this code completely + */ + lua_getglobal(L, "rspamd_plugins"); + if (lua_istable(L, -1)) { + lua_pushstring(L, "history"); + lua_gettable(L, -2); + + if (lua_istable(L, -1)) { + history->disabled = TRUE; + } + + lua_pop(L, 1); + } + + lua_pop(L, 1); + + if (!history->disabled) { + history->rows = rspamd_mempool_alloc0_shared(pool, + sizeof(struct roll_history_row) * max_rows); + history->nrows = max_rows; + } + + return history; +} + +struct 
history_metric_callback_data { + gchar *pos; + gint remain; +}; + +static void +roll_history_symbols_callback(gpointer key, gpointer value, void *user_data) +{ + struct history_metric_callback_data *cb = user_data; + struct rspamd_symbol_result *s = value; + guint wr; + + if (s->flags & RSPAMD_SYMBOL_RESULT_IGNORED) { + return; + } + + if (cb->remain > 0) { + wr = rspamd_snprintf(cb->pos, cb->remain, "%s, ", s->name); + cb->pos += wr; + cb->remain -= wr; + } +} + +/** + * Update roll history with data from task + * @param history roll history object + * @param task task object + */ +void rspamd_roll_history_update(struct roll_history *history, + struct rspamd_task *task) +{ + guint row_num; + struct roll_history_row *row; + struct rspamd_scan_result *metric_res; + struct history_metric_callback_data cbdata; + struct rspamd_action *action; + + if (history->disabled) { + return; + } + + /* First of all obtain check and obtain row number */ + g_atomic_int_compare_and_exchange(&history->cur_row, history->nrows, 0); +#if ((GLIB_MAJOR_VERSION == 2) && (GLIB_MINOR_VERSION > 30)) + row_num = g_atomic_int_add(&history->cur_row, 1); +#else + row_num = g_atomic_int_exchange_and_add(&history->cur_row, 1); +#endif + + if (row_num < history->nrows) { + row = &history->rows[row_num]; + g_atomic_int_set(&row->completed, FALSE); + } + else { + /* Race condition */ + history->cur_row = 0; + return; + } + + /* Add information from task to roll history */ + if (task->from_addr) { + rspamd_strlcpy(row->from_addr, + rspamd_inet_address_to_string(task->from_addr), + sizeof(row->from_addr)); + } + else { + rspamd_strlcpy(row->from_addr, "unknown", sizeof(row->from_addr)); + } + + row->timestamp = task->task_timestamp; + + /* Strings */ + if (task->message) { + rspamd_strlcpy(row->message_id, MESSAGE_FIELD(task, message_id), + sizeof(row->message_id)); + } + if (task->auth_user) { + rspamd_strlcpy(row->user, task->auth_user, sizeof(row->user)); + } + else { + row->user[0] = '\0'; + } + + 
/* Get default metric */ + metric_res = task->result; + + if (metric_res == NULL) { + row->symbols[0] = '\0'; + row->action = METRIC_ACTION_NOACTION; + } + else { + row->score = metric_res->score; + action = rspamd_check_action_metric(task, NULL, NULL); + row->action = action->action_type; + row->required_score = rspamd_task_get_required_score(task, metric_res); + cbdata.pos = row->symbols; + cbdata.remain = sizeof(row->symbols); + rspamd_task_symbol_result_foreach(task, NULL, + roll_history_symbols_callback, + &cbdata); + if (cbdata.remain > 0) { + /* Remove last whitespace and comma */ + *cbdata.pos-- = '\0'; + *cbdata.pos-- = '\0'; + *cbdata.pos = '\0'; + } + } + + row->scan_time = task->time_real_finish - task->task_timestamp; + row->len = task->msg.len; + g_atomic_int_set(&row->completed, TRUE); +} + +/** + * Load previously saved history from file + * @param history roll history object + * @param filename filename to load from + * @return TRUE if history has been loaded + */ +gboolean +rspamd_roll_history_load(struct roll_history *history, const gchar *filename) +{ + gint fd; + struct stat st; + gchar magic[sizeof(rspamd_history_magic_old)]; + ucl_object_t *top; + const ucl_object_t *cur, *elt; + struct ucl_parser *parser; + struct roll_history_row *row; + guint n, i; + + g_assert(history != NULL); + if (history->disabled) { + return TRUE; + } + + if (stat(filename, &st) == -1) { + msg_info("cannot load history from %s: %s", filename, + strerror(errno)); + return FALSE; + } + + if ((fd = open(filename, O_RDONLY)) == -1) { + msg_info("cannot load history from %s: %s", filename, + strerror(errno)); + return FALSE; + } + + /* Check for old format */ + if (read(fd, magic, sizeof(magic)) == -1) { + close(fd); + msg_info("cannot read history from %s: %s", filename, + strerror(errno)); + return FALSE; + } + + if (memcmp(magic, rspamd_history_magic_old, sizeof(magic)) == 0) { + close(fd); + msg_warn("cannot read history from old format %s, " + "it will be replaced 
after restart", + filename); + return FALSE; + } + + parser = ucl_parser_new(0); + + if (!ucl_parser_add_fd(parser, fd)) { + msg_warn("cannot parse history file %s: %s", filename, + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + close(fd); + + return FALSE; + } + + top = ucl_parser_get_object(parser); + ucl_parser_free(parser); + close(fd); + + if (top == NULL) { + msg_warn("cannot parse history file %s: no object", filename); + + return FALSE; + } + + if (ucl_object_type(top) != UCL_ARRAY) { + msg_warn("invalid object type read from: %s", filename); + ucl_object_unref(top); + + return FALSE; + } + + if (top->len > history->nrows) { + msg_warn("stored history is larger than the current one: %ud (file) vs " + "%ud (history)", + top->len, history->nrows); + n = history->nrows; + } + else if (top->len < history->nrows) { + msg_warn( + "stored history is smaller than the current one: %ud (file) vs " + "%ud (history)", + top->len, history->nrows); + n = top->len; + } + else { + n = top->len; + } + + for (i = 0; i < n; i++) { + cur = ucl_array_find_index(top, i); + + if (cur != NULL && ucl_object_type(cur) == UCL_OBJECT) { + row = &history->rows[i]; + memset(row, 0, sizeof(*row)); + + elt = ucl_object_lookup(cur, "time"); + + if (elt && ucl_object_type(elt) == UCL_FLOAT) { + row->timestamp = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(cur, "id"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + rspamd_strlcpy(row->message_id, ucl_object_tostring(elt), + sizeof(row->message_id)); + } + + elt = ucl_object_lookup(cur, "symbols"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + rspamd_strlcpy(row->symbols, ucl_object_tostring(elt), + sizeof(row->symbols)); + } + + elt = ucl_object_lookup(cur, "user"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + rspamd_strlcpy(row->user, ucl_object_tostring(elt), + sizeof(row->user)); + } + + elt = ucl_object_lookup(cur, "from"); + + if (elt && ucl_object_type(elt) == UCL_STRING) { + 
rspamd_strlcpy(row->from_addr, ucl_object_tostring(elt), + sizeof(row->from_addr)); + } + + elt = ucl_object_lookup(cur, "len"); + + if (elt && ucl_object_type(elt) == UCL_INT) { + row->len = ucl_object_toint(elt); + } + + elt = ucl_object_lookup(cur, "scan_time"); + + if (elt && ucl_object_type(elt) == UCL_FLOAT) { + row->scan_time = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(cur, "score"); + + if (elt && ucl_object_type(elt) == UCL_FLOAT) { + row->score = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(cur, "required_score"); + + if (elt && ucl_object_type(elt) == UCL_FLOAT) { + row->required_score = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(cur, "action"); + + if (elt && ucl_object_type(elt) == UCL_INT) { + row->action = ucl_object_toint(elt); + } + + row->completed = TRUE; + } + } + + ucl_object_unref(top); + + history->cur_row = n; + + return TRUE; +} + +/** + * Save history to file + * @param history roll history object + * @param filename filename to load from + * @return TRUE if history has been saved + */ +gboolean +rspamd_roll_history_save(struct roll_history *history, const gchar *filename) +{ + gint fd; + FILE *fp; + ucl_object_t *obj, *elt; + guint i; + struct roll_history_row *row; + struct ucl_emitter_functions *emitter_func; + + g_assert(history != NULL); + + if (history->disabled) { + return TRUE; + } + + if ((fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 00600)) == -1) { + msg_info("cannot save history to %s: %s", filename, strerror(errno)); + return FALSE; + } + + fp = fdopen(fd, "w"); + obj = ucl_object_typed_new(UCL_ARRAY); + + for (i = 0; i < history->nrows; i++) { + row = &history->rows[i]; + + if (!row->completed) { + continue; + } + + elt = ucl_object_typed_new(UCL_OBJECT); + + ucl_object_insert_key(elt, ucl_object_fromdouble(row->timestamp), + "time", 0, false); + ucl_object_insert_key(elt, ucl_object_fromstring(row->message_id), + "id", 0, false); + ucl_object_insert_key(elt, 
ucl_object_fromstring(row->symbols), + "symbols", 0, false); + ucl_object_insert_key(elt, ucl_object_fromstring(row->user), + "user", 0, false); + ucl_object_insert_key(elt, ucl_object_fromstring(row->from_addr), + "from", 0, false); + ucl_object_insert_key(elt, ucl_object_fromint(row->len), + "len", 0, false); + ucl_object_insert_key(elt, ucl_object_fromdouble(row->scan_time), + "scan_time", 0, false); + ucl_object_insert_key(elt, ucl_object_fromdouble(row->score), + "score", 0, false); + ucl_object_insert_key(elt, ucl_object_fromdouble(row->required_score), + "required_score", 0, false); + ucl_object_insert_key(elt, ucl_object_fromint(row->action), + "action", 0, false); + + ucl_array_append(obj, elt); + } + + emitter_func = ucl_object_emit_file_funcs(fp); + ucl_object_emit_full(obj, UCL_EMIT_JSON_COMPACT, emitter_func, NULL); + ucl_object_emit_funcs_free(emitter_func); + ucl_object_unref(obj); + + fclose(fp); + + return TRUE; +} diff --git a/src/libserver/roll_history.h b/src/libserver/roll_history.h new file mode 100644 index 0000000..62bce7f --- /dev/null +++ b/src/libserver/roll_history.h @@ -0,0 +1,98 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef ROLL_HISTORY_H_ +#define ROLL_HISTORY_H_ + +#include "config.h" +#include "mem_pool.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Roll history is a special cycled buffer for checked messages, it is designed for writing history messages + * and displaying them in webui + */ + +#define HISTORY_MAX_ID 256 +#define HISTORY_MAX_SYMBOLS 256 +#define HISTORY_MAX_USER 32 +#define HISTORY_MAX_ADDR 32 + +struct rspamd_task; +struct rspamd_config; + +struct roll_history_row { + ev_tstamp timestamp; + gchar message_id[HISTORY_MAX_ID]; + gchar symbols[HISTORY_MAX_SYMBOLS]; + gchar user[HISTORY_MAX_USER]; + gchar from_addr[HISTORY_MAX_ADDR]; + gsize len; + gdouble scan_time; + gdouble score; + gdouble required_score; + gint action; + guint completed; +}; + +struct roll_history { + struct roll_history_row *rows; + gboolean disabled; + guint nrows; + guint cur_row; +}; + +/** + * Returns new roll history + * @param pool pool for shared memory + * @return new structure + */ +struct roll_history *rspamd_roll_history_new(rspamd_mempool_t *pool, + guint max_rows, struct rspamd_config *cfg); + +/** + * Update roll history with data from task + * @param history roll history object + * @param task task object + */ +void rspamd_roll_history_update(struct roll_history *history, + struct rspamd_task *task); + +/** + * Load previously saved history from file + * @param history roll history object + * @param filename filename to load from + * @return TRUE if history has been loaded + */ +gboolean rspamd_roll_history_load(struct roll_history *history, + const gchar *filename); + +/** + * Save history to file + * @param history roll history object + * @param filename filename to load from + * @return TRUE if history has been saved + */ +gboolean rspamd_roll_history_save(struct roll_history *history, + const gchar *filename); + +#ifdef __cplusplus +} +#endif + +#endif /* ROLL_HISTORY_H_ */ diff --git a/src/libserver/rspamd_control.c b/src/libserver/rspamd_control.c new 
file mode 100644 index 0000000..69af059 --- /dev/null +++ b/src/libserver/rspamd_control.c @@ -0,0 +1,1334 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "rspamd_control.h" +#include "worker_util.h" +#include "libserver/http/http_connection.h" +#include "libserver/http/http_private.h" +#include "libutil/libev_helper.h" +#include "unix-std.h" +#include "utlist.h" + +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/resource.h> +#endif + +#ifdef WITH_HYPERSCAN +#include "hyperscan_tools.h" +#endif + +static ev_tstamp io_timeout = 30.0; +static ev_tstamp worker_io_timeout = 0.5; + +struct rspamd_control_session; + +struct rspamd_control_reply_elt { + struct rspamd_control_reply reply; + struct rspamd_io_ev ev; + struct ev_loop *event_loop; + GQuark wrk_type; + pid_t wrk_pid; + gpointer ud; + gint attached_fd; + GHashTable *pending_elts; + struct rspamd_control_reply_elt *prev, *next; +}; + +struct rspamd_control_session { + gint fd; + struct ev_loop *event_loop; + struct rspamd_main *rspamd_main; + struct rspamd_http_connection *conn; + struct rspamd_control_command cmd; + struct rspamd_control_reply_elt *replies; + rspamd_inet_addr_t *addr; + guint replies_remain; + gboolean is_reply; +}; + +static const struct rspamd_control_cmd_match { + rspamd_ftok_t name; + enum rspamd_control_type type; +} cmd_matches[] = { + {.name = { + .begin = "/stat", + .len = 
sizeof("/stat") - 1}, + .type = RSPAMD_CONTROL_STAT}, + {.name = {.begin = "/reload", .len = sizeof("/reload") - 1}, .type = RSPAMD_CONTROL_RELOAD}, + {.name = {.begin = "/reresolve", .len = sizeof("/reresolve") - 1}, .type = RSPAMD_CONTROL_RERESOLVE}, + {.name = {.begin = "/recompile", .len = sizeof("/recompile") - 1}, .type = RSPAMD_CONTROL_RECOMPILE}, + {.name = {.begin = "/fuzzystat", .len = sizeof("/fuzzystat") - 1}, .type = RSPAMD_CONTROL_FUZZY_STAT}, + {.name = {.begin = "/fuzzysync", .len = sizeof("/fuzzysync") - 1}, .type = RSPAMD_CONTROL_FUZZY_SYNC}, +}; + +static void rspamd_control_ignore_io_handler(int fd, short what, void *ud); + +static void +rspamd_control_stop_pending(struct rspamd_control_reply_elt *elt) +{ + GHashTable *htb; + /* It stops event and frees hash */ + htb = elt->pending_elts; + g_hash_table_remove(elt->pending_elts, elt); + /* Release hash reference */ + g_hash_table_unref(htb); +} + +void rspamd_control_send_error(struct rspamd_control_session *session, + gint code, const gchar *error_msg, ...) 
+{ + struct rspamd_http_message *msg; + rspamd_fstring_t *reply; + va_list args; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + + va_start(args, error_msg); + msg->status = rspamd_fstring_new(); + rspamd_vprintf_fstring(&msg->status, error_msg, args); + va_end(args); + + msg->date = time(NULL); + msg->code = code; + reply = rspamd_fstring_sized_new(msg->status->len + 16); + rspamd_printf_fstring(&reply, "{\"error\":\"%V\"}", msg->status); + rspamd_http_message_set_body_from_fstring_steal(msg, reply); + rspamd_http_connection_reset(session->conn); + rspamd_http_connection_write_message(session->conn, + msg, + NULL, + "application/json", + session, + io_timeout); +} + +static void +rspamd_control_send_ucl(struct rspamd_control_session *session, + ucl_object_t *obj) +{ + struct rspamd_http_message *msg; + rspamd_fstring_t *reply; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + msg->date = time(NULL); + msg->code = 200; + msg->status = rspamd_fstring_new_init("OK", 2); + reply = rspamd_fstring_sized_new(BUFSIZ); + rspamd_ucl_emit_fstring(obj, UCL_EMIT_JSON_COMPACT, &reply); + rspamd_http_message_set_body_from_fstring_steal(msg, reply); + rspamd_http_connection_reset(session->conn); + rspamd_http_connection_write_message(session->conn, + msg, + NULL, + "application/json", + session, + io_timeout); +} + +static void +rspamd_control_connection_close(struct rspamd_control_session *session) +{ + struct rspamd_control_reply_elt *elt, *telt; + struct rspamd_main *rspamd_main; + + rspamd_main = session->rspamd_main; + msg_info_main("finished connection from %s", + rspamd_inet_address_to_string(session->addr)); + + DL_FOREACH_SAFE(session->replies, elt, telt) + { + rspamd_control_stop_pending(elt); + } + + rspamd_inet_address_free(session->addr); + rspamd_http_connection_unref(session->conn); + close(session->fd); + g_free(session); +} + +static void +rspamd_control_write_reply(struct rspamd_control_session *session) +{ + ucl_object_t *rep, *cur, *workers; + struct 
rspamd_control_reply_elt *elt; + gchar tmpbuf[64]; + gdouble total_utime = 0, total_systime = 0; + struct ucl_parser *parser; + guint total_conns = 0; + + rep = ucl_object_typed_new(UCL_OBJECT); + workers = ucl_object_typed_new(UCL_OBJECT); + + DL_FOREACH(session->replies, elt) + { + /* Skip incompatible worker for fuzzy_stat */ + if ((session->cmd.type == RSPAMD_CONTROL_FUZZY_STAT || + session->cmd.type == RSPAMD_CONTROL_FUZZY_SYNC) && + elt->wrk_type != g_quark_from_static_string("fuzzy")) { + continue; + } + + rspamd_snprintf(tmpbuf, sizeof(tmpbuf), "%P", elt->wrk_pid); + cur = ucl_object_typed_new(UCL_OBJECT); + + ucl_object_insert_key(cur, ucl_object_fromstring(g_quark_to_string(elt->wrk_type)), "type", 0, false); + + switch (session->cmd.type) { + case RSPAMD_CONTROL_STAT: + ucl_object_insert_key(cur, ucl_object_fromint(elt->reply.reply.stat.conns), "conns", 0, false); + ucl_object_insert_key(cur, ucl_object_fromdouble(elt->reply.reply.stat.utime), "utime", 0, false); + ucl_object_insert_key(cur, ucl_object_fromdouble(elt->reply.reply.stat.systime), "systime", 0, false); + ucl_object_insert_key(cur, ucl_object_fromdouble(elt->reply.reply.stat.uptime), "uptime", 0, false); + ucl_object_insert_key(cur, ucl_object_fromint(elt->reply.reply.stat.maxrss), "maxrss", 0, false); + + total_utime += elt->reply.reply.stat.utime; + total_systime += elt->reply.reply.stat.systime; + total_conns += elt->reply.reply.stat.conns; + + break; + + case RSPAMD_CONTROL_RELOAD: + ucl_object_insert_key(cur, ucl_object_fromint(elt->reply.reply.reload.status), "status", 0, false); + break; + case RSPAMD_CONTROL_RECOMPILE: + ucl_object_insert_key(cur, ucl_object_fromint(elt->reply.reply.recompile.status), "status", 0, false); + break; + case RSPAMD_CONTROL_RERESOLVE: + ucl_object_insert_key(cur, ucl_object_fromint(elt->reply.reply.reresolve.status), "status", 0, false); + break; + case RSPAMD_CONTROL_FUZZY_STAT: + if (elt->attached_fd != -1) { + /* We have some data to parse */ + parser 
= ucl_parser_new(0); + ucl_object_insert_key(cur, + ucl_object_fromint( + elt->reply.reply.fuzzy_stat.status), + "status", + 0, + false); + + if (ucl_parser_add_fd(parser, elt->attached_fd)) { + ucl_object_insert_key(cur, ucl_parser_get_object(parser), + "data", 0, false); + ucl_parser_free(parser); + } + else { + + ucl_object_insert_key(cur, ucl_object_fromstring(ucl_parser_get_error(parser)), "error", 0, false); + + ucl_parser_free(parser); + } + + ucl_object_insert_key(cur, + ucl_object_fromlstring( + elt->reply.reply.fuzzy_stat.storage_id, + MEMPOOL_UID_LEN - 1), + "id", + 0, + false); + } + else { + ucl_object_insert_key(cur, + ucl_object_fromstring("missing file"), + "error", + 0, + false); + ucl_object_insert_key(cur, + ucl_object_fromint( + elt->reply.reply.fuzzy_stat.status), + "status", + 0, + false); + } + break; + case RSPAMD_CONTROL_FUZZY_SYNC: + ucl_object_insert_key(cur, ucl_object_fromint(elt->reply.reply.fuzzy_sync.status), "status", 0, false); + break; + default: + break; + } + + if (elt->attached_fd != -1) { + close(elt->attached_fd); + elt->attached_fd = -1; + } + + ucl_object_insert_key(workers, cur, tmpbuf, 0, true); + } + + ucl_object_insert_key(rep, workers, "workers", 0, false); + + if (session->cmd.type == RSPAMD_CONTROL_STAT) { + /* Total stats */ + cur = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(cur, ucl_object_fromint(total_conns), "conns", 0, false); + ucl_object_insert_key(cur, ucl_object_fromdouble(total_utime), "utime", 0, false); + ucl_object_insert_key(cur, ucl_object_fromdouble(total_systime), "systime", 0, false); + + ucl_object_insert_key(rep, cur, "total", 0, false); + } + + rspamd_control_send_ucl(session, rep); + ucl_object_unref(rep); +} + +static void +rspamd_control_wrk_io(gint fd, short what, gpointer ud) +{ + struct rspamd_control_reply_elt *elt = ud; + struct rspamd_control_session *session; + guchar fdspace[CMSG_SPACE(sizeof(int))]; + struct iovec iov; + struct msghdr msg; + gssize r; + + session = 
elt->ud; + elt->attached_fd = -1; + + if (what == EV_READ) { + iov.iov_base = &elt->reply; + iov.iov_len = sizeof(elt->reply); + memset(&msg, 0, sizeof(msg)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = recvmsg(fd, &msg, 0); + if (r == -1) { + msg_err("cannot read reply from the worker %P (%s): %s", + elt->wrk_pid, g_quark_to_string(elt->wrk_type), + strerror(errno)); + } + else if (r >= (gssize) sizeof(elt->reply)) { + if (msg.msg_controllen >= CMSG_LEN(sizeof(int))) { + elt->attached_fd = *(int *) CMSG_DATA(CMSG_FIRSTHDR(&msg)); + } + } + } + else { + /* Timeout waiting */ + msg_warn("timeout waiting reply from %P (%s)", + elt->wrk_pid, g_quark_to_string(elt->wrk_type)); + } + + session->replies_remain--; + rspamd_ev_watcher_stop(session->event_loop, + &elt->ev); + + if (session->replies_remain == 0) { + rspamd_control_write_reply(session); + } +} + +static void +rspamd_control_error_handler(struct rspamd_http_connection *conn, GError *err) +{ + struct rspamd_control_session *session = conn->ud; + struct rspamd_main *rspamd_main; + + rspamd_main = session->rspamd_main; + + if (!session->is_reply) { + msg_info_main("abnormally closing control connection: %e", err); + session->is_reply = TRUE; + rspamd_control_send_error(session, err->code, "%s", err->message); + } + else { + rspamd_control_connection_close(session); + } +} + +void rspamd_pending_control_free(gpointer p) +{ + struct rspamd_control_reply_elt *rep_elt = (struct rspamd_control_reply_elt *) p; + + rspamd_ev_watcher_stop(rep_elt->event_loop, &rep_elt->ev); + g_free(rep_elt); +} + +static struct rspamd_control_reply_elt * +rspamd_control_broadcast_cmd(struct rspamd_main *rspamd_main, + struct rspamd_control_command *cmd, + gint attached_fd, + rspamd_ev_cb handler, + gpointer ud, + pid_t except_pid) +{ + GHashTableIter it; + struct rspamd_worker *wrk; + struct rspamd_control_reply_elt *rep_elt, *res = NULL; + gpointer k, v; + 
struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + guchar fdspace[CMSG_SPACE(sizeof(int))]; + gssize r; + + g_hash_table_iter_init(&it, rspamd_main->workers); + + while (g_hash_table_iter_next(&it, &k, &v)) { + wrk = v; + + /* No control pipe */ + if (wrk->control_pipe[0] == -1) { + continue; + } + + if (except_pid != 0 && wrk->pid == except_pid) { + continue; + } + + /* Worker is terminating, do not bother sending stuff */ + if (wrk->state == rspamd_worker_state_terminating) { + continue; + } + + memset(&msg, 0, sizeof(msg)); + + /* Attach fd to the message */ + if (attached_fd != -1) { + memset(fdspace, 0, sizeof(fdspace)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &attached_fd, sizeof(int)); + } + + iov.iov_base = cmd; + iov.iov_len = sizeof(*cmd); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = sendmsg(wrk->control_pipe[0], &msg, 0); + + if (r == sizeof(*cmd)) { + rep_elt = g_malloc0(sizeof(*rep_elt)); + rep_elt->wrk_pid = wrk->pid; + rep_elt->wrk_type = wrk->type; + rep_elt->event_loop = rspamd_main->event_loop; + rep_elt->ud = ud; + rep_elt->pending_elts = g_hash_table_ref(wrk->control_events_pending); + rspamd_ev_watcher_init(&rep_elt->ev, + wrk->control_pipe[0], + EV_READ, handler, + rep_elt); + rspamd_ev_watcher_start(rspamd_main->event_loop, + &rep_elt->ev, worker_io_timeout); + g_hash_table_insert(wrk->control_events_pending, rep_elt, rep_elt); + + DL_APPEND(res, rep_elt); + } + else { + msg_err_main("cannot write command %d(%z) to the worker %P(%s), fd: %d: %s", + (int) cmd->type, iov.iov_len, + wrk->pid, + g_quark_to_string(wrk->type), + wrk->control_pipe[0], + strerror(errno)); + } + } + + return res; +} + +void rspamd_control_broadcast_srv_cmd(struct rspamd_main *rspamd_main, + struct rspamd_control_command *cmd, + pid_t except_pid) +{ + 
rspamd_control_broadcast_cmd(rspamd_main, cmd, -1, + rspamd_control_ignore_io_handler, NULL, except_pid); +} + +static gint +rspamd_control_finish_handler(struct rspamd_http_connection *conn, + struct rspamd_http_message *msg) +{ + struct rspamd_control_session *session = conn->ud; + rspamd_ftok_t srch; + guint i; + gboolean found = FALSE; + struct rspamd_control_reply_elt *cur; + + + if (!session->is_reply) { + if (msg->url == NULL) { + rspamd_control_connection_close(session); + + return 0; + } + + srch.begin = msg->url->str; + srch.len = msg->url->len; + + session->is_reply = TRUE; + + for (i = 0; i < G_N_ELEMENTS(cmd_matches); i++) { + if (rspamd_ftok_casecmp(&srch, &cmd_matches[i].name) == 0) { + session->cmd.type = cmd_matches[i].type; + found = TRUE; + break; + } + } + + if (!found) { + rspamd_control_send_error(session, 404, "Command not defined"); + } + else { + /* Send command to all workers */ + session->replies = rspamd_control_broadcast_cmd( + session->rspamd_main, &session->cmd, -1, + rspamd_control_wrk_io, session, 0); + + DL_FOREACH(session->replies, cur) + { + session->replies_remain++; + } + } + } + else { + rspamd_control_connection_close(session); + } + + + return 0; +} + +void rspamd_control_process_client_socket(struct rspamd_main *rspamd_main, + gint fd, rspamd_inet_addr_t *addr) +{ + struct rspamd_control_session *session; + + session = g_malloc0(sizeof(*session)); + + session->fd = fd; + session->conn = rspamd_http_connection_new_server(rspamd_main->http_ctx, + fd, + NULL, + rspamd_control_error_handler, + rspamd_control_finish_handler, + 0); + session->rspamd_main = rspamd_main; + session->addr = addr; + session->event_loop = rspamd_main->event_loop; + rspamd_http_connection_read_message(session->conn, session, + io_timeout); +} + +struct rspamd_worker_control_data { + ev_io io_ev; + struct rspamd_worker *worker; + struct ev_loop *ev_base; + struct { + rspamd_worker_control_handler handler; + gpointer ud; + } handlers[RSPAMD_CONTROL_MAX]; 
+}; + +static void +rspamd_control_default_cmd_handler(gint fd, + gint attached_fd, + struct rspamd_worker_control_data *cd, + struct rspamd_control_command *cmd) +{ + struct rspamd_control_reply rep; + gssize r; + struct rusage rusg; + struct rspamd_config *cfg; + struct rspamd_main *rspamd_main; + + memset(&rep, 0, sizeof(rep)); + rep.type = cmd->type; + rspamd_main = cd->worker->srv; + + switch (cmd->type) { + case RSPAMD_CONTROL_STAT: + if (getrusage(RUSAGE_SELF, &rusg) == -1) { + msg_err_main("cannot get rusage stats: %s", + strerror(errno)); + } + else { + rep.reply.stat.utime = tv_to_double(&rusg.ru_utime); + rep.reply.stat.systime = tv_to_double(&rusg.ru_stime); + rep.reply.stat.maxrss = rusg.ru_maxrss; + } + + rep.reply.stat.conns = cd->worker->nconns; + rep.reply.stat.uptime = rspamd_get_calendar_ticks() - cd->worker->start_time; + break; + case RSPAMD_CONTROL_RELOAD: + case RSPAMD_CONTROL_RECOMPILE: + case RSPAMD_CONTROL_HYPERSCAN_LOADED: + case RSPAMD_CONTROL_MONITORED_CHANGE: + case RSPAMD_CONTROL_FUZZY_STAT: + case RSPAMD_CONTROL_FUZZY_SYNC: + case RSPAMD_CONTROL_LOG_PIPE: + case RSPAMD_CONTROL_CHILD_CHANGE: + case RSPAMD_CONTROL_FUZZY_BLOCKED: + break; + case RSPAMD_CONTROL_RERESOLVE: + if (cd->worker->srv->cfg) { + REF_RETAIN(cd->worker->srv->cfg); + cfg = cd->worker->srv->cfg; + + if (cfg->ups_ctx) { + msg_info_config("reresolving upstreams"); + rspamd_upstream_reresolve(cfg->ups_ctx); + } + + rep.reply.reresolve.status = 0; + REF_RELEASE(cfg); + } + else { + rep.reply.reresolve.status = EINVAL; + } + break; + default: + break; + } + + r = write(fd, &rep, sizeof(rep)); + + if (r != sizeof(rep)) { + msg_err_main("cannot write reply to the control socket: %s", + strerror(errno)); + } + + if (attached_fd != -1) { + close(attached_fd); + } +} + +static void +rspamd_control_default_worker_handler(EV_P_ ev_io *w, int revents) +{ + struct rspamd_worker_control_data *cd = + (struct rspamd_worker_control_data *) w->data; + static struct 
rspamd_control_command cmd; + static struct msghdr msg; + static struct iovec iov; + static guchar fdspace[CMSG_SPACE(sizeof(int))]; + gint rfd = -1; + gssize r; + + iov.iov_base = &cmd; + iov.iov_len = sizeof(cmd); + memset(&msg, 0, sizeof(msg)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = recvmsg(w->fd, &msg, 0); + + if (r == -1) { + if (errno != EAGAIN && errno != EINTR) { + if (errno != ECONNRESET) { + /* + * In case of connection reset it means that main process + * has died, so do not pollute logs + */ + msg_err("cannot read request from the control socket: %s", + strerror(errno)); + } + ev_io_stop(cd->ev_base, &cd->io_ev); + close(w->fd); + } + } + else if (r < (gint) sizeof(cmd)) { + msg_err("short read of control command: %d of %d", (gint) r, + (gint) sizeof(cmd)); + + if (r == 0) { + ev_io_stop(cd->ev_base, &cd->io_ev); + close(w->fd); + } + } + else if ((gint) cmd.type >= 0 && cmd.type < RSPAMD_CONTROL_MAX) { + + if (msg.msg_controllen >= CMSG_LEN(sizeof(int))) { + rfd = *(int *) CMSG_DATA(CMSG_FIRSTHDR(&msg)); + } + + if (cd->handlers[cmd.type].handler) { + cd->handlers[cmd.type].handler(cd->worker->srv, + cd->worker, + w->fd, + rfd, + &cmd, + cd->handlers[cmd.type].ud); + } + else { + rspamd_control_default_cmd_handler(w->fd, rfd, cd, &cmd); + } + } + else { + msg_err("unknown command: %d", (gint) cmd.type); + } +} + +void rspamd_control_worker_add_default_cmd_handlers(struct rspamd_worker *worker, + struct ev_loop *ev_base) +{ + struct rspamd_worker_control_data *cd; + + cd = g_malloc0(sizeof(*cd)); + cd->worker = worker; + cd->ev_base = ev_base; + + cd->io_ev.data = cd; + ev_io_init(&cd->io_ev, rspamd_control_default_worker_handler, + worker->control_pipe[1], EV_READ); + ev_io_start(ev_base, &cd->io_ev); + + worker->control_data = cd; +} + +/** + * Register custom handler for a specific control command for this worker + */ +void rspamd_control_worker_add_cmd_handler(struct 
rspamd_worker *worker, + enum rspamd_control_type type, + rspamd_worker_control_handler handler, + gpointer ud) +{ + struct rspamd_worker_control_data *cd; + + g_assert(type >= 0 && type < RSPAMD_CONTROL_MAX); + g_assert(handler != NULL); + g_assert(worker->control_data != NULL); + + cd = worker->control_data; + cd->handlers[type].handler = handler; + cd->handlers[type].ud = ud; +} + +struct rspamd_srv_reply_data { + struct rspamd_worker *worker; + struct rspamd_main *srv; + gint fd; + struct rspamd_srv_reply rep; +}; + +static void +rspamd_control_ignore_io_handler(int fd, short what, void *ud) +{ + struct rspamd_control_reply_elt *elt = + (struct rspamd_control_reply_elt *) ud; + + struct rspamd_control_reply rep; + + /* At this point we just ignore replies from the workers */ + if (read(fd, &rep, sizeof(rep)) == -1) { + msg_debug("cannot read %d bytes: %s", (int) sizeof(rep), strerror(errno)); + } + rspamd_control_stop_pending(elt); +} + +static void +rspamd_control_log_pipe_io_handler(int fd, short what, void *ud) +{ + struct rspamd_control_reply_elt *elt = + (struct rspamd_control_reply_elt *) ud; + struct rspamd_control_reply rep; + + /* At this point we just ignore replies from the workers */ + (void) !read(fd, &rep, sizeof(rep)); + rspamd_control_stop_pending(elt); +} + +static void +rspamd_control_handle_on_fork(struct rspamd_srv_command *cmd, + struct rspamd_main *srv) +{ + struct rspamd_worker *parent, *child; + + parent = g_hash_table_lookup(srv->workers, + GSIZE_TO_POINTER(cmd->cmd.on_fork.ppid)); + + if (parent == NULL) { + msg_err("cannot find parent for a forked process %P (%P child)", + cmd->cmd.on_fork.ppid, cmd->cmd.on_fork.cpid); + + return; + } + + if (cmd->cmd.on_fork.state == child_dead) { + /* We need to remove stale worker */ + child = g_hash_table_lookup(srv->workers, + GSIZE_TO_POINTER(cmd->cmd.on_fork.cpid)); + + if (child == NULL) { + msg_err("cannot find child for a forked process %P (%P parent)", + cmd->cmd.on_fork.cpid, 
cmd->cmd.on_fork.ppid); + + return; + } + + REF_RELEASE(child->cf); + g_hash_table_remove(srv->workers, + GSIZE_TO_POINTER(cmd->cmd.on_fork.cpid)); + g_hash_table_unref(child->control_events_pending); + g_free(child); + } + else { + child = g_malloc0(sizeof(struct rspamd_worker)); + child->srv = srv; + child->type = parent->type; + child->pid = cmd->cmd.on_fork.cpid; + child->srv_pipe[0] = -1; + child->srv_pipe[1] = -1; + child->control_pipe[0] = -1; + child->control_pipe[1] = -1; + child->cf = parent->cf; + child->ppid = parent->pid; + REF_RETAIN(child->cf); + child->control_events_pending = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, rspamd_pending_control_free); + g_hash_table_insert(srv->workers, + GSIZE_TO_POINTER(cmd->cmd.on_fork.cpid), child); + } +} + +/** + * Fill the health reply from the workers table: counts every worker, + * workers whose heartbeat counter is negative and, among the others, scanners. + * The status is set to non-zero only when the workers table is not empty. + * @param srv main object that owns the workers table + * @param rep reply to fill + */ +static void +rspamd_fill_health_reply(struct rspamd_main *srv, struct rspamd_srv_reply *rep) +{ + GHashTableIter it; + gpointer k, v; + + /* Destination and size must name the same object */ + memset(&rep->reply, 0, sizeof(rep->reply)); + g_hash_table_iter_init(&it, srv->workers); + + while (g_hash_table_iter_next(&it, &k, &v)) { + struct rspamd_worker *wrk = (struct rspamd_worker *) v; + + if (wrk->hb.nbeats < 0) { + rep->reply.health.workers_hb_lost++; + } + else if (rspamd_worker_is_scanner(wrk)) { + rep->reply.health.scanners_count++; + } + + rep->reply.health.workers_count++; + } + + /* NOTE(review): other replies write <member>.status (on_fork, heartbeat); confirm that reply.status is the intended field */ + rep->reply.status = (g_hash_table_size(srv->workers) > 0); +} + + +static void +rspamd_srv_handler(EV_P_ ev_io *w, int revents) +{ + struct rspamd_worker *worker; + static struct rspamd_srv_command cmd; + struct rspamd_main *rspamd_main; + struct rspamd_srv_reply_data *rdata; + struct msghdr msg; + struct cmsghdr *cmsg; + static struct iovec iov; + static guchar fdspace[CMSG_SPACE(sizeof(int))]; + gint *spair, rfd = -1; + gchar *nid; + struct rspamd_control_command wcmd; + gssize r; + + if (revents == EV_READ) { + worker = (struct rspamd_worker *) w->data; + rspamd_main = worker->srv; + iov.iov_base = &cmd; + iov.iov_len = sizeof(cmd); + memset(&msg, 0, 
sizeof(msg)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = recvmsg(w->fd, &msg, 0); + + if (r == -1) { + if (errno != EAGAIN) { + msg_err_main("cannot read from worker's srv pipe: %s", + strerror(errno)); + } + else { + return; + } + } + else if (r == 0) { + /* + * Usually this means that a worker is dead, so do not try to read + * anything + */ + msg_err_main("cannot read from worker's srv pipe connection closed; command = %s", + rspamd_srv_command_to_string(cmd.type)); + ev_io_stop(EV_A_ w); + } + else if (r != sizeof(cmd)) { + msg_err_main("cannot read from worker's srv pipe incomplete command: %d != %d; command = %s", + (gint) r, (gint) sizeof(cmd), rspamd_srv_command_to_string(cmd.type)); + } + else { + rdata = g_malloc0(sizeof(*rdata)); + rdata->worker = worker; + rdata->srv = rspamd_main; + rdata->rep.id = cmd.id; + rdata->rep.type = cmd.type; + rdata->fd = -1; + worker->tmp_data = rdata; + + if (msg.msg_controllen >= CMSG_LEN(sizeof(int))) { + rfd = *(int *) CMSG_DATA(CMSG_FIRSTHDR(&msg)); + } + + switch (cmd.type) { + case RSPAMD_SRV_SOCKETPAIR: + spair = g_hash_table_lookup(rspamd_main->spairs, cmd.cmd.spair.pair_id); + if (spair == NULL) { + spair = g_malloc(sizeof(gint) * 2); + + if (rspamd_socketpair(spair, cmd.cmd.spair.af) == -1) { + rdata->rep.reply.spair.code = errno; + msg_err_main("cannot create socket pair: %s", strerror(errno)); + /* The pair was never stored in spairs, so release it here */ + g_free(spair); + } + else { + nid = g_malloc(sizeof(cmd.cmd.spair.pair_id)); + memcpy(nid, cmd.cmd.spair.pair_id, + sizeof(cmd.cmd.spair.pair_id)); + g_hash_table_insert(rspamd_main->spairs, nid, spair); + rdata->rep.reply.spair.code = 0; + rdata->fd = cmd.cmd.spair.pair_num ? spair[1] : spair[0]; + } + } + else { + rdata->rep.reply.spair.code = 0; + rdata->fd = cmd.cmd.spair.pair_num ? 
spair[1] : spair[0]; + } + break; + case RSPAMD_SRV_HYPERSCAN_LOADED: +#ifdef WITH_HYPERSCAN + /* Load RE cache to provide it for new forks */ + if (rspamd_re_cache_is_hs_loaded(rspamd_main->cfg->re_cache) != RSPAMD_HYPERSCAN_LOADED_FULL || + cmd.cmd.hs_loaded.forced) { + rspamd_re_cache_load_hyperscan( + rspamd_main->cfg->re_cache, + cmd.cmd.hs_loaded.cache_dir, + false); + } + + /* After getting this notice, we can clean up old hyperscan files */ + + rspamd_hyperscan_notice_loaded(); + + msg_info_main("received hyperscan cache loaded from %s", + cmd.cmd.hs_loaded.cache_dir); + + /* Broadcast command to all workers */ + memset(&wcmd, 0, sizeof(wcmd)); + wcmd.type = RSPAMD_CONTROL_HYPERSCAN_LOADED; + rspamd_strlcpy(wcmd.cmd.hs_loaded.cache_dir, + cmd.cmd.hs_loaded.cache_dir, + sizeof(wcmd.cmd.hs_loaded.cache_dir)); + wcmd.cmd.hs_loaded.forced = cmd.cmd.hs_loaded.forced; + rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, + rspamd_control_ignore_io_handler, NULL, worker->pid); +#endif + break; + case RSPAMD_SRV_MONITORED_CHANGE: + /* Broadcast command to all workers */ + memset(&wcmd, 0, sizeof(wcmd)); + wcmd.type = RSPAMD_CONTROL_MONITORED_CHANGE; + rspamd_strlcpy(wcmd.cmd.monitored_change.tag, + cmd.cmd.monitored_change.tag, + sizeof(wcmd.cmd.monitored_change.tag)); + wcmd.cmd.monitored_change.alive = cmd.cmd.monitored_change.alive; + wcmd.cmd.monitored_change.sender = cmd.cmd.monitored_change.sender; + rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, + rspamd_control_ignore_io_handler, NULL, 0); + break; + case RSPAMD_SRV_LOG_PIPE: + memset(&wcmd, 0, sizeof(wcmd)); + wcmd.type = RSPAMD_CONTROL_LOG_PIPE; + wcmd.cmd.log_pipe.type = cmd.cmd.log_pipe.type; + rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, + rspamd_control_log_pipe_io_handler, NULL, 0); + break; + case RSPAMD_SRV_ON_FORK: + rdata->rep.reply.on_fork.status = 0; + rspamd_control_handle_on_fork(&cmd, rspamd_main); + break; + case RSPAMD_SRV_HEARTBEAT: + worker->hb.last_event = ev_time(); + 
rdata->rep.reply.heartbeat.status = 0; + break; + case RSPAMD_SRV_HEALTH: + rspamd_fill_health_reply(rspamd_main, &rdata->rep); + break; + case RSPAMD_SRV_NOTICE_HYPERSCAN_CACHE: +#ifdef WITH_HYPERSCAN + rspamd_hyperscan_notice_known(cmd.cmd.hyperscan_cache_file.path); +#endif + rdata->rep.reply.hyperscan_cache_file.unused = 0; + break; + case RSPAMD_SRV_FUZZY_BLOCKED: + /* Broadcast command to all workers */ + memset(&wcmd, 0, sizeof(wcmd)); + wcmd.type = RSPAMD_CONTROL_FUZZY_BLOCKED; + /* Ensure that memcpy is safe */ + G_STATIC_ASSERT(sizeof(wcmd.cmd.fuzzy_blocked) == sizeof(cmd.cmd.fuzzy_blocked)); + memcpy(&wcmd.cmd.fuzzy_blocked, &cmd.cmd.fuzzy_blocked, sizeof(wcmd.cmd.fuzzy_blocked)); + rspamd_control_broadcast_cmd(rspamd_main, &wcmd, rfd, + rspamd_control_ignore_io_handler, NULL, worker->pid); + break; + default: + msg_err_main("unknown command type: %d", cmd.type); + break; + } + + if (rfd != -1) { + /* Close our copy to avoid descriptors leak */ + close(rfd); + } + + /* Now plan write event and send data back */ + w->data = rdata; + ev_io_stop(EV_A_ w); + ev_io_set(w, worker->srv_pipe[0], EV_WRITE); + ev_io_start(EV_A_ w); + } + } + else if (revents == EV_WRITE) { + rdata = (struct rspamd_srv_reply_data *) w->data; + worker = rdata->worker; + worker->tmp_data = NULL; /* Avoid race */ + rspamd_main = rdata->srv; + + memset(&msg, 0, sizeof(msg)); + + /* Attach fd to the message */ + if (rdata->fd != -1) { + memset(fdspace, 0, sizeof(fdspace)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &rdata->fd, sizeof(int)); + } + + iov.iov_base = &rdata->rep; + iov.iov_len = sizeof(rdata->rep); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = sendmsg(w->fd, &msg, 0); + + if (r == -1) { + msg_err_main("cannot write to worker's srv pipe when writing reply: %s; command = %s", + 
strerror(errno), rspamd_srv_command_to_string(rdata->rep.type)); + } + else if (r != sizeof(rdata->rep)) { + msg_err_main("cannot write to worker's srv pipe: %d != %d; command = %s", + (int) r, (int) sizeof(rdata->rep), + rspamd_srv_command_to_string(rdata->rep.type)); + } + + g_free(rdata); + w->data = worker; + ev_io_stop(EV_A_ w); + ev_io_set(w, worker->srv_pipe[0], EV_READ); + ev_io_start(EV_A_ w); + } +} + +void rspamd_srv_start_watching(struct rspamd_main *srv, + struct rspamd_worker *worker, + struct ev_loop *ev_base) +{ + g_assert(worker != NULL); + + worker->tmp_data = NULL; + worker->srv_ev.data = worker; + ev_io_init(&worker->srv_ev, rspamd_srv_handler, worker->srv_pipe[0], EV_READ); + ev_io_start(ev_base, &worker->srv_ev); +} + +struct rspamd_srv_request_data { + struct rspamd_worker *worker; + struct rspamd_srv_command cmd; + gint attached_fd; + struct rspamd_srv_reply rep; + rspamd_srv_reply_handler handler; + ev_io io_ev; + gpointer ud; +}; + +static void +rspamd_srv_request_handler(EV_P_ ev_io *w, int revents) +{ + struct rspamd_srv_request_data *rd = (struct rspamd_srv_request_data *) w->data; + struct msghdr msg; + struct iovec iov; + guchar fdspace[CMSG_SPACE(sizeof(int))]; + struct cmsghdr *cmsg; + gssize r; + gint rfd = -1; + + if (revents == EV_WRITE) { + /* Send request to server */ + memset(&msg, 0, sizeof(msg)); + + /* Attach fd to the message */ + if (rd->attached_fd != -1) { + memset(fdspace, 0, sizeof(fdspace)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &rd->attached_fd, sizeof(int)); + } + + iov.iov_base = &rd->cmd; + iov.iov_len = sizeof(rd->cmd); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = sendmsg(w->fd, &msg, 0); + + if (r == -1) { + if (r == ENOBUFS) { + /* On BSD derived systems we can have this error when trying to send + * requests too 
fast. + * It might be good to retry... + */ + msg_info("cannot write to server pipe: %s; command = %s; retrying sending", + strerror(errno), + rspamd_srv_command_to_string(rd->cmd.type)); + return; + } + msg_err("cannot write to server pipe: %s; command = %s", strerror(errno), + rspamd_srv_command_to_string(rd->cmd.type)); + goto cleanup; + } + else if (r != sizeof(rd->cmd)) { + msg_err("incomplete write to the server pipe: %d != %d, command = %s", + (int) r, (int) sizeof(rd->cmd), rspamd_srv_command_to_string(rd->cmd.type)); + goto cleanup; + } + + ev_io_stop(EV_A_ w); + ev_io_set(w, rd->worker->srv_pipe[1], EV_READ); + ev_io_start(EV_A_ w); + } + else { + iov.iov_base = &rd->rep; + iov.iov_len = sizeof(rd->rep); + memset(&msg, 0, sizeof(msg)); + msg.msg_control = fdspace; + msg.msg_controllen = sizeof(fdspace); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + r = recvmsg(w->fd, &msg, 0); + + if (r == -1) { + msg_err("cannot read from server pipe: %s; command = %s", strerror(errno), + rspamd_srv_command_to_string(rd->cmd.type)); + goto cleanup; + } + + if (r != (gint) sizeof(rd->rep)) { + msg_err("cannot read from server pipe, invalid length: %d != %d; command = %s", + (gint) r, (int) sizeof(rd->rep), rspamd_srv_command_to_string(rd->cmd.type)); + goto cleanup; + } + + if (msg.msg_controllen >= CMSG_LEN(sizeof(int))) { + rfd = *(int *) CMSG_DATA(CMSG_FIRSTHDR(&msg)); + } + + /* Reply has been received */ + if (rd->handler) { + rd->handler(rd->worker, &rd->rep, rfd, rd->ud); + } + + goto cleanup; + } + + return; + + +cleanup: + ev_io_stop(EV_A_ w); + g_free(rd); +} + +void rspamd_srv_send_command(struct rspamd_worker *worker, + struct ev_loop *ev_base, + struct rspamd_srv_command *cmd, + gint attached_fd, + rspamd_srv_reply_handler handler, + gpointer ud) +{ + struct rspamd_srv_request_data *rd; + + g_assert(cmd != NULL); + g_assert(worker != NULL); + + rd = g_malloc0(sizeof(*rd)); + cmd->id = ottery_rand_uint64(); + memcpy(&rd->cmd, cmd, sizeof(rd->cmd)); + 
rd->handler = handler; + rd->ud = ud; + rd->worker = worker; + rd->rep.id = cmd->id; + rd->rep.type = cmd->type; + rd->attached_fd = attached_fd; + + rd->io_ev.data = rd; + ev_io_init(&rd->io_ev, rspamd_srv_request_handler, + rd->worker->srv_pipe[1], EV_WRITE); + ev_io_start(ev_base, &rd->io_ev); +} + +enum rspamd_control_type +rspamd_control_command_from_string(const gchar *str) +{ + enum rspamd_control_type ret = RSPAMD_CONTROL_MAX; + + if (!str) { + return ret; + } + + if (g_ascii_strcasecmp(str, "hyperscan_loaded") == 0) { + ret = RSPAMD_CONTROL_HYPERSCAN_LOADED; + } + else if (g_ascii_strcasecmp(str, "stat") == 0) { + ret = RSPAMD_CONTROL_STAT; + } + else if (g_ascii_strcasecmp(str, "reload") == 0) { + ret = RSPAMD_CONTROL_RELOAD; + } + else if (g_ascii_strcasecmp(str, "reresolve") == 0) { + ret = RSPAMD_CONTROL_RERESOLVE; + } + else if (g_ascii_strcasecmp(str, "recompile") == 0) { + ret = RSPAMD_CONTROL_RECOMPILE; + } + else if (g_ascii_strcasecmp(str, "log_pipe") == 0) { + ret = RSPAMD_CONTROL_LOG_PIPE; + } + else if (g_ascii_strcasecmp(str, "fuzzy_stat") == 0) { + ret = RSPAMD_CONTROL_FUZZY_STAT; + } + else if (g_ascii_strcasecmp(str, "fuzzy_sync") == 0) { + ret = RSPAMD_CONTROL_FUZZY_SYNC; + } + else if (g_ascii_strcasecmp(str, "monitored_change") == 0) { + ret = RSPAMD_CONTROL_MONITORED_CHANGE; + } + else if (g_ascii_strcasecmp(str, "child_change") == 0) { + ret = RSPAMD_CONTROL_CHILD_CHANGE; + } + + return ret; +} + +const gchar * +rspamd_control_command_to_string(enum rspamd_control_type cmd) +{ + const gchar *reply = "unknown"; + + switch (cmd) { + case RSPAMD_CONTROL_STAT: + reply = "stat"; + break; + case RSPAMD_CONTROL_RELOAD: + reply = "reload"; + break; + case RSPAMD_CONTROL_RERESOLVE: + reply = "reresolve"; + break; + case RSPAMD_CONTROL_RECOMPILE: + reply = "recompile"; + break; + case RSPAMD_CONTROL_HYPERSCAN_LOADED: + reply = "hyperscan_loaded"; + break; + case RSPAMD_CONTROL_LOG_PIPE: + reply = "log_pipe"; + break; + case 
RSPAMD_CONTROL_FUZZY_STAT: + reply = "fuzzy_stat"; + break; + case RSPAMD_CONTROL_FUZZY_SYNC: + reply = "fuzzy_sync"; + break; + case RSPAMD_CONTROL_MONITORED_CHANGE: + reply = "monitored_change"; + break; + case RSPAMD_CONTROL_CHILD_CHANGE: + reply = "child_change"; + break; + default: + break; + } + + return reply; +} + +const gchar *rspamd_srv_command_to_string(enum rspamd_srv_type cmd) +{ + const gchar *reply = "unknown"; + + switch (cmd) { + case RSPAMD_SRV_SOCKETPAIR: + reply = "socketpair"; + break; + case RSPAMD_SRV_HYPERSCAN_LOADED: + reply = "hyperscan_loaded"; + break; + case RSPAMD_SRV_MONITORED_CHANGE: + reply = "monitored_change"; + break; + case RSPAMD_SRV_LOG_PIPE: + reply = "log_pipe"; + break; + case RSPAMD_SRV_ON_FORK: + reply = "on_fork"; + break; + case RSPAMD_SRV_HEARTBEAT: + reply = "heartbeat"; + break; + case RSPAMD_SRV_HEALTH: + reply = "health"; + break; + case RSPAMD_SRV_NOTICE_HYPERSCAN_CACHE: + reply = "notice_hyperscan_cache"; + break; + case RSPAMD_SRV_FUZZY_BLOCKED: + reply = "fuzzy_blocked"; + break; + } + + return reply; +} diff --git a/src/libserver/rspamd_control.h b/src/libserver/rspamd_control.h new file mode 100644 index 0000000..c3c861f --- /dev/null +++ b/src/libserver/rspamd_control.h @@ -0,0 +1,328 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RSPAMD_RSPAMD_CONTROL_H +#define RSPAMD_RSPAMD_CONTROL_H + +#include "config.h" +#include "mem_pool.h" +#include "contrib/libev/ev.h" + +G_BEGIN_DECLS + +struct rspamd_main; +struct rspamd_worker; + +enum rspamd_control_type { + RSPAMD_CONTROL_STAT = 0, + RSPAMD_CONTROL_RELOAD, + RSPAMD_CONTROL_RERESOLVE, + RSPAMD_CONTROL_RECOMPILE, + RSPAMD_CONTROL_HYPERSCAN_LOADED, + RSPAMD_CONTROL_LOG_PIPE, + RSPAMD_CONTROL_FUZZY_STAT, + RSPAMD_CONTROL_FUZZY_SYNC, + RSPAMD_CONTROL_MONITORED_CHANGE, + RSPAMD_CONTROL_CHILD_CHANGE, + RSPAMD_CONTROL_FUZZY_BLOCKED, + RSPAMD_CONTROL_MAX +}; + +enum rspamd_srv_type { + RSPAMD_SRV_SOCKETPAIR = 0, + RSPAMD_SRV_HYPERSCAN_LOADED, + RSPAMD_SRV_MONITORED_CHANGE, + RSPAMD_SRV_LOG_PIPE, + RSPAMD_SRV_ON_FORK, + RSPAMD_SRV_HEARTBEAT, + RSPAMD_SRV_HEALTH, + RSPAMD_SRV_NOTICE_HYPERSCAN_CACHE, + RSPAMD_SRV_FUZZY_BLOCKED, /* Used to notify main process about a blocked ip */ +}; + +enum rspamd_log_pipe_type { + RSPAMD_LOG_PIPE_SYMBOLS = 0, +}; +#define CONTROL_PATHLEN MIN(PATH_MAX, PIPE_BUF - sizeof(int) * 2 - sizeof(gint64) * 2) +struct rspamd_control_command { + enum rspamd_control_type type; + union { + struct { + guint unused; + } stat; + struct { + guint unused; + } reload; + struct { + guint unused; + } reresolve; + struct { + guint unused; + } recompile; + struct { + gboolean forced; + gchar cache_dir[CONTROL_PATHLEN]; + } hs_loaded; + struct { + gchar tag[32]; + gboolean alive; + pid_t sender; + } monitored_change; + struct { + enum rspamd_log_pipe_type type; + } log_pipe; + struct { + guint unused; + } fuzzy_stat; + struct { + guint unused; + } fuzzy_sync; + struct { + enum { + rspamd_child_offline, + rspamd_child_online, + rspamd_child_terminated, + } what; + pid_t pid; + guint additional; + } child_change; + struct { + union { + struct sockaddr sa; + struct sockaddr_in s4; + struct sockaddr_in6 s6; + } addr; + sa_family_t af; + } fuzzy_blocked; + } cmd; +}; + +struct rspamd_control_reply { + enum rspamd_control_type type; + 
union { + struct { + guint conns; + gdouble uptime; + gdouble utime; + gdouble systime; + gulong maxrss; + } stat; + struct { + guint status; + } reload; + struct { + guint status; + } reresolve; + struct { + guint status; + } recompile; + struct { + guint status; + } hs_loaded; + struct { + guint status; + } monitored_change; + struct { + guint status; + } log_pipe; + struct { + guint status; + gchar storage_id[MEMPOOL_UID_LEN]; + } fuzzy_stat; + struct { + guint status; + } fuzzy_sync; + struct { + guint status; + } fuzzy_blocked; + } reply; +}; + +#define PAIR_ID_LEN 16 + +struct rspamd_srv_command { + enum rspamd_srv_type type; + guint64 id; + union { + struct { + gint af; + gchar pair_id[PAIR_ID_LEN]; + guint pair_num; + } spair; + struct { + gboolean forced; + gchar cache_dir[CONTROL_PATHLEN]; + } hs_loaded; + struct { + gchar tag[32]; + gboolean alive; + pid_t sender; + } monitored_change; + struct { + enum rspamd_log_pipe_type type; + } log_pipe; + struct { + pid_t ppid; + pid_t cpid; + enum { + child_create = 0, + child_dead, + } state; + } on_fork; + struct { + guint status; + /* TODO: add more fields */ + } heartbeat; + struct { + guint status; + } health; + /* Used when a worker loads a valid hyperscan file */ + struct { + char path[CONTROL_PATHLEN]; + } hyperscan_cache_file; + /* Send when one worker has blocked some IP address */ + struct { + union { + struct sockaddr sa; + struct sockaddr_in s4; + struct sockaddr_in6 s6; + } addr; + sa_family_t af; + } fuzzy_blocked; + } cmd; +}; + +struct rspamd_srv_reply { + enum rspamd_srv_type type; + guint64 id; + union { + struct { + gint code; + } spair; + struct { + gint forced; + } hs_loaded; + struct { + gint status; + }; + struct { + enum rspamd_log_pipe_type type; + } log_pipe; + struct { + gint status; + } on_fork; + struct { + gint status; + } heartbeat; + struct { + guint status; + guint workers_count; + guint scanners_count; + guint workers_hb_lost; + } health; + struct { + int unused; + } 
hyperscan_cache_file; + struct { + int unused; + } fuzzy_blocked; + } reply; +}; + +typedef gboolean (*rspamd_worker_control_handler)(struct rspamd_main *rspamd_main, + struct rspamd_worker *worker, + gint fd, + gint attached_fd, + struct rspamd_control_command *cmd, + gpointer ud); + +typedef void (*rspamd_srv_reply_handler)(struct rspamd_worker *worker, + struct rspamd_srv_reply *rep, gint rep_fd, + gpointer ud); + +/** + * Process client socket connection + */ +void rspamd_control_process_client_socket(struct rspamd_main *rspamd_main, + gint fd, rspamd_inet_addr_t *addr); + +/** + * Register default handlers for a worker + */ +void rspamd_control_worker_add_default_cmd_handlers(struct rspamd_worker *worker, + struct ev_loop *ev_base); + +/** + * Register custom handler for a specific control command for this worker + */ +void rspamd_control_worker_add_cmd_handler(struct rspamd_worker *worker, + enum rspamd_control_type type, + rspamd_worker_control_handler handler, + gpointer ud); + +/** + * Start watching on srv pipe + */ +void rspamd_srv_start_watching(struct rspamd_main *srv, + struct rspamd_worker *worker, + struct ev_loop *ev_base); + + +/** + * Send command to srv pipe and read reply calling the specified callback at the + * end + */ +void rspamd_srv_send_command(struct rspamd_worker *worker, + struct ev_loop *ev_base, + struct rspamd_srv_command *cmd, + gint attached_fd, + rspamd_srv_reply_handler handler, + gpointer ud); + +/** + * Broadcast srv cmd from rspamd_main to workers + * @param rspamd_main + * @param cmd + * @param except_pid + */ +void rspamd_control_broadcast_srv_cmd(struct rspamd_main *rspamd_main, + struct rspamd_control_command *cmd, + pid_t except_pid); + +/** + * Returns command from a specified string (case insensitive) + * @param str + * @return + */ +enum rspamd_control_type rspamd_control_command_from_string(const gchar *str); + +/** + * Returns command name from it's type + * @param cmd + * @return + */ +const gchar 
*rspamd_control_command_to_string(enum rspamd_control_type cmd); + +const gchar *rspamd_srv_command_to_string(enum rspamd_srv_type cmd); + +/** + * Used to cleanup pending events + * @param p + */ +void rspamd_pending_control_free(gpointer p); + +G_END_DECLS + +#endif diff --git a/src/libserver/rspamd_symcache.h b/src/libserver/rspamd_symcache.h new file mode 100644 index 0000000..2c67cba --- /dev/null +++ b/src/libserver/rspamd_symcache.h @@ -0,0 +1,578 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef RSPAMD_SYMBOLS_CACHE_H +#define RSPAMD_SYMBOLS_CACHE_H + +#include "config.h" +#include "ucl.h" +#include "cfg_file.h" +#include "contrib/libev/ev.h" + +#include <lua.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_config; +struct rspamd_symcache; +struct rspamd_worker; +struct rspamd_symcache_dynamic_item; +struct rspamd_symcache_item; +struct rspamd_config_settings_elt; + +typedef void (*symbol_func_t)(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + gpointer user_data); + +enum rspamd_symbol_type { + SYMBOL_TYPE_NORMAL = (1u << 0u), + SYMBOL_TYPE_VIRTUAL = (1u << 1u), + SYMBOL_TYPE_CALLBACK = (1u << 2u), + SYMBOL_TYPE_GHOST = (1u << 3u), + SYMBOL_TYPE_SKIPPED = (1u << 4u), + SYMBOL_TYPE_COMPOSITE = (1u << 5u), + SYMBOL_TYPE_CLASSIFIER = (1u << 6u), + SYMBOL_TYPE_FINE = (1u << 7u), + SYMBOL_TYPE_EMPTY = (1u << 8u), /* Allow execution on empty tasks */ + SYMBOL_TYPE_CONNFILTER = (1u << 9u), /* Connection stage filter */ + SYMBOL_TYPE_PREFILTER = (1u << 10u), + SYMBOL_TYPE_POSTFILTER = (1u << 11u), + SYMBOL_TYPE_NOSTAT = (1u << 12u), /* Skip as statistical symbol */ + SYMBOL_TYPE_IDEMPOTENT = (1u << 13u), /* Symbol cannot change metric */ + SYMBOL_TYPE_TRIVIAL = (1u << 14u), /* Symbol is trivial */ + SYMBOL_TYPE_MIME_ONLY = (1u << 15u), /* Symbol is mime only */ + SYMBOL_TYPE_EXPLICIT_DISABLE = (1u << 16u), /* Symbol should be disabled explicitly only */ + SYMBOL_TYPE_IGNORE_PASSTHROUGH = (1u << 17u), /* Symbol ignores passthrough result */ + SYMBOL_TYPE_EXPLICIT_ENABLE = (1u << 18u), /* Symbol should be enabled explicitly only */ + SYMBOL_TYPE_USE_CORO = (1u << 19u), /* Symbol uses lua coroutines */ +}; + +/** + * Abstract structure for saving callback data for symbols + */ +struct rspamd_abstract_callback_data { + guint64 magic; + char data[]; +}; + +/** + * Shared memory block specific for each symbol + */ +struct rspamd_symcache_item_stat { + struct rspamd_counter_data time_counter; + 
gdouble avg_time; + gdouble weight; + guint hits; + guint64 total_hits; + struct rspamd_counter_data frequency_counter; + gdouble avg_frequency; + gdouble stddev_frequency; +}; + +/** + * Creates new cache structure + * @return + */ +struct rspamd_symcache *rspamd_symcache_new(struct rspamd_config *cfg); + +/** + * Remove the cache structure syncing data if needed + * @param cache + */ +void rspamd_symcache_destroy(struct rspamd_symcache *cache); + +/** + * Saves symbols cache to disk if possible + * @param cache + */ +void rspamd_symcache_save(struct rspamd_symcache *cache); + +/** + * Load symbols cache from file, must be called _after_ init_symbols_cache + */ +gboolean rspamd_symcache_init(struct rspamd_symcache *cache); + +/** + * Generic function to register a symbol + * @param cache + * @param name + * @param weight + * @param priority + * @param func + * @param user_data + * @param type + * @param parent + */ +gint rspamd_symcache_add_symbol(struct rspamd_symcache *cache, + const gchar *name, + gint priority, + symbol_func_t func, + gpointer user_data, + int type, + gint parent); + +/** + * Adds augmentation to the symbol + * @param cache + * @param sym_id + * @param augmentation + * @return + */ +bool rspamd_symcache_add_symbol_augmentation(struct rspamd_symcache *cache, + int sym_id, + const char *augmentation, + const char *value); + +/** + * Add callback to be executed whenever symbol has peak value + * @param cache + * @param cbref + */ +void rspamd_symcache_set_peak_callback(struct rspamd_symcache *cache, + gint cbref); + +/** + * Add delayed condition to the specific symbol in cache. 
So symbol can be absent + * to the moment of addition + * @param cache + * @param id id of symbol + * @param L lua state pointer + * @param cbref callback reference (returned by luaL_ref) + * @return TRUE if condition has been added + */ +gboolean rspamd_symcache_add_condition_delayed(struct rspamd_symcache *cache, + const gchar *sym, + lua_State *L, gint cbref); + +/** + * Find symbol in cache by id and returns its id resolving virtual symbols if + * applicable + * @param cache + * @param name + * @return id of symbol or (-1) if a symbol has not been found + */ +gint rspamd_symcache_find_symbol(struct rspamd_symcache *cache, + const gchar *name); + +/** + * Get statistics for a specific symbol + * @param cache + * @param name + * @param frequency + * @param tm + * @return + */ +gboolean rspamd_symcache_stat_symbol(struct rspamd_symcache *cache, + const gchar *name, + gdouble *frequency, + gdouble *freq_stddev, + gdouble *tm, + guint *nhits); + +/** + * Returns number of symbols registered in symbols cache + * @param cache + * @return number of symbols in the cache + */ +guint rspamd_symcache_stats_symbols_count(struct rspamd_symcache *cache); + +/** + * Validate cache items against theirs weights defined in metrics + * @param cache symbols cache + * @param cfg configuration + * @param strict do strict checks - symbols MUST be described in metrics + */ +gboolean rspamd_symcache_validate(struct rspamd_symcache *cache, + struct rspamd_config *cfg, + gboolean strict); + +/** + * Call function for cached symbol using saved callback + * @param task task object + * @param cache symbols cache + * @param saved_item pointer to currently saved item + */ +gboolean rspamd_symcache_process_symbols(struct rspamd_task *task, + struct rspamd_symcache *cache, + guint stage); + +/** + * Return statistics about the cache as ucl object (array of objects one per item) + * @param cache + * @return + */ +ucl_object_t *rspamd_symcache_counters(struct rspamd_symcache *cache); + +/** + * 
Start cache reloading + * @param cache + * @param ev_base + */ +void *rspamd_symcache_start_refresh(struct rspamd_symcache *cache, + struct ev_loop *ev_base, + struct rspamd_worker *w); + +/** + * Increases counter for a specific symbol + * @param cache + * @param symbol + */ +void rspamd_symcache_inc_frequency(struct rspamd_symcache *_cache, + struct rspamd_symcache_item *item, + const gchar *sym_name); + +/** + * Add delayed dependency that is resolved on cache post-load routine + * @param cache + * @param from + * @param to + */ +void rspamd_symcache_add_delayed_dependency(struct rspamd_symcache *cache, + const gchar *from, const gchar *to); + +/** + * Get abstract callback data for a symbol (or its parent symbol) + * @param cache cache object + * @param symbol symbol name + * @return abstract callback data or NULL if symbol is absent or has no data attached + */ +struct rspamd_abstract_callback_data *rspamd_symcache_get_cbdata( + struct rspamd_symcache *cache, const gchar *symbol); + +/** + * Returns symbol's parent name (or symbol name itself) + * @param cache + * @param symbol + * @return + */ +const gchar *rspamd_symcache_get_parent(struct rspamd_symcache *cache, + const gchar *symbol); + +guint rspamd_symcache_get_symbol_flags(struct rspamd_symcache *cache, + const gchar *symbol); + +void rspamd_symcache_get_symbol_details(struct rspamd_symcache *cache, + const gchar *symbol, + ucl_object_t *this_sym_ucl); + + +/** + * Process settings for task + * @param task + * @param cache + * @return + */ +gboolean rspamd_symcache_process_settings(struct rspamd_task *task, + struct rspamd_symcache *cache); + + +/** + * Checks if a symbol specified has been checked (or disabled) + * @param task + * @param cache + * @param symbol + * @return + */ +gboolean rspamd_symcache_is_checked(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol); + +/** + * Returns checksum for all cache items + * @param cache + * @return + */ +guint64 
rspamd_symcache_get_cksum(struct rspamd_symcache *cache); + +/** + * Checks if a symbols is enabled (not checked and conditions return true if present) + * @param task + * @param cache + * @param symbol + * @return + */ +gboolean rspamd_symcache_is_symbol_enabled(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol); + +/** + * Enable this symbol for task + * @param task + * @param cache + * @param symbol + * @return TRUE if a symbol has been enabled (not executed before) + */ +gboolean rspamd_symcache_enable_symbol(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol); + +/** + * Enable this symbol for task + * @param task + * @param cache + * @param symbol + * @return TRUE if a symbol has been disabled (not executed before) + */ +gboolean rspamd_symcache_disable_symbol(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol); + +/** + * Disable execution of a symbol or a pattern (a string enclosed in `//`) permanently + * @param task + * @param cache + * @param symbol + * @return + */ +void rspamd_symcache_disable_symbol_static(struct rspamd_symcache *cache, + const gchar *symbol); +/** + * Add a symbol or a pattern to the list of explicitly and statically enabled symbols + * @param cache + * @param symbol + * @return + */ +void rspamd_symcache_enable_symbol_static(struct rspamd_symcache *cache, + const gchar *symbol); + +/** + * Process specific function for each cache element (in order they are added) + * @param cache + * @param func + * @param ud + */ +void rspamd_symcache_foreach(struct rspamd_symcache *cache, + void (*func)(struct rspamd_symcache_item *item, gpointer /* userdata */), + gpointer ud); + +/** + * Returns the current item being processed (if any) + * @param task + * @return + */ +struct rspamd_symcache_dynamic_item *rspamd_symcache_get_cur_item(struct rspamd_task *task); + +/** + * Replaces the current item being processed. 
+ * Returns the current item being processed (if any) + * @param task + * @param item + * @return + */ +struct rspamd_symcache_dynamic_item *rspamd_symcache_set_cur_item(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item); + + +/** + * Finalize the current async element potentially calling its deps + */ +void rspamd_symcache_finalize_item(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item); + +/* + * Increase number of async events pending for an item + */ +guint rspamd_symcache_item_async_inc_full(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + const gchar *subsystem, + const gchar *loc); + +#define rspamd_symcache_item_async_inc(task, item, subsystem) \ + rspamd_symcache_item_async_inc_full(task, item, subsystem, G_STRLOC) + +/* + * Decrease number of async events pending for an item, asserts if no events pending + */ +guint rspamd_symcache_item_async_dec_full(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + const gchar *subsystem, + const gchar *loc); + +#define rspamd_symcache_item_async_dec(task, item, subsystem) \ + rspamd_symcache_item_async_dec_full(task, item, subsystem, G_STRLOC) + +/** + * Decrease number of async events pending for an item, asserts if no events pending + * If no events are left, this function calls `rspamd_symbols_cache_finalize_item` and returns TRUE + * @param task + * @param item + * @return + */ +gboolean rspamd_symcache_item_async_dec_check_full(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + const gchar *subsystem, + const gchar *loc); + +#define rspamd_symcache_item_async_dec_check(task, item, subsystem) \ + rspamd_symcache_item_async_dec_check_full(task, item, subsystem, G_STRLOC) + +/** + * Disables execution of all symbols, excluding those specified in `skip_mask` + * @param task + * @param cache + * @param skip_mask + */ +void rspamd_symcache_disable_all_symbols(struct rspamd_task *task, + struct rspamd_symcache 
*cache, + guint skip_mask); + +/** + * Iterates over the list of the enabled composites calling specified function + * @param task + * @param cache + * @param func + * @param fd + */ +void rspamd_symcache_composites_foreach(struct rspamd_task *task, + struct rspamd_symcache *cache, + GHFunc func, + gpointer fd); + +/** + * Sets allowed settings ids for a symbol + * @param cache + * @param symbol + * @param ids + * @param nids + */ +bool rspamd_symcache_set_allowed_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + const guint32 *ids, + guint nids); +/** + * Sets denied settings ids for a symbol + * @param cache + * @param symbol + * @param ids + * @param nids + */ +bool rspamd_symcache_set_forbidden_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + const guint32 *ids, + guint nids); + +/** + * Returns allowed ids for a symbol as a constant array + * @param cache + * @param symbol + * @param nids + * @return + */ +const guint32 *rspamd_symcache_get_allowed_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + guint *nids); + +/** + * Returns denied ids for a symbol as a constant array + * @param cache + * @param symbol + * @param nids + * @return + */ +const guint32 *rspamd_symcache_get_forbidden_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + guint *nids); + + +/** + * Processes settings_elt in cache and converts it to a set of + * adjustments for forbidden/allowed settings_ids for each symbol + * @param cache + * @param elt + */ +void rspamd_symcache_process_settings_elt(struct rspamd_symcache *cache, + struct rspamd_config_settings_elt *elt); + +/** + * Check if a symbol is allowed for execution/insertion, this does not involve + * condition scripts to be checked (so it is intended to be fast). 
+ * @param task + * @param item + * @param exec_only + * @return + */ +gboolean rspamd_symcache_is_item_allowed(struct rspamd_task *task, + struct rspamd_symcache_item *item, + gboolean exec_only); + +/** + * Returns symcache item flags + * @param item + * @return + */ +gint rspamd_symcache_dyn_item_flags(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *dyn_item); +gint rspamd_symcache_item_flags(struct rspamd_symcache_item *item); + +/** + * Returns cache item name + * @param item + * @return + */ +const gchar *rspamd_symcache_dyn_item_name(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *dyn_item); +const gchar *rspamd_symcache_item_name(struct rspamd_symcache_item *item); + +/** + * Returns the current item stat + * @param item + * @return + */ +const struct rspamd_symcache_item_stat * +rspamd_symcache_item_stat(struct rspamd_symcache_item *item); + +/** + * Enable profiling for task (e.g. when a slow rule has been found) + * @param task + */ +void rspamd_symcache_enable_profile(struct rspamd_task *task); + +struct rspamd_symcache_timeout_item { + double timeout; + const struct rspamd_symcache_item *item; +}; + +struct rspamd_symcache_timeout_result { + double max_timeout; + struct rspamd_symcache_timeout_item *items; + size_t nitems; +}; +/** + * Gets maximum timeout announced by symbols cache + * @param cache + * @return new symcache timeout_result structure, that should be freed by call + * `rspamd_symcache_timeout_result_free` + */ +struct rspamd_symcache_timeout_result *rspamd_symcache_get_max_timeout(struct rspamd_symcache *cache); + +/** + * Frees results obtained from the previous function + * @param res + */ +void rspamd_symcache_timeout_result_free(struct rspamd_symcache_timeout_result *res); + +/** + * Destroy internal state of the symcache runtime + * @param task + */ +void rspamd_symcache_runtime_destroy(struct rspamd_task *task); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/spf.c 
b/src/libserver/spf.c new file mode 100644 index 0000000..72d8b99 --- /dev/null +++ b/src/libserver/spf.c @@ -0,0 +1,2799 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "dns.h" +#include "spf.h" +#include "rspamd.h" +#include "message.h" +#include "utlist.h" +#include "libserver/mempool_vars_internal.h" +#include "contrib/librdns/rdns.h" +#include "contrib/mumhash/mum.h" + +#define SPF_VER1_STR "v=spf1" +#define SPF_VER2_STR "spf2." 
+#define SPF_SCOPE_PRA "pra" +#define SPF_SCOPE_MFROM "mfrom" +#define SPF_ALL "all" +#define SPF_A "a" +#define SPF_IP4 "ip4" +#define SPF_IP4_ALT "ipv4" +#define SPF_IP6 "ip6" +#define SPF_IP6_ALT "ipv6" +#define SPF_PTR "ptr" +#define SPF_MX "mx" +#define SPF_EXISTS "exists" +#define SPF_INCLUDE "include" +#define SPF_REDIRECT "redirect" +#define SPF_EXP "exp" + +struct spf_resolved_element { + GPtrArray *elts; + gchar *cur_domain; + gboolean redirected; /* Ignore level, it's redirected */ +}; + +struct spf_record { + gint nested; + gint dns_requests; + gint requests_inflight; + + guint ttl; + GPtrArray *resolved; + /* Array of struct spf_resolved_element */ + const gchar *sender; + const gchar *sender_domain; + const gchar *top_record; + gchar *local_part; + struct rspamd_task *task; + spf_cb_t callback; + gpointer cbdata; + gboolean done; +}; + +struct rspamd_spf_library_ctx { + guint max_dns_nesting; + guint max_dns_requests; + guint min_cache_ttl; + gboolean disable_ipv6; + rspamd_lru_hash_t *spf_hash; +}; + +struct rspamd_spf_library_ctx *spf_lib_ctx = NULL; + +/** + * BNF for SPF record: + * + * spf_mech ::= +|-|~|? + * + * spf_body ::= spf=v1 <spf_command> [<spf_command>] + * spf_command ::= [spf_mech]all|a|<ip4>|<ip6>|ptr|mx|<exists>|<include>|<redirect> + * + * spf_domain ::= [:domain][/mask] + * spf_ip4 ::= ip[/mask] + * ip4 ::= ip4:<spf_ip4> + * mx ::= mx<spf_domain> + * a ::= a<spf_domain> + * ptr ::= ptr[:domain] + * exists ::= exists:domain + * include ::= include:domain + * redirect ::= redirect:domain + * exp ::= exp:domain + * + */ + +#undef SPF_DEBUG + +#define msg_err_spf(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "spf", rec->task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_spf(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "spf", rec->task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_spf(...) 
rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "spf", rec->task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_notice_spf(...) rspamd_default_log_function(G_LOG_LEVEL_MESSAGE, \ + "spf", rec->task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_spf(...) rspamd_conditional_debug_fast(NULL, rec->task->from_addr, \ + rspamd_spf_log_id, "spf", rec->task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_spf_flatten(...) rspamd_conditional_debug_fast_num_id(NULL, NULL, \ + rspamd_spf_log_id, "spf", (flat)->digest, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +INIT_LOG_MODULE(spf) + +struct spf_dns_cb { + struct spf_record *rec; + struct spf_addr *addr; + struct spf_resolved_element *resolved; + const gchar *ptr_host; + spf_action_t cur_action; + gboolean in_include; +}; + +#define CHECK_REC(rec) \ + do { \ + if (spf_lib_ctx->max_dns_nesting > 0 && \ + (rec)->nested > spf_lib_ctx->max_dns_nesting) { \ + msg_warn_spf("spf nesting limit: %d > %d is reached, domain: %s", \ + (rec)->nested, spf_lib_ctx->max_dns_nesting, \ + (rec)->sender_domain); \ + return FALSE; \ + } \ + if (spf_lib_ctx->max_dns_requests > 0 && \ + (rec)->dns_requests > spf_lib_ctx->max_dns_requests) { \ + msg_warn_spf("spf dns requests limit: %d > %d is reached, domain: %s", \ + (rec)->dns_requests, spf_lib_ctx->max_dns_requests, \ + (rec)->sender_domain); \ + return FALSE; \ + } \ + } while (0) + +RSPAMD_CONSTRUCTOR(rspamd_spf_lib_ctx_ctor) +{ + spf_lib_ctx = g_malloc0(sizeof(*spf_lib_ctx)); + spf_lib_ctx->max_dns_nesting = SPF_MAX_NESTING; + spf_lib_ctx->max_dns_requests = SPF_MAX_DNS_REQUESTS; + spf_lib_ctx->min_cache_ttl = SPF_MIN_CACHE_TTL; + spf_lib_ctx->disable_ipv6 = FALSE; +} + +RSPAMD_DESTRUCTOR(rspamd_spf_lib_ctx_dtor) +{ + if (spf_lib_ctx->spf_hash) { + rspamd_lru_hash_destroy(spf_lib_ctx->spf_hash); + } + g_free(spf_lib_ctx); + spf_lib_ctx = NULL; +} + +static void +spf_record_cached_unref_dtor(gpointer p) +{ + 
struct spf_resolved *flat = (struct spf_resolved *) p; + + _spf_record_unref(flat, "LRU cache"); +} + +void spf_library_config(const ucl_object_t *obj) +{ + const ucl_object_t *value; + gint64 ival; + bool bval; + + if (obj == NULL) { + /* No specific config */ + return; + } + + if ((value = ucl_object_find_key(obj, "min_cache_ttl")) != NULL) { + if (ucl_object_toint_safe(value, &ival) && ival >= 0) { + spf_lib_ctx->min_cache_ttl = ival; + } + } + + if ((value = ucl_object_find_key(obj, "max_dns_nesting")) != NULL) { + if (ucl_object_toint_safe(value, &ival) && ival >= 0) { + spf_lib_ctx->max_dns_nesting = ival; + } + } + + if ((value = ucl_object_find_key(obj, "max_dns_requests")) != NULL) { + if (ucl_object_toint_safe(value, &ival) && ival >= 0) { + spf_lib_ctx->max_dns_requests = ival; + } + } + if ((value = ucl_object_find_key(obj, "disable_ipv6")) != NULL) { + if (ucl_object_toboolean_safe(value, &bval)) { + spf_lib_ctx->disable_ipv6 = bval; + } + } + + if (spf_lib_ctx->spf_hash) { + rspamd_lru_hash_destroy(spf_lib_ctx->spf_hash); + spf_lib_ctx->spf_hash = NULL; + } + + if ((value = ucl_object_find_key(obj, "spf_cache_size")) != NULL) { + if (ucl_object_toint_safe(value, &ival) && ival > 0) { + spf_lib_ctx->spf_hash = rspamd_lru_hash_new( + ival, + g_free, + spf_record_cached_unref_dtor); + } + } + else { + /* Preserve compatibility */ + spf_lib_ctx->spf_hash = rspamd_lru_hash_new( + 2048, + g_free, + spf_record_cached_unref_dtor); + } +} + +static gboolean start_spf_parse(struct spf_record *rec, + struct spf_resolved_element *resolved, gchar *begin); + +/* Determine spf mech */ +static spf_mech_t +check_spf_mech(const gchar *elt, gboolean *need_shift) +{ + g_assert(elt != NULL); + + *need_shift = TRUE; + + switch (*elt) { + case '-': + return SPF_FAIL; + case '~': + return SPF_SOFT_FAIL; + case '+': + return SPF_PASS; + case '?': + return SPF_NEUTRAL; + default: + *need_shift = FALSE; + return SPF_PASS; + } +} + +static const gchar * 
+rspamd_spf_dns_action_to_str(spf_action_t act) +{ + const char *ret = "unknown"; + + switch (act) { + case SPF_RESOLVE_MX: + ret = "MX"; + break; + case SPF_RESOLVE_A: + ret = "A"; + break; + case SPF_RESOLVE_PTR: + ret = "PTR"; + break; + case SPF_RESOLVE_AAA: + ret = "AAAA"; + break; + case SPF_RESOLVE_REDIRECT: + ret = "REDIRECT"; + break; + case SPF_RESOLVE_INCLUDE: + ret = "INCLUDE"; + break; + case SPF_RESOLVE_EXISTS: + ret = "EXISTS"; + break; + case SPF_RESOLVE_EXP: + ret = "EXP"; + break; + } + + return ret; +} + +static struct spf_addr * +rspamd_spf_new_addr(struct spf_record *rec, + struct spf_resolved_element *resolved, const gchar *elt) +{ + gboolean need_shift = FALSE; + struct spf_addr *naddr; + + naddr = g_malloc0(sizeof(*naddr)); + naddr->mech = check_spf_mech(elt, &need_shift); + + if (need_shift) { + naddr->spf_string = g_strdup(elt + 1); + } + else { + naddr->spf_string = g_strdup(elt); + } + + g_ptr_array_add(resolved->elts, naddr); + naddr->prev = naddr; + naddr->next = NULL; + + return naddr; +} + +static void +rspamd_spf_free_addr(gpointer a) +{ + struct spf_addr *addr = a, *tmp, *cur; + + if (addr) { + g_free(addr->spf_string); + DL_FOREACH_SAFE(addr, cur, tmp) + { + g_free(cur); + } + } +} + +static struct spf_resolved_element * +rspamd_spf_new_addr_list(struct spf_record *rec, const gchar *domain) +{ + struct spf_resolved_element *resolved; + + resolved = g_malloc0(sizeof(*resolved)); + resolved->redirected = FALSE; + resolved->cur_domain = g_strdup(domain); + resolved->elts = g_ptr_array_new_full(8, rspamd_spf_free_addr); + + g_ptr_array_add(rec->resolved, resolved); + + return g_ptr_array_index(rec->resolved, rec->resolved->len - 1); +} + +/* + * Destructor for spf record + */ +static void +spf_record_destructor(gpointer r) +{ + struct spf_record *rec = r; + struct spf_resolved_element *elt; + guint i; + + if (rec) { + for (i = 0; i < rec->resolved->len; i++) { + elt = g_ptr_array_index(rec->resolved, i); + g_ptr_array_free(elt->elts, 
TRUE); + g_free(elt->cur_domain); + g_free(elt); + } + + g_ptr_array_free(rec->resolved, TRUE); + } +} + +static void +rspamd_flatten_record_dtor(struct spf_resolved *r) +{ + struct spf_addr *addr; + guint i; + + for (i = 0; i < r->elts->len; i++) { + addr = &g_array_index(r->elts, struct spf_addr, i); + g_free(addr->spf_string); + } + + g_free(r->top_record); + g_free(r->domain); + g_array_free(r->elts, TRUE); + g_free(r); +} + +static void +rspamd_spf_process_reference(struct spf_resolved *target, + struct spf_addr *addr, struct spf_record *rec, gboolean top) +{ + struct spf_resolved_element *elt, *relt; + struct spf_addr *cur = NULL, taddr, *cur_addr; + guint i; + + if (addr) { + g_assert(addr->m.idx < rec->resolved->len); + + elt = g_ptr_array_index(rec->resolved, addr->m.idx); + } + else { + elt = g_ptr_array_index(rec->resolved, 0); + } + + if (rec->ttl < target->ttl) { + msg_debug_spf("reducing ttl from %d to %d after subrecord processing %s", + target->ttl, rec->ttl, rec->sender_domain); + target->ttl = rec->ttl; + } + + if (elt->redirected) { + g_assert(elt->elts->len > 0); + + for (i = 0; i < elt->elts->len; i++) { + cur = g_ptr_array_index(elt->elts, i); + if (cur->flags & RSPAMD_SPF_FLAG_REDIRECT) { + break; + } + } + + g_assert(cur != NULL); + if (!(cur->flags & (RSPAMD_SPF_FLAG_PARSED | RSPAMD_SPF_FLAG_RESOLVED))) { + /* Unresolved redirect */ + msg_info_spf("redirect to %s cannot be resolved for domain %s", cur->spf_string, rec->sender_domain); + } + else { + g_assert(cur->flags & RSPAMD_SPF_FLAG_REFERENCE); + g_assert(cur->m.idx < rec->resolved->len); + relt = g_ptr_array_index(rec->resolved, cur->m.idx); + msg_debug_spf("domain %s is redirected to %s", elt->cur_domain, + relt->cur_domain); + } + } + + for (i = 0; i < elt->elts->len; i++) { + cur = g_ptr_array_index(elt->elts, i); + + if (cur->flags & RSPAMD_SPF_FLAG_TEMPFAIL) { + target->flags |= RSPAMD_SPF_RESOLVED_TEMP_FAILED; + continue; + } + if (cur->flags & RSPAMD_SPF_FLAG_PERMFAIL) { + if 
(cur->flags & RSPAMD_SPF_FLAG_REDIRECT) { + target->flags |= RSPAMD_SPF_RESOLVED_PERM_FAILED; + } + continue; + } + if (cur->flags & RSPAMD_SPF_FLAG_NA) { + target->flags |= RSPAMD_SPF_RESOLVED_NA; + continue; + } + if (cur->flags & RSPAMD_SPF_FLAG_INVALID) { + /* Ignore invalid elements */ + continue; + } + if ((cur->flags & (RSPAMD_SPF_FLAG_PARSED | RSPAMD_SPF_FLAG_RESOLVED)) != + (RSPAMD_SPF_FLAG_RESOLVED | RSPAMD_SPF_FLAG_PARSED)) { + /* Ignore unparsed addrs */ + continue; + } + if (cur->flags & RSPAMD_SPF_FLAG_REFERENCE) { + /* Process reference */ + if (cur->flags & RSPAMD_SPF_FLAG_REDIRECT) { + /* Stop on redirected domain */ + rspamd_spf_process_reference(target, cur, rec, top); + break; + } + else { + rspamd_spf_process_reference(target, cur, rec, FALSE); + } + } + else { + if ((cur->flags & RSPAMD_SPF_FLAG_ANY) && !top) { + /* Ignore wide policies in includes */ + continue; + } + + DL_FOREACH(cur, cur_addr) + { + memcpy(&taddr, cur_addr, sizeof(taddr)); + taddr.spf_string = g_strdup(cur_addr->spf_string); + g_array_append_val(target->elts, taddr); + } + } + } +} + +/* + * Parse record and flatten it to a simple structure + */ +static struct spf_resolved * +rspamd_spf_record_flatten(struct spf_record *rec) +{ + struct spf_resolved *res; + + g_assert(rec != NULL); + + res = g_malloc0(sizeof(*res)); + res->domain = g_strdup(rec->sender_domain); + res->ttl = rec->ttl; + /* Not precise but okay */ + res->timestamp = rec->task->task_timestamp; + res->digest = mum_hash_init(0xa4aa40bbeec59e2bULL); + res->top_record = g_strdup(rec->top_record); + REF_INIT_RETAIN(res, rspamd_flatten_record_dtor); + + if (rec->resolved) { + res->elts = g_array_sized_new(FALSE, FALSE, sizeof(struct spf_addr), + rec->resolved->len); + + if (rec->resolved->len > 0) { + rspamd_spf_process_reference(res, NULL, rec, TRUE); + } + } + else { + res->elts = g_array_new(FALSE, FALSE, sizeof(struct spf_addr)); + } + + return res; +} + +static gint +rspamd_spf_elts_cmp(gconstpointer a, 
gconstpointer b) +{ + struct spf_addr *addr_a, *addr_b; + + addr_a = (struct spf_addr *) a; + addr_b = (struct spf_addr *) b; + + if (addr_a->flags == addr_b->flags) { + if (addr_a->flags & RSPAMD_SPF_FLAG_ANY) { + return 0; + } + else if (addr_a->flags & RSPAMD_SPF_FLAG_IPV4) { + return (addr_a->m.dual.mask_v4 - addr_b->m.dual.mask_v4) || + memcmp(addr_a->addr4, addr_b->addr4, sizeof(addr_a->addr4)); + } + else if (addr_a->flags & RSPAMD_SPF_FLAG_IPV6) { + return (addr_a->m.dual.mask_v6 - addr_b->m.dual.mask_v6) || + memcmp(addr_a->addr6, addr_b->addr6, sizeof(addr_a->addr6)); + } + else { + return 0; + } + } + else { + if (addr_a->flags & RSPAMD_SPF_FLAG_ANY) { + return 1; + } + else if (addr_b->flags & RSPAMD_SPF_FLAG_ANY) { + return -1; + } + else if (addr_a->flags & RSPAMD_SPF_FLAG_IPV4) { + return -1; + } + + return 1; + } +} + +static void +rspamd_spf_record_postprocess(struct spf_resolved *rec, struct rspamd_task *task) +{ + g_array_sort(rec->elts, rspamd_spf_elts_cmp); + + for (guint i = 0; i < rec->elts->len; i++) { + struct spf_addr *cur_addr = &g_array_index(rec->elts, struct spf_addr, i); + + if (cur_addr->flags & RSPAMD_SPF_FLAG_IPV6) { + guint64 t[3]; + + /* + * Fill hash entry for ipv6 addr with 2 int64 from ipv6 address, + * the remaining int64 has mech + mask + */ + memcpy(t, cur_addr->addr6, sizeof(guint64) * 2); + t[2] = ((guint64) (cur_addr->mech)) << 48u; + t[2] |= cur_addr->m.dual.mask_v6; + + for (guint j = 0; j < G_N_ELEMENTS(t); j++) { + rec->digest = mum_hash_step(rec->digest, t[j]); + } + } + else if (cur_addr->flags & RSPAMD_SPF_FLAG_IPV4) { + guint64 t = 0; + + memcpy(&t, cur_addr->addr4, sizeof(guint32)); + t |= ((guint64) (cur_addr->mech)) << 48u; + t |= ((guint64) cur_addr->m.dual.mask_v4) << 32u; + + rec->digest = mum_hash_step(rec->digest, t); + } + } + + if (spf_lib_ctx->min_cache_ttl > 0) { + if (rec->ttl != 0 && rec->ttl < spf_lib_ctx->min_cache_ttl) { + msg_info_task("increasing ttl from %d to %d as it lower than a limit", + 
rec->ttl, spf_lib_ctx->min_cache_ttl); + rec->ttl = spf_lib_ctx->min_cache_ttl; + } + } +} + +static void +rspamd_spf_maybe_return(struct spf_record *rec) +{ + struct spf_resolved *flat; + struct rspamd_task *task = rec->task; + bool cached = false; + + if (rec->requests_inflight == 0 && !rec->done) { + flat = rspamd_spf_record_flatten(rec); + rspamd_spf_record_postprocess(flat, rec->task); + + if (flat->ttl > 0 && flat->flags == 0) { + + if (spf_lib_ctx->spf_hash) { + rspamd_lru_hash_insert(spf_lib_ctx->spf_hash, + g_strdup(flat->domain), + spf_record_ref(flat), + flat->timestamp, flat->ttl); + + msg_info_task("stored SPF record for %s (0x%xuL) in LRU cache for %d seconds, " + "%d/%d elements in the cache", + flat->domain, + flat->digest, + flat->ttl, + rspamd_lru_hash_size(spf_lib_ctx->spf_hash), + rspamd_lru_hash_capacity(spf_lib_ctx->spf_hash)); + cached = true; + } + } + + if (!cached) { + /* Still write a log line */ + msg_info_task("not stored SPF record for %s (0x%xuL) in LRU cache; flags=%d; ttl=%d", + flat->domain, + flat->digest, + flat->flags, + flat->ttl); + } + + rec->callback(flat, rec->task, rec->cbdata); + spf_record_unref(flat); + rec->done = TRUE; + } +} + +static gboolean +spf_check_ptr_host(struct spf_dns_cb *cb, const char *name) +{ + const char *dend, *nend, *dstart, *nstart; + struct spf_record *rec = cb->rec; + + if (cb->ptr_host != NULL) { + dstart = cb->ptr_host; + } + else { + dstart = cb->resolved->cur_domain; + } + + if (name == NULL || dstart == NULL) { + return FALSE; + } + + msg_debug_spf("check ptr %s vs %s", name, dstart); + + /* We need to check whether `cur_domain` is a subdomain for `name` */ + dend = dstart + strlen(dstart) - 1; + nstart = name; + nend = nstart + strlen(nstart) - 1; + + if (nend <= nstart || dend <= dstart) { + return FALSE; + } + /* Strip last '.' 
from names */ + if (*nend == '.') { + nend--; + } + if (*dend == '.') { + dend--; + } + if (nend <= nstart || dend <= dstart) { + return FALSE; + } + + /* Now compare from end to start */ + for (;;) { + if (g_ascii_tolower(*dend) != g_ascii_tolower(*nend)) { + msg_debug_spf("ptr records mismatch: %s and %s", dend, nend); + return FALSE; + } + + if (dend == dstart) { + break; + } + if (nend == nstart) { + /* Name is shorter than cur_domain */ + return FALSE; + } + nend--; + dend--; + } + + if (nend > nstart && *(nend - 1) != '.') { + /* Not a subdomain */ + return FALSE; + } + + return TRUE; +} + +static void +spf_record_process_addr(struct spf_record *rec, struct spf_addr *addr, struct rdns_reply_entry *reply) +{ + struct spf_addr *naddr; + + if (!(addr->flags & RSPAMD_SPF_FLAG_PROCESSED)) { + /* That's the first address */ + if (reply->type == RDNS_REQUEST_AAAA) { + memcpy(addr->addr6, + &reply->content.aaa.addr, + sizeof(addr->addr6)); + addr->flags |= RSPAMD_SPF_FLAG_IPV6; + } + else if (reply->type == RDNS_REQUEST_A) { + memcpy(addr->addr4, &reply->content.a.addr, sizeof(addr->addr4)); + addr->flags |= RSPAMD_SPF_FLAG_IPV4; + } + else { + msg_err_spf( + "internal error, bad DNS reply is treated as address: %s; domain: %s", + rdns_strtype(reply->type), + rec->sender_domain); + } + + addr->flags |= RSPAMD_SPF_FLAG_PROCESSED; + } + else { + /* We need to create a new address */ + naddr = g_malloc0(sizeof(*naddr)); + memcpy(naddr, addr, sizeof(*naddr)); + naddr->next = NULL; + naddr->prev = NULL; + + if (reply->type == RDNS_REQUEST_AAAA) { + memcpy(naddr->addr6, + &reply->content.aaa.addr, + sizeof(addr->addr6)); + naddr->flags |= RSPAMD_SPF_FLAG_IPV6; + } + else if (reply->type == RDNS_REQUEST_A) { + memcpy(naddr->addr4, &reply->content.a.addr, sizeof(addr->addr4)); + naddr->flags |= RSPAMD_SPF_FLAG_IPV4; + } + else { + msg_err_spf( + "internal error, bad DNS reply is treated as address: %s; domain: %s", + rdns_strtype(reply->type), + rec->sender_domain); + } + + 
DL_APPEND(addr, naddr); + } +} + +static void +spf_record_addr_set(struct spf_addr *addr, gboolean allow_any) +{ + guchar fill; + + if (!(addr->flags & RSPAMD_SPF_FLAG_PROCESSED)) { + if (allow_any) { + fill = 0; + addr->m.dual.mask_v4 = 0; + addr->m.dual.mask_v6 = 0; + } + else { + fill = 0xff; + } + + memset(addr->addr4, fill, sizeof(addr->addr4)); + memset(addr->addr6, fill, sizeof(addr->addr6)); + + + addr->flags |= RSPAMD_SPF_FLAG_IPV4; + addr->flags |= RSPAMD_SPF_FLAG_IPV6; + } +} + +static gboolean +spf_process_txt_record(struct spf_record *rec, struct spf_resolved_element *resolved, + struct rdns_reply *reply, struct rdns_reply_entry **pselected) +{ + struct rdns_reply_entry *elt, *selected = NULL; + gboolean ret = FALSE; + + /* + * We prefer spf version 1 as other records are mostly likely garbage + * or incorrect records (e.g. spf2 records) + */ + LL_FOREACH(reply->entries, elt) + { + if (elt->type == RDNS_REQUEST_TXT) { + if (strncmp(elt->content.txt.data, "v=spf1", sizeof("v=spf1") - 1) == 0) { + selected = elt; + + if (pselected != NULL) { + *pselected = selected; + } + + break; + } + } + } + + if (!selected) { + LL_FOREACH(reply->entries, elt) + { + /* + * Rubbish spf record? 
Let's still try to process it, but merely for + * TXT RRs + */ + if (elt->type == RDNS_REQUEST_TXT) { + if (start_spf_parse(rec, resolved, elt->content.txt.data)) { + ret = TRUE; + if (pselected != NULL) { + *pselected = elt; + } + break; + } + } + } + } + else { + ret = start_spf_parse(rec, resolved, selected->content.txt.data); + } + + return ret; +} + +static void +spf_record_dns_callback(struct rdns_reply *reply, gpointer arg) +{ + struct spf_dns_cb *cb = arg; + struct rdns_reply_entry *elt_data; + struct rspamd_task *task; + struct spf_addr *addr; + struct spf_record *rec; + const struct rdns_request_name *req_name; + bool truncated = false; + + rec = cb->rec; + task = rec->task; + + cb->rec->requests_inflight--; + addr = cb->addr; + req_name = rdns_request_get_name(reply->request, NULL); + + if (reply->flags & RDNS_TRUNCATED) { + /* Do not process truncated DNS replies */ + truncated = true; + + if (req_name) { + msg_notice_spf("got a truncated record when trying to resolve %s (%s type) for SPF domain %s", + req_name->name, rdns_str_from_type(req_name->type), + rec->sender_domain); + } + else { + msg_notice_spf("got a truncated record when trying to resolve ??? 
" + "(internal error) for SPF domain %s", + rec->sender_domain); + } + } + + if (reply->code == RDNS_RC_NOERROR && !truncated) { + + LL_FOREACH(reply->entries, elt_data) + { + /* Adjust ttl if a resolved record has lower ttl than spf record itself */ + if ((guint) elt_data->ttl < rec->ttl) { + msg_debug_spf("reducing ttl from %d to %d after DNS resolving", + rec->ttl, elt_data->ttl); + rec->ttl = elt_data->ttl; + } + + if (elt_data->type == RDNS_REQUEST_CNAME) { + /* Skip cname aliases - it must be handled by a recursor */ + continue; + } + + switch (cb->cur_action) { + case SPF_RESOLVE_MX: + if (elt_data->type == RDNS_REQUEST_MX) { + /* Now resolve A record for this MX */ + msg_debug_spf("resolve %s after resolving of MX", + elt_data->content.mx.name); + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, + RDNS_REQUEST_A, + elt_data->content.mx.name)) { + cb->rec->requests_inflight++; + } + + if (!spf_lib_ctx->disable_ipv6) { + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, + RDNS_REQUEST_AAAA, + elt_data->content.mx.name)) { + cb->rec->requests_inflight++; + } + } + else { + msg_debug_spf("skip AAAA request for MX resolution"); + } + } + else { + cb->addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + cb->addr->flags &= ~RSPAMD_SPF_FLAG_PERMFAIL; + msg_debug_spf("resolved MX addr"); + spf_record_process_addr(rec, addr, elt_data); + } + break; + case SPF_RESOLVE_A: + case SPF_RESOLVE_AAA: + cb->addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + cb->addr->flags &= ~RSPAMD_SPF_FLAG_PERMFAIL; + spf_record_process_addr(rec, addr, elt_data); + break; + case SPF_RESOLVE_PTR: + if (elt_data->type == RDNS_REQUEST_PTR) { + /* Validate returned records prior to making A requests */ + if (spf_check_ptr_host(cb, + elt_data->content.ptr.name)) { + msg_debug_spf("resolve PTR %s after resolving of PTR", + elt_data->content.ptr.name); + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, 
(void *) cb, + RDNS_REQUEST_A, + elt_data->content.ptr.name)) { + cb->rec->requests_inflight++; + } + + if (!spf_lib_ctx->disable_ipv6) { + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, + RDNS_REQUEST_AAAA, + elt_data->content.ptr.name)) { + cb->rec->requests_inflight++; + } + } + else { + msg_debug_spf("skip AAAA request for PTR resolution"); + } + } + else { + cb->addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + cb->addr->flags &= ~RSPAMD_SPF_FLAG_PERMFAIL; + } + } + else { + cb->addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + cb->addr->flags &= ~RSPAMD_SPF_FLAG_PERMFAIL; + spf_record_process_addr(rec, addr, elt_data); + } + break; + case SPF_RESOLVE_REDIRECT: + if (elt_data->type == RDNS_REQUEST_TXT) { + cb->addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + if (reply->entries) { + msg_debug_spf("got redirection record for %s: '%s'", + req_name->name, + reply->entries[0].content.txt.data); + } + + if (!spf_process_txt_record(rec, cb->resolved, reply, NULL)) { + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + } + } + + goto end; + break; + case SPF_RESOLVE_INCLUDE: + if (elt_data->type == RDNS_REQUEST_TXT) { + struct rdns_reply_entry *selected = NULL; + + cb->addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + spf_process_txt_record(rec, cb->resolved, reply, &selected); + if (selected) { + msg_debug_spf("got include record for %s: '%s'", + req_name->name, + selected->content.txt.data); + } + else { + msg_debug_spf("no include record for %s", + req_name->name); + } + } + goto end; + + break; + case SPF_RESOLVE_EXP: + break; + case SPF_RESOLVE_EXISTS: + if (elt_data->type == RDNS_REQUEST_A || + elt_data->type == RDNS_REQUEST_AAAA) { + /* + * If specified address resolves, we can accept + * connection from every IP + */ + addr->flags |= RSPAMD_SPF_FLAG_RESOLVED; + spf_record_addr_set(addr, TRUE); + } + break; + } + } + } + else if (reply->code == RDNS_RC_NXDOMAIN || reply->code == RDNS_RC_NOREC) { + switch (cb->cur_action) { + case SPF_RESOLVE_MX: + 
if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + msg_info_spf( + "spf error for domain %s: cannot find MX" + " record for %s: %s", + cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + spf_record_addr_set(addr, FALSE); + } + break; + case SPF_RESOLVE_A: + if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + msg_info_spf( + "spf error for domain %s: cannot resolve A" + " record for %s: %s", + cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + + if (rdns_request_has_type(reply->request, RDNS_REQUEST_A)) { + spf_record_addr_set(addr, FALSE); + } + } + break; + case SPF_RESOLVE_AAA: + if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + msg_info_spf( + "spf error for domain %s: cannot resolve AAAA" + " record for %s: %s", + cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + if (rdns_request_has_type(reply->request, RDNS_REQUEST_AAAA)) { + spf_record_addr_set(addr, FALSE); + } + } + break; + case SPF_RESOLVE_PTR: + if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + msg_info_spf( + "spf error for domain %s: cannot resolve PTR" + " record for %s: %s", + cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + + spf_record_addr_set(addr, FALSE); + } + break; + case SPF_RESOLVE_REDIRECT: + if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + msg_info_spf( + "spf error for domain %s: cannot resolve REDIRECT" + " record for %s: %s", + cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + } + + break; + case SPF_RESOLVE_INCLUDE: + if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + msg_info_spf( + "spf error for domain %s: cannot resolve INCLUDE" + " record for %s: %s", 
+ cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + + cb->addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + } + break; + case SPF_RESOLVE_EXP: + break; + case SPF_RESOLVE_EXISTS: + if (!(cb->addr->flags & RSPAMD_SPF_FLAG_RESOLVED)) { + msg_debug_spf( + "spf macro resolution for domain %s: cannot resolve EXISTS" + " macro for %s: %s", + cb->rec->sender_domain, + cb->resolved->cur_domain, + rdns_strerror(reply->code)); + spf_record_addr_set(addr, FALSE); + } + break; + } + } + else { + cb->addr->flags |= RSPAMD_SPF_FLAG_TEMPFAIL; + msg_info_spf( + "spf error for domain %s: cannot resolve %s DNS record for" + " %s: %s", + cb->rec->sender_domain, + rspamd_spf_dns_action_to_str(cb->cur_action), + cb->ptr_host, + rdns_strerror(reply->code)); + } + +end: + rspamd_spf_maybe_return(cb->rec); +} + +/* + * The syntax defined by the following BNF: + * [ ":" domain-spec ] [ dual-cidr-length ] + * ip4-cidr-length = "/" 1*DIGIT + * ip6-cidr-length = "/" 1*DIGIT + * dual-cidr-length = [ ip4-cidr-length ] [ "/" ip6-cidr-length ] + */ +static const gchar * +parse_spf_domain_mask(struct spf_record *rec, struct spf_addr *addr, + struct spf_resolved_element *resolved, + gboolean allow_mask) +{ + struct rspamd_task *task = rec->task; + enum { + parse_spf_elt = 0, + parse_semicolon, + parse_domain, + parse_slash, + parse_ipv4_mask, + parse_second_slash, + parse_ipv6_mask, + skip_garbage + } state = 0; + const gchar *p = addr->spf_string, *host, *c; + gchar *hostbuf; + gchar t; + guint16 cur_mask = 0; + + host = resolved->cur_domain; + c = p; + + while (*p) { + t = *p; + + switch (state) { + case parse_spf_elt: + if (t == ':' || t == '=') { + state = parse_semicolon; + } + else if (t == '/') { + /* No domain but mask */ + state = parse_slash; + } + p++; + break; + case parse_semicolon: + if (t == '/') { + /* Empty domain, technically an error */ + state = parse_slash; + } + else { + c = p; + state = parse_domain; + } + break; + case parse_domain: + if (t == 
'/') { + hostbuf = rspamd_mempool_alloc(task->task_pool, p - c + 1); + rspamd_strlcpy(hostbuf, c, p - c + 1); + host = hostbuf; + state = parse_slash; + } + p++; + break; + case parse_slash: + c = p; + if (allow_mask) { + state = parse_ipv4_mask; + } + else { + state = skip_garbage; + } + cur_mask = 0; + break; + case parse_ipv4_mask: + if (g_ascii_isdigit(t)) { + /* Ignore errors here */ + cur_mask = cur_mask * 10 + (t - '0'); + } + else if (t == '/') { + if (cur_mask <= 32) { + addr->m.dual.mask_v4 = cur_mask; + } + else { + msg_notice_spf("bad ipv4 mask for %s: %d", + rec->sender_domain, cur_mask); + } + state = parse_second_slash; + } + p++; + break; + case parse_second_slash: + c = p; + state = parse_ipv6_mask; + cur_mask = 0; + break; + case parse_ipv6_mask: + if (g_ascii_isdigit(t)) { + /* Ignore errors here */ + cur_mask = cur_mask * 10 + (t - '0'); + } + p++; + break; + case skip_garbage: + p++; + break; + } + } + + /* Process end states */ + if (state == parse_ipv4_mask) { + if (cur_mask <= 32) { + addr->m.dual.mask_v4 = cur_mask; + } + else { + msg_notice_spf("bad ipv4 mask for %s: %d", rec->sender_domain, cur_mask); + } + } + else if (state == parse_ipv6_mask) { + if (cur_mask <= 128) { + addr->m.dual.mask_v6 = cur_mask; + } + else { + msg_notice_spf("bad ipv6 mask: %d", cur_mask); + } + } + else if (state == parse_domain && p - c > 0) { + hostbuf = rspamd_mempool_alloc(task->task_pool, p - c + 1); + rspamd_strlcpy(hostbuf, c, p - c + 1); + host = hostbuf; + } + + if (cur_mask == 0) { + addr->m.dual.mask_v4 = 32; + addr->m.dual.mask_v6 = 64; + } + + return host; +} + +static gboolean +parse_spf_a(struct spf_record *rec, + struct spf_resolved_element *resolved, struct spf_addr *addr) +{ + struct spf_dns_cb *cb; + const gchar *host = NULL; + struct rspamd_task *task = rec->task; + + CHECK_REC(rec); + + host = parse_spf_domain_mask(rec, addr, resolved, TRUE); + + if (host == NULL) { + return FALSE; + } + + rec->dns_requests++; + cb = 
rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + cb->rec = rec; + cb->ptr_host = host; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_A; + cb->resolved = resolved; + msg_debug_spf("resolve a %s", host); + + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_A, host)) { + rec->requests_inflight++; + + cb = rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + cb->rec = rec; + cb->ptr_host = host; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_AAA; + cb->resolved = resolved; + + if (!spf_lib_ctx->disable_ipv6) { + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_AAAA, host)) { + rec->requests_inflight++; + } + } + else { + msg_debug_spf("skip AAAA request for a record resolution"); + } + + return TRUE; + } + else { + msg_notice_spf("unresolvable A element for %s: %s", addr->spf_string, + rec->sender_domain); + } + + return FALSE; +} + +static gboolean +parse_spf_ptr(struct spf_record *rec, + struct spf_resolved_element *resolved, struct spf_addr *addr) +{ + struct spf_dns_cb *cb; + const gchar *host; + gchar *ptr; + struct rspamd_task *task = rec->task; + + CHECK_REC(rec); + + host = parse_spf_domain_mask(rec, addr, resolved, FALSE); + + rec->dns_requests++; + cb = rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + cb->rec = rec; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_PTR; + cb->resolved = resolved; + cb->ptr_host = rspamd_mempool_strdup(task->task_pool, host); + ptr = + rdns_generate_ptr_from_str(rspamd_inet_address_to_string( + task->from_addr)); + + if (ptr == NULL) { + return FALSE; + } + + rspamd_mempool_add_destructor(task->task_pool, free, ptr); + msg_debug_spf("resolve ptr %s for %s", ptr, host); + + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_PTR, ptr)) { + rec->requests_inflight++; + rec->ttl = 0; + msg_debug_spf("disable SPF 
caching as there is PTR expansion"); + + return TRUE; + } + else { + msg_notice_spf("unresolvable PTR element for %s: %s", addr->spf_string, + rec->sender_domain); + } + + return FALSE; +} + +static gboolean +parse_spf_mx(struct spf_record *rec, + struct spf_resolved_element *resolved, struct spf_addr *addr) +{ + struct spf_dns_cb *cb; + const gchar *host; + struct rspamd_task *task = rec->task; + + CHECK_REC(rec); + + host = parse_spf_domain_mask(rec, addr, resolved, TRUE); + + if (host == NULL) { + return FALSE; + } + + rec->dns_requests++; + cb = rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + cb->rec = rec; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_MX; + cb->ptr_host = host; + cb->resolved = resolved; + + msg_debug_spf("resolve mx for %s", host); + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_MX, host)) { + rec->requests_inflight++; + + return TRUE; + } + + return FALSE; +} + +static gboolean +parse_spf_all(struct spf_record *rec, struct spf_addr *addr) +{ + /* All is 0/0 */ + memset(&addr->addr4, 0, sizeof(addr->addr4)); + memset(&addr->addr6, 0, sizeof(addr->addr6)); + /* Here we set all masks to 0 */ + addr->m.idx = 0; + addr->flags |= RSPAMD_SPF_FLAG_ANY | RSPAMD_SPF_FLAG_RESOLVED; + msg_debug_spf("parsed all elt"); + + /* Disallow +all */ + if (addr->mech == SPF_PASS) { + addr->flags |= RSPAMD_SPF_FLAG_INVALID; + msg_notice_spf("domain %s allows any SPF (+all), ignore SPF record completely", + rec->sender_domain); + } + + return TRUE; +} + +static gboolean +parse_spf_ip4(struct spf_record *rec, struct spf_addr *addr) +{ + /* ip4:addr[/mask] */ + const gchar *semicolon, *slash; + gsize len; + gchar ipbuf[INET_ADDRSTRLEN + 1]; + guint32 mask; + static const guint32 min_valid_mask = 8; + + semicolon = strchr(addr->spf_string, ':'); + + if (semicolon == NULL) { + semicolon = strchr(addr->spf_string, '='); + + if (semicolon == NULL) { + msg_notice_spf("invalid ip4 element for 
%s: %s, no '=' or ':'", addr->spf_string, + rec->sender_domain); + return FALSE; + } + } + + semicolon++; + slash = strchr(semicolon, '/'); + + if (slash) { + len = slash - semicolon; + } + else { + len = strlen(semicolon); + } + + rspamd_strlcpy(ipbuf, semicolon, MIN(len + 1, sizeof(ipbuf))); + + if (inet_pton(AF_INET, ipbuf, addr->addr4) != 1) { + msg_notice_spf("invalid ip4 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + + if (slash) { + gchar *end = NULL; + + mask = strtoul(slash + 1, &end, 10); + if (mask > 32) { + msg_notice_spf("invalid mask for ip4 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + + if (end != NULL && !g_ascii_isspace(*end) && *end != '\0') { + /* Invalid mask definition */ + msg_notice_spf("invalid mask for ip4 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + + addr->m.dual.mask_v4 = mask; + + if (mask < min_valid_mask) { + addr->flags |= RSPAMD_SPF_FLAG_INVALID; + msg_notice_spf("too wide SPF record for %s: %s/%d", + rec->sender_domain, + ipbuf, addr->m.dual.mask_v4); + } + } + else { + addr->m.dual.mask_v4 = 32; + } + + addr->flags |= RSPAMD_SPF_FLAG_IPV4 | RSPAMD_SPF_FLAG_RESOLVED; + msg_debug_spf("parsed ipv4 record %s/%d", ipbuf, addr->m.dual.mask_v4); + + return TRUE; +} + +static gboolean +parse_spf_ip6(struct spf_record *rec, struct spf_addr *addr) +{ + /* ip6:addr[/mask] */ + const gchar *semicolon, *slash; + gsize len; + gchar ipbuf[INET6_ADDRSTRLEN + 1]; + guint32 mask; + static const guint32 min_valid_mask = 8; + + semicolon = strchr(addr->spf_string, ':'); + + if (semicolon == NULL) { + semicolon = strchr(addr->spf_string, '='); + + if (semicolon == NULL) { + msg_notice_spf("invalid ip6 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + } + + semicolon++; + slash = strchr(semicolon, '/'); + + if (slash) { + len = slash - semicolon; + } + else { + len = strlen(semicolon); + } + + 
rspamd_strlcpy(ipbuf, semicolon, MIN(len + 1, sizeof(ipbuf))); + + if (inet_pton(AF_INET6, ipbuf, addr->addr6) != 1) { + msg_notice_spf("invalid ip6 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + + if (slash) { + gchar *end = NULL; + mask = strtoul(slash + 1, &end, 10); + if (mask > 128) { + msg_notice_spf("invalid mask for ip6 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + + if (end != NULL && !g_ascii_isspace(*end) && *end != '\0') { + /* Invalid mask definition */ + msg_notice_spf("invalid mask for ip4 element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + + addr->m.dual.mask_v6 = mask; + + if (mask < min_valid_mask) { + addr->flags |= RSPAMD_SPF_FLAG_INVALID; + msg_notice_spf("too wide SPF record for %s: %s/%d", + rec->sender_domain, + ipbuf, addr->m.dual.mask_v6); + } + } + else { + addr->m.dual.mask_v6 = 128; + } + + addr->flags |= RSPAMD_SPF_FLAG_IPV6 | RSPAMD_SPF_FLAG_RESOLVED; + msg_debug_spf("parsed ipv6 record %s/%d", ipbuf, addr->m.dual.mask_v6); + + return TRUE; +} + + +static gboolean +parse_spf_include(struct spf_record *rec, struct spf_addr *addr) +{ + struct spf_dns_cb *cb; + const gchar *domain; + struct rspamd_task *task = rec->task; + + CHECK_REC(rec); + domain = strchr(addr->spf_string, ':'); + + if (domain == NULL) { + /* Common mistake */ + domain = strchr(addr->spf_string, '='); + + if (domain == NULL) { + msg_notice_spf("invalid include element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + } + + domain++; + + rec->dns_requests++; + + cb = rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + cb->rec = rec; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_INCLUDE; + addr->m.idx = rec->resolved->len; + cb->resolved = rspamd_spf_new_addr_list(rec, domain); + cb->ptr_host = domain; + /* Set reference */ + addr->flags |= RSPAMD_SPF_FLAG_REFERENCE; + msg_debug_spf("resolve include %s", domain); + + if 
(rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_TXT, domain)) { + rec->requests_inflight++; + + return TRUE; + } + else { + msg_notice_spf("unresolvable include element for %s: %s", addr->spf_string, + rec->sender_domain); + } + + + return FALSE; +} + +static gboolean +parse_spf_exp(struct spf_record *rec, struct spf_addr *addr) +{ + msg_info_spf("exp record is ignored"); + return TRUE; +} + +static gboolean +parse_spf_redirect(struct spf_record *rec, + struct spf_resolved_element *resolved, struct spf_addr *addr) +{ + struct spf_dns_cb *cb; + const gchar *domain; + struct rspamd_task *task = rec->task; + + CHECK_REC(rec); + + domain = strchr(addr->spf_string, '='); + + if (domain == NULL) { + /* Common mistake */ + domain = strchr(addr->spf_string, ':'); + + if (domain == NULL) { + msg_notice_spf("invalid redirect element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + } + + domain++; + + rec->dns_requests++; + resolved->redirected = TRUE; + + cb = rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + /* Set reference */ + addr->flags |= RSPAMD_SPF_FLAG_REFERENCE | RSPAMD_SPF_FLAG_REDIRECT; + addr->m.idx = rec->resolved->len; + + cb->rec = rec; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_REDIRECT; + cb->resolved = rspamd_spf_new_addr_list(rec, domain); + cb->ptr_host = domain; + msg_debug_spf("resolve redirect %s", domain); + + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_TXT, domain)) { + rec->requests_inflight++; + + return TRUE; + } + else { + msg_notice_spf("unresolvable redirect element for %s: %s", addr->spf_string, + rec->sender_domain); + } + + return FALSE; +} + +static gboolean +parse_spf_exists(struct spf_record *rec, struct spf_addr *addr) +{ + struct spf_dns_cb *cb; + const gchar *host; + struct rspamd_task *task = rec->task; + struct spf_resolved_element *resolved; + + resolved = 
g_ptr_array_index(rec->resolved, rec->resolved->len - 1); + CHECK_REC(rec); + + host = strchr(addr->spf_string, ':'); + if (host == NULL) { + host = strchr(addr->spf_string, '='); + + if (host == NULL) { + msg_notice_spf("invalid exists element for %s: %s", addr->spf_string, + rec->sender_domain); + return FALSE; + } + } + + host++; + rec->dns_requests++; + + cb = rspamd_mempool_alloc(task->task_pool, sizeof(struct spf_dns_cb)); + cb->rec = rec; + cb->addr = addr; + cb->cur_action = SPF_RESOLVE_EXISTS; + cb->resolved = resolved; + cb->ptr_host = host; + + msg_debug_spf("resolve exists %s", host); + if (rspamd_dns_resolver_request_task_forced(task, + spf_record_dns_callback, (void *) cb, RDNS_REQUEST_A, host)) { + rec->requests_inflight++; + + return TRUE; + } + else { + msg_notice_spf("unresolvable exists element for %s: %s", addr->spf_string, + rec->sender_domain); + } + + return FALSE; +} + +static gsize +rspamd_spf_split_elt(const gchar *val, gsize len, gint *pos, + gsize poslen, gchar delim) +{ + const gchar *p, *end; + guint cur_pos = 0, cur_st = 0, nsub = 0; + + p = val; + end = val + len; + + while (p < end && cur_pos + 2 < poslen) { + if (*p == delim) { + if (p - val > cur_st) { + pos[cur_pos] = cur_st; + pos[cur_pos + 1] = p - val; + cur_st = p - val + 1; + cur_pos += 2; + nsub++; + } + + p++; + } + else { + p++; + } + } + + if (cur_pos + 2 < poslen) { + if (end - val > cur_st) { + pos[cur_pos] = cur_st; + pos[cur_pos + 1] = end - val; + nsub++; + } + } + else { + pos[cur_pos] = p - val; + pos[cur_pos + 1] = end - val; + nsub++; + } + + return nsub; +} + +static gsize +rspamd_spf_process_substitution(const gchar *macro_value, + gsize macro_len, guint ndelim, gchar delim, gboolean reversed, + gchar *dest) +{ + gchar *d = dest; + const gchar canon_delim = '.'; + guint vlen, i; + gint pos[49 * 2], tlen; + + if (!reversed && ndelim == 0 && delim == canon_delim) { + /* Trivial case */ + memcpy(dest, macro_value, macro_len); + + return macro_len; + } + + vlen = 
rspamd_spf_split_elt(macro_value, macro_len, + pos, G_N_ELEMENTS(pos), delim); + + if (vlen > 0) { + if (reversed) { + for (i = vlen - 1;; i--) { + tlen = pos[i * 2 + 1] - pos[i * 2]; + + if (i != 0) { + memcpy(d, ¯o_value[pos[i * 2]], tlen); + d += tlen; + *d++ = canon_delim; + } + else { + memcpy(d, ¯o_value[pos[i * 2]], tlen); + d += tlen; + break; + } + } + } + else { + for (i = 0; i < vlen; i++) { + tlen = pos[i * 2 + 1] - pos[i * 2]; + + if (i != vlen - 1) { + memcpy(d, ¯o_value[pos[i * 2]], tlen); + d += tlen; + *d++ = canon_delim; + } + else { + memcpy(d, ¯o_value[pos[i * 2]], tlen); + d += tlen; + } + } + } + } + else { + /* Trivial case */ + memcpy(dest, macro_value, macro_len); + + return macro_len; + } + + return (d - dest); +} + +static const gchar * +expand_spf_macro(struct spf_record *rec, struct spf_resolved_element *resolved, + const gchar *begin) +{ + const gchar *p, *macro_value = NULL; + gchar *c, *new, *tmp, delim = '.'; + gsize len = 0, macro_len = 0; + gint state = 0, ndelim = 0; + gchar ip_buf[64 + 1]; /* cannot use INET6_ADDRSTRLEN as we use ptr lookup */ + gboolean need_expand = FALSE, reversed; + struct rspamd_task *task; + + g_assert(rec != NULL); + g_assert(begin != NULL); + + task = rec->task; + p = begin; + /* Calculate length */ + while (*p) { + switch (state) { + case 0: + /* Skip any character and wait for % in input */ + if (*p == '%') { + state = 1; + } + else { + len++; + } + + p++; + break; + case 1: + /* We got % sign, so we should whether wait for { or for - or for _ or for % */ + if (*p == '%' || *p == '_') { + /* Just a single % sign or space */ + len++; + state = 0; + } + else if (*p == '-') { + /* %20 */ + len += sizeof("%20") - 1; + state = 0; + } + else if (*p == '{') { + state = 2; + } + else { + /* Something unknown */ + msg_notice_spf( + "spf error for domain %s: unknown spf element", + rec->sender_domain); + return begin; + } + p++; + + break; + case 2: + /* Read macro name */ + switch (g_ascii_tolower(*p)) { + case 
'i': + len += sizeof(ip_buf) - 1; + break; + case 's': + if (rec->sender) { + len += strlen(rec->sender); + } + else { + len += sizeof("unknown") - 1; + } + break; + case 'l': + if (rec->local_part) { + len += strlen(rec->local_part); + } + else { + len += sizeof("unknown") - 1; + } + break; + case 'o': + if (rec->sender_domain) { + len += strlen(rec->sender_domain); + } + else { + len += sizeof("unknown") - 1; + } + break; + case 'd': + if (resolved->cur_domain) { + len += strlen(resolved->cur_domain); + } + else { + len += sizeof("unknown") - 1; + } + break; + case 'v': + len += sizeof("in-addr") - 1; + break; + case 'h': + if (task->helo) { + len += strlen(task->helo); + } + else { + len += sizeof("unknown") - 1; + } + break; + default: + msg_notice_spf( + "spf error for domain %s: unknown or " + "unsupported spf macro %c in %s", + rec->sender_domain, + *p, + begin); + return begin; + } + p++; + state = 3; + break; + case 3: + /* Read modifier */ + if (*p == '}') { + state = 0; + need_expand = TRUE; + } + p++; + break; + + default: + g_assert_not_reached(); + } + } + + if (!need_expand) { + /* No expansion needed */ + return begin; + } + + new = rspamd_mempool_alloc(task->task_pool, len + 1); + + /* Reduce TTL to avoid caching of records with macros */ + if (rec->ttl != 0) { + rec->ttl = 0; + msg_debug_spf("disable SPF caching as there is macro expansion"); + } + + c = new; + p = begin; + state = 0; + /* Begin macro expansion */ + + while (*p) { + switch (state) { + case 0: + /* Skip any character and wait for % in input */ + if (*p == '%') { + state = 1; + } + else { + *c = *p; + c++; + } + + p++; + break; + case 1: + /* We got % sign, so we should whether wait for { or for - or for _ or for % */ + if (*p == '%') { + /* Just a single % sign or space */ + *c++ = '%'; + state = 0; + } + else if (*p == '_') { + *c++ = ' '; + state = 0; + } + else if (*p == '-') { + /* %20 */ + *c++ = '%'; + *c++ = '2'; + *c++ = '0'; + state = 0; + } + else if (*p == '{') { + state 
= 2; + } + else { + /* Something unknown */ + msg_info_spf( + "spf error for domain %s: unknown spf element", + rec->sender_domain); + return begin; + } + p++; + break; + case 2: + /* Read macro name */ + switch (g_ascii_tolower(*p)) { + case 'i': + if (task->from_addr) { + if (rspamd_inet_address_get_af(task->from_addr) == AF_INET) { + macro_len = rspamd_strlcpy(ip_buf, + rspamd_inet_address_to_string(task->from_addr), + sizeof(ip_buf)); + macro_value = ip_buf; + } + else if (rspamd_inet_address_get_af(task->from_addr) == AF_INET6) { + /* See #3625 for details */ + socklen_t slen; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) + rspamd_inet_address_get_sa(task->from_addr, &slen); + + /* Expand IPv6 address */ +#define IPV6_OCTET(x) bytes[(x)] >> 4, bytes[(x)] & 0xF + unsigned char *bytes = (unsigned char *) &sin6->sin6_addr; + macro_len = rspamd_snprintf(ip_buf, sizeof(ip_buf), + "%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd." + "%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd.%xd", + IPV6_OCTET(0), IPV6_OCTET(1), + IPV6_OCTET(2), IPV6_OCTET(3), + IPV6_OCTET(4), IPV6_OCTET(5), + IPV6_OCTET(6), IPV6_OCTET(7), + IPV6_OCTET(8), IPV6_OCTET(9), + IPV6_OCTET(10), IPV6_OCTET(11), + IPV6_OCTET(12), IPV6_OCTET(13), + IPV6_OCTET(14), IPV6_OCTET(15)); + macro_value = ip_buf; +#undef IPV6_OCTET + } + else { + macro_len = rspamd_snprintf(ip_buf, sizeof(ip_buf), + "127.0.0.1"); + macro_value = ip_buf; + } + } + else { + macro_len = rspamd_snprintf(ip_buf, sizeof(ip_buf), + "127.0.0.1"); + macro_value = ip_buf; + } + break; + case 's': + if (rec->sender) { + macro_len = strlen(rec->sender); + macro_value = rec->sender; + } + else { + macro_len = sizeof("unknown") - 1; + macro_value = "unknown"; + } + break; + case 'l': + if (rec->local_part) { + macro_len = strlen(rec->local_part); + macro_value = rec->local_part; + } + else { + macro_len = sizeof("unknown") - 1; + macro_value = "unknown"; + } + break; + case 'o': + if (rec->sender_domain) { 
+ macro_len = strlen(rec->sender_domain); + macro_value = rec->sender_domain; + } + else { + macro_len = sizeof("unknown") - 1; + macro_value = "unknown"; + } + break; + case 'd': + if (resolved && resolved->cur_domain) { + macro_len = strlen(resolved->cur_domain); + macro_value = resolved->cur_domain; + } + else { + macro_len = sizeof("unknown") - 1; + macro_value = "unknown"; + } + break; + case 'v': + if (task->from_addr) { + if (rspamd_inet_address_get_af(task->from_addr) == AF_INET) { + macro_len = sizeof("in-addr") - 1; + macro_value = "in-addr"; + } + else { + macro_len = sizeof("ip6") - 1; + macro_value = "ip6"; + } + } + else { + macro_len = sizeof("in-addr") - 1; + macro_value = "in-addr"; + } + break; + case 'h': + if (task->helo) { + tmp = strchr(task->helo, '@'); + if (tmp) { + macro_len = strlen(tmp + 1); + macro_value = tmp + 1; + } + else { + macro_len = strlen(task->helo); + macro_value = task->helo; + } + } + else { + macro_len = sizeof("unknown") - 1; + macro_value = "unknown"; + } + break; + default: + msg_info_spf( + "spf error for domain %s: unknown or " + "unsupported spf macro %c in %s", + rec->sender_domain, + *p, + begin); + return begin; + } + + p++; + state = 3; + ndelim = 0; + delim = '.'; + reversed = FALSE; + break; + + case 3: + /* Read modifier */ + if (*p == '}') { + state = 0; + len = rspamd_spf_process_substitution(macro_value, + macro_len, ndelim, delim, reversed, c); + c += len; + } + else if (*p == 'r' && len != 0) { + reversed = TRUE; + } + else if (g_ascii_isdigit(*p)) { + ndelim = strtoul(p, &tmp, 10); + + if (tmp == NULL || tmp == p) { + p++; + } + else { + p = tmp; + + continue; + } + } + else if (*p == '+' || *p == '-' || + *p == '.' 
|| *p == ',' || *p == '/' || *p == '_' || + *p == '=') { + delim = *p; + } + else { + msg_info_spf("spf error for domain %s: unknown or " + "unsupported spf macro %c in %s", + rec->sender_domain, + *p, + begin); + return begin; + } + p++; + break; + } + } + /* Null terminate */ + *c = '\0'; + + return new; +} + +/* Read current element and try to parse record */ +static gboolean +spf_process_element(struct spf_record *rec, + struct spf_resolved_element *resolved, + const gchar *elt, + const gchar **elts) +{ + struct spf_addr *addr = NULL; + gboolean res = FALSE; + const gchar *begin; + gchar t; + + g_assert(elt != NULL); + g_assert(rec != NULL); + + if (*elt == '\0' || resolved->redirected) { + return TRUE; + } + + begin = expand_spf_macro(rec, resolved, elt); + addr = rspamd_spf_new_addr(rec, resolved, begin); + g_assert(addr != NULL); + t = g_ascii_tolower(addr->spf_string[0]); + begin = addr->spf_string; + + /* Now check what we have */ + switch (t) { + case 'a': + /* all or a */ + if (g_ascii_strncasecmp(begin, SPF_ALL, + sizeof(SPF_ALL) - 1) == 0) { + res = parse_spf_all(rec, addr); + } + else if (g_ascii_strncasecmp(begin, SPF_A, + sizeof(SPF_A) - 1) == 0) { + res = parse_spf_a(rec, resolved, addr); + } + else { + msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + } + break; + case 'i': + /* include or ip4 */ + if (g_ascii_strncasecmp(begin, SPF_IP4, sizeof(SPF_IP4) - 1) == 0) { + res = parse_spf_ip4(rec, addr); + } + else if (g_ascii_strncasecmp(begin, SPF_INCLUDE, sizeof(SPF_INCLUDE) - 1) == 0) { + res = parse_spf_include(rec, addr); + } + else if (g_ascii_strncasecmp(begin, SPF_IP6, sizeof(SPF_IP6) - 1) == 0) { + res = parse_spf_ip6(rec, addr); + } + else if (g_ascii_strncasecmp(begin, SPF_IP4_ALT, sizeof(SPF_IP4_ALT) - 1) == 0) { + res = parse_spf_ip4(rec, addr); + } + else if (g_ascii_strncasecmp(begin, SPF_IP6_ALT, sizeof(SPF_IP6_ALT) - 1) == 0) { + res = parse_spf_ip6(rec, addr); + } + else { + 
msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + } + break; + case 'm': + /* mx */ + if (g_ascii_strncasecmp(begin, SPF_MX, sizeof(SPF_MX) - 1) == 0) { + res = parse_spf_mx(rec, resolved, addr); + } + else { + msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + } + break; + case 'p': + /* ptr */ + if (g_ascii_strncasecmp(begin, SPF_PTR, + sizeof(SPF_PTR) - 1) == 0) { + res = parse_spf_ptr(rec, resolved, addr); + } + else { + msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + } + break; + case 'e': + /* exp or exists */ + if (g_ascii_strncasecmp(begin, SPF_EXP, + sizeof(SPF_EXP) - 1) == 0) { + res = parse_spf_exp(rec, addr); + } + else if (g_ascii_strncasecmp(begin, SPF_EXISTS, + sizeof(SPF_EXISTS) - 1) == 0) { + res = parse_spf_exists(rec, addr); + } + else { + msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + } + break; + case 'r': + /* redirect */ + if (g_ascii_strncasecmp(begin, SPF_REDIRECT, + sizeof(SPF_REDIRECT) - 1) == 0) { + /* + * According to https://tools.ietf.org/html/rfc7208#section-6.1 + * There must be no ALL element anywhere in the record, + * redirect must be ignored + */ + gboolean ignore_redirect = FALSE; + + for (const gchar **tmp = elts; *tmp != NULL; tmp++) { + if (g_ascii_strcasecmp((*tmp) + 1, "all") == 0) { + ignore_redirect = TRUE; + break; + } + } + + if (!ignore_redirect) { + res = parse_spf_redirect(rec, resolved, addr); + } + else { + msg_notice_spf("ignore SPF redirect (%s) for domain %s as there is also all element", + begin, rec->sender_domain); + + /* Pop the current addr as it is ignored */ + g_ptr_array_remove_index_fast(resolved->elts, + resolved->elts->len - 1); + + return TRUE; + } + } + else { + msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + } + break; + case 'v': + if (g_ascii_strncasecmp(begin, "v=spf", + 
sizeof("v=spf") - 1) == 0) { + /* Skip this element till the end of record */ + while (*begin && !g_ascii_isspace(*begin)) { + begin++; + } + } + break; + default: + msg_notice_spf("spf error for domain %s: bad spf command %s", + rec->sender_domain, begin); + break; + } + + if (res) { + addr->flags |= RSPAMD_SPF_FLAG_PARSED; + } + + return res; +} + +static void +parse_spf_scopes(struct spf_record *rec, gchar **begin) +{ + for (;;) { + if (g_ascii_strncasecmp(*begin, SPF_SCOPE_PRA, sizeof(SPF_SCOPE_PRA) - 1) == 0) { + *begin += sizeof(SPF_SCOPE_PRA) - 1; + /* XXX: Implement actual PRA check */ + /* extract_pra_info (rec); */ + continue; + } + else if (g_ascii_strncasecmp(*begin, SPF_SCOPE_MFROM, + sizeof(SPF_SCOPE_MFROM) - 1) == 0) { + /* mfrom is standard spf1 check */ + *begin += sizeof(SPF_SCOPE_MFROM) - 1; + continue; + } + else if (**begin != ',') { + break; + } + (*begin)++; + } +} + +static gboolean +start_spf_parse(struct spf_record *rec, struct spf_resolved_element *resolved, + gchar *begin) +{ + gchar **elts, **cur_elt; + gsize len; + + /* Skip spaces */ + while (g_ascii_isspace(*begin)) { + begin++; + } + + len = strlen(begin); + + if (g_ascii_strncasecmp(begin, SPF_VER1_STR, sizeof(SPF_VER1_STR) - 1) == + 0) { + begin += sizeof(SPF_VER1_STR) - 1; + + while (g_ascii_isspace(*begin) && *begin) { + begin++; + } + } + else if (g_ascii_strncasecmp(begin, SPF_VER2_STR, sizeof(SPF_VER2_STR) - 1) == 0) { + /* Skip one number of record, so no we are here spf2.0/ */ + begin += sizeof(SPF_VER2_STR); + if (*begin != '/') { + msg_notice_spf("spf error for domain %s: sender id is invalid", + rec->sender_domain); + } + else { + begin++; + parse_spf_scopes(rec, &begin); + } + /* Now common spf record */ + } + else { + msg_debug_spf( + "spf error for domain %s: bad spf record start: %*s", + rec->sender_domain, + (gint) len, + begin); + + return FALSE; + } + + while (g_ascii_isspace(*begin) && *begin) { + begin++; + } + + elts = g_strsplit_set(begin, " ", 0); + + if 
(elts) { + cur_elt = elts; + + while (*cur_elt) { + spf_process_element(rec, resolved, *cur_elt, (const gchar **) elts); + cur_elt++; + } + + g_strfreev(elts); + } + + rspamd_spf_maybe_return(rec); + + return TRUE; +} + +static void +spf_dns_callback(struct rdns_reply *reply, gpointer arg) +{ + struct spf_record *rec = arg; + struct spf_resolved_element *resolved = NULL; + struct spf_addr *addr; + + rec->requests_inflight--; + + if (reply->flags & RDNS_TRUNCATED) { + msg_warn_spf("got a truncated record when trying to resolve TXT record for %s", + rec->sender_domain); + resolved = rspamd_spf_new_addr_list(rec, rec->sender_domain); + addr = g_malloc0(sizeof(*addr)); + addr->flags |= RSPAMD_SPF_FLAG_TEMPFAIL; + g_ptr_array_insert(resolved->elts, 0, addr); + + rspamd_spf_maybe_return(rec); + + return; + } + else { + if (reply->code == RDNS_RC_NOERROR) { + resolved = rspamd_spf_new_addr_list(rec, rec->sender_domain); + if (rec->resolved->len == 1) { + /* Top level resolved element */ + rec->ttl = reply->entries->ttl; + } + } + else if ((reply->code == RDNS_RC_NOREC || reply->code == RDNS_RC_NXDOMAIN) && rec->dns_requests == 0) { + resolved = rspamd_spf_new_addr_list(rec, rec->sender_domain); + addr = g_malloc0(sizeof(*addr)); + addr->flags |= RSPAMD_SPF_FLAG_NA; + g_ptr_array_insert(resolved->elts, 0, addr); + } + else if (reply->code != RDNS_RC_NOREC && reply->code != RDNS_RC_NXDOMAIN && rec->dns_requests == 0) { + resolved = rspamd_spf_new_addr_list(rec, rec->sender_domain); + addr = g_malloc0(sizeof(*addr)); + addr->flags |= RSPAMD_SPF_FLAG_TEMPFAIL; + g_ptr_array_insert(resolved->elts, 0, addr); + } + } + + if (resolved) { + struct rdns_reply_entry *selected = NULL; + + if (!spf_process_txt_record(rec, resolved, reply, &selected)) { + resolved = g_ptr_array_index(rec->resolved, 0); + + if (rec->resolved->len > 1) { + addr = g_ptr_array_index(resolved->elts, 0); + if ((reply->code == RDNS_RC_NOREC || reply->code == RDNS_RC_NXDOMAIN) && (addr->flags & 
RSPAMD_SPF_FLAG_REDIRECT)) { + addr->flags |= RSPAMD_SPF_FLAG_PERMFAIL; + } + else { + addr->flags |= RSPAMD_SPF_FLAG_TEMPFAIL; + } + } + else { + addr = g_malloc0(sizeof(*addr)); + + if (reply->code == RDNS_RC_NOREC || reply->code == RDNS_RC_NXDOMAIN || reply->code == RDNS_RC_NOERROR) { + addr->flags |= RSPAMD_SPF_FLAG_NA; + } + else { + addr->flags |= RSPAMD_SPF_FLAG_TEMPFAIL; + } + g_ptr_array_insert(resolved->elts, 0, addr); + } + } + else { + rec->top_record = rspamd_mempool_strdup(rec->task->task_pool, + selected->content.txt.data); + rspamd_mempool_set_variable(rec->task->task_pool, + RSPAMD_MEMPOOL_SPF_RECORD, + (gpointer) rec->top_record, NULL); + } + } + + rspamd_spf_maybe_return(rec); +} + +static struct rspamd_spf_cred * +rspamd_spf_cache_domain(struct rspamd_task *task) +{ + struct rspamd_email_address *addr; + struct rspamd_spf_cred *cred = NULL; + + addr = rspamd_task_get_sender(task); + if (!addr || (addr->flags & RSPAMD_EMAIL_ADDR_EMPTY)) { + /* Get domain from helo */ + + if (task->helo) { + GString *fs = g_string_new(""); + + cred = rspamd_mempool_alloc(task->task_pool, sizeof(*cred)); + cred->domain = task->helo; + cred->local_part = "postmaster"; + rspamd_printf_gstring(fs, "postmaster@%s", cred->domain); + cred->sender = fs->str; + rspamd_mempool_add_destructor(task->task_pool, + rspamd_gstring_free_hard, fs); + } + } + else { + rspamd_ftok_t tok; + + cred = rspamd_mempool_alloc(task->task_pool, sizeof(*cred)); + tok.begin = addr->domain; + tok.len = addr->domain_len; + cred->domain = rspamd_mempool_ftokdup(task->task_pool, &tok); + tok.begin = addr->user; + tok.len = addr->user_len; + cred->local_part = rspamd_mempool_ftokdup(task->task_pool, &tok); + tok.begin = addr->addr; + tok.len = addr->addr_len; + cred->sender = rspamd_mempool_ftokdup(task->task_pool, &tok); + } + + if (cred) { + rspamd_mempool_set_variable(task->task_pool, RSPAMD_MEMPOOL_SPF_DOMAIN, + cred, NULL); + } + + return cred; +} + +struct rspamd_spf_cred * 
+rspamd_spf_get_cred(struct rspamd_task *task) +{ + struct rspamd_spf_cred *cred; + + cred = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_SPF_DOMAIN); + + if (!cred) { + cred = rspamd_spf_cache_domain(task); + } + + return cred; +} + +const gchar * +rspamd_spf_get_domain(struct rspamd_task *task) +{ + gchar *domain = NULL; + struct rspamd_spf_cred *cred; + + cred = rspamd_spf_get_cred(task); + + if (cred) { + domain = cred->domain; + } + + return domain; +} + +gboolean +rspamd_spf_resolve(struct rspamd_task *task, spf_cb_t callback, + gpointer cbdata, struct rspamd_spf_cred *cred) +{ + struct spf_record *rec; + + if (!cred || !cred->domain) { + return FALSE; + } + + /* First lookup in the hash */ + if (spf_lib_ctx->spf_hash) { + struct spf_resolved *cached; + + cached = rspamd_lru_hash_lookup(spf_lib_ctx->spf_hash, cred->domain, + task->task_timestamp); + + if (cached) { + cached->flags |= RSPAMD_SPF_FLAG_CACHED; + + if (cached->top_record) { + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_SPF_RECORD, + rspamd_mempool_strdup(task->task_pool, + cached->top_record), + NULL); + } + callback(cached, task, cbdata); + + return TRUE; + } + } + + + rec = rspamd_mempool_alloc0(task->task_pool, sizeof(struct spf_record)); + rec->task = task; + rec->callback = callback; + rec->cbdata = cbdata; + + rec->resolved = g_ptr_array_sized_new(8); + + /* Add destructor */ + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) spf_record_destructor, + rec); + + /* Extract from data */ + rec->sender = cred->sender; + rec->local_part = cred->local_part; + rec->sender_domain = cred->domain; + + if (rspamd_dns_resolver_request_task_forced(task, + spf_dns_callback, + (void *) rec, RDNS_REQUEST_TXT, rec->sender_domain)) { + rec->requests_inflight++; + return TRUE; + } + + return FALSE; +} + +struct spf_resolved * +_spf_record_ref(struct spf_resolved *flat, const gchar *loc) +{ + REF_RETAIN(flat); + return flat; +} + +void 
_spf_record_unref(struct spf_resolved *flat, const gchar *loc) +{ + REF_RELEASE(flat); +} + +gchar * +spf_addr_mask_to_string(struct spf_addr *addr) +{ + GString *res; + gchar *s, ipstr[INET6_ADDRSTRLEN + 1]; + + if (addr->flags & RSPAMD_SPF_FLAG_ANY) { + res = g_string_new("any"); + } + else if (addr->flags & RSPAMD_SPF_FLAG_IPV4) { + (void) inet_ntop(AF_INET, addr->addr4, ipstr, sizeof(ipstr)); + res = g_string_sized_new(sizeof(ipstr)); + rspamd_printf_gstring(res, "%s/%d", ipstr, addr->m.dual.mask_v4); + } + else if (addr->flags & RSPAMD_SPF_FLAG_IPV6) { + (void) inet_ntop(AF_INET6, addr->addr6, ipstr, sizeof(ipstr)); + res = g_string_sized_new(sizeof(ipstr)); + rspamd_printf_gstring(res, "%s/%d", ipstr, addr->m.dual.mask_v6); + } + else { + res = g_string_new(NULL); + rspamd_printf_gstring(res, "unknown, flags = %d", addr->flags); + } + + s = res->str; + g_string_free(res, FALSE); + + + return s; +} + +struct spf_addr * +spf_addr_match_task(struct rspamd_task *task, struct spf_resolved *rec) +{ + const guint8 *s, *d; + guint af, mask, bmask, addrlen; + struct spf_addr *selected = NULL, *addr, *any_addr = NULL; + guint i; + + if (task->from_addr == NULL) { + return FALSE; + } + + for (i = 0; i < rec->elts->len; i++) { + addr = &g_array_index(rec->elts, struct spf_addr, i); + if (addr->flags & RSPAMD_SPF_FLAG_TEMPFAIL) { + continue; + } + + af = rspamd_inet_address_get_af(task->from_addr); + /* Basic comparing algorithm */ + if (((addr->flags & RSPAMD_SPF_FLAG_IPV6) && af == AF_INET6) || + ((addr->flags & RSPAMD_SPF_FLAG_IPV4) && af == AF_INET)) { + d = rspamd_inet_address_get_hash_key(task->from_addr, &addrlen); + + if (af == AF_INET6) { + s = (const guint8 *) addr->addr6; + mask = addr->m.dual.mask_v6; + } + else { + s = (const guint8 *) addr->addr4; + mask = addr->m.dual.mask_v4; + } + + /* Compare the first bytes */ + bmask = mask / CHAR_BIT; + if (mask > addrlen * CHAR_BIT) { + msg_info_task("bad mask length: %d", mask); + } + else if (memcmp(s, d, bmask) == 
0) { + if (bmask * CHAR_BIT < mask) { + /* Compare the remaining bits */ + s += bmask; + d += bmask; + mask = (0xffu << (CHAR_BIT - (mask - bmask * 8u))) & 0xffu; + + if ((*s & mask) == (*d & mask)) { + selected = addr; + break; + } + } + else { + selected = addr; + break; + } + } + } + else { + if (addr->flags & RSPAMD_SPF_FLAG_ANY) { + any_addr = addr; + } + } + } + + if (selected) { + return selected; + } + + return any_addr; +}
\ No newline at end of file diff --git a/src/libserver/spf.h b/src/libserver/spf.h new file mode 100644 index 0000000..871ed29 --- /dev/null +++ b/src/libserver/spf.h @@ -0,0 +1,159 @@ +#ifndef RSPAMD_SPF_H +#define RSPAMD_SPF_H + +#include "config.h" +#include "ref.h" +#include "addr.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct spf_resolved; + +typedef void (*spf_cb_t)(struct spf_resolved *record, + struct rspamd_task *task, gpointer cbdata); + +typedef enum spf_mech_e { + SPF_FAIL, + SPF_SOFT_FAIL, + SPF_PASS, + SPF_NEUTRAL +} spf_mech_t; + +static inline gchar spf_mech_char(spf_mech_t mech) +{ + switch (mech) { + case SPF_FAIL: + return '-'; + case SPF_SOFT_FAIL: + return '~'; + case SPF_PASS: + return '+'; + case SPF_NEUTRAL: + default: + return '?'; + } +} + +typedef enum spf_action_e { + SPF_RESOLVE_MX, + SPF_RESOLVE_A, + SPF_RESOLVE_PTR, + SPF_RESOLVE_AAA, + SPF_RESOLVE_REDIRECT, + SPF_RESOLVE_INCLUDE, + SPF_RESOLVE_EXISTS, + SPF_RESOLVE_EXP +} spf_action_t; + +#define RSPAMD_SPF_FLAG_IPV6 (1u << 0u) +#define RSPAMD_SPF_FLAG_IPV4 (1u << 1u) +#define RSPAMD_SPF_FLAG_PROCESSED (1u << 2u) +#define RSPAMD_SPF_FLAG_ANY (1u << 3u) +#define RSPAMD_SPF_FLAG_PARSED (1u << 4u) +#define RSPAMD_SPF_FLAG_INVALID (1u << 5u) +#define RSPAMD_SPF_FLAG_REFERENCE (1u << 6u) +#define RSPAMD_SPF_FLAG_REDIRECT (1u << 7u) +#define RSPAMD_SPF_FLAG_TEMPFAIL (1u << 8u) +#define RSPAMD_SPF_FLAG_NA (1u << 9u) +#define RSPAMD_SPF_FLAG_PERMFAIL (1u << 10u) +#define RSPAMD_SPF_FLAG_RESOLVED (1u << 11u) +#define RSPAMD_SPF_FLAG_CACHED (1u << 12u) + +/** Default SPF limits for avoiding abuse **/ +#define SPF_MAX_NESTING 10 +#define SPF_MAX_DNS_REQUESTS 30 +#define SPF_MIN_CACHE_TTL (60 * 5) /* 5 minutes */ + +struct spf_addr { + guchar addr6[sizeof(struct in6_addr)]; + guchar addr4[sizeof(struct in_addr)]; + union { + struct { + guint16 mask_v4; + guint16 mask_v6; + } dual; + guint32 idx; + } m; + guint flags; + spf_mech_t mech; + gchar *spf_string; + struct 
spf_addr *prev, *next; +}; + +enum rspamd_spf_resolved_flags { + RSPAMD_SPF_RESOLVED_NORMAL = 0, + RSPAMD_SPF_RESOLVED_TEMP_FAILED = (1u << 0u), + RSPAMD_SPF_RESOLVED_PERM_FAILED = (1u << 1u), + RSPAMD_SPF_RESOLVED_NA = (1u << 2u), +}; + +struct spf_resolved { + gchar *domain; + gchar *top_record; + guint ttl; + gint flags; + gdouble timestamp; + guint64 digest; + GArray *elts; /* Flat list of struct spf_addr */ + ref_entry_t ref; /* Refcounting */ +}; + +struct rspamd_spf_cred { + gchar *local_part; + gchar *domain; + gchar *sender; +}; + +/* + * Resolve spf record for specified task and call a callback after resolution fails/succeed + */ +gboolean rspamd_spf_resolve(struct rspamd_task *task, + spf_cb_t callback, + gpointer cbdata, + struct rspamd_spf_cred *cred); + +/* + * Get a domain for spf for specified task + */ +const gchar *rspamd_spf_get_domain(struct rspamd_task *task); + +struct rspamd_spf_cred *rspamd_spf_get_cred(struct rspamd_task *task); +/* + * Increase refcount + */ +struct spf_resolved *_spf_record_ref(struct spf_resolved *rec, const gchar *loc); +#define spf_record_ref(rec) \ + _spf_record_ref((rec), G_STRLOC) +/* + * Decrease refcount + */ +void _spf_record_unref(struct spf_resolved *rec, const gchar *loc); +#define spf_record_unref(rec) \ + _spf_record_unref((rec), G_STRLOC) + +/** + * Prints address + mask in a freshly allocated string (must be freed) + * @param addr + * @return + */ +gchar *spf_addr_mask_to_string(struct spf_addr *addr); + +/** + * Returns spf address that matches the specific task (or nil if not matched) + * @param task + * @param rec + * @return + */ +struct spf_addr *spf_addr_match_task(struct rspamd_task *task, + struct spf_resolved *rec); + +void spf_library_config(const ucl_object_t *obj); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/ssl_util.c b/src/libserver/ssl_util.c new file mode 100644 index 0000000..8ee53b0 --- /dev/null +++ b/src/libserver/ssl_util.c @@ -0,0 +1,1133 @@ +/*- + * 
Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "libutil/util.h" +#include "libutil/hash.h" +#include "libserver/logger.h" +#include "libserver/cfg_file.h" +#include "ssl_util.h" +#include "unix-std.h" +#include "cryptobox.h" +#include "contrib/libottery/ottery.h" + +#include <openssl/ssl.h> +#include <openssl/err.h> +#include <openssl/rand.h> +#include <openssl/conf.h> +#include <openssl/evp.h> +#include <openssl/engine.h> +#include <openssl/x509v3.h> + +enum rspamd_ssl_state { + ssl_conn_reset = 0, + ssl_conn_init, + ssl_conn_connected, + ssl_next_read, + ssl_next_write, + ssl_next_shutdown, +}; + +enum rspamd_ssl_shutdown { + ssl_shut_default = 0, + ssl_shut_unclean, +}; + +struct rspamd_ssl_ctx { + SSL_CTX *s; + rspamd_lru_hash_t *sessions; +}; + +struct rspamd_ssl_connection { + gint fd; + enum rspamd_ssl_state state; + enum rspamd_ssl_shutdown shut; + gboolean verify_peer; + SSL *ssl; + struct rspamd_ssl_ctx *ssl_ctx; + gchar *hostname; + struct rspamd_io_ev *ev; + struct rspamd_io_ev *shut_ev; + struct ev_loop *event_loop; + rspamd_ssl_handler_t handler; + rspamd_ssl_error_handler_t err_handler; + gpointer handler_data; + gchar log_tag[8]; +}; + +#define msg_debug_ssl(...) 
rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_ssl_log_id, "ssl", conn->log_tag, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +static void rspamd_ssl_event_handler(gint fd, short what, gpointer ud); + +INIT_LOG_MODULE(ssl) + +static GQuark +rspamd_ssl_quark(void) +{ + return g_quark_from_static_string("rspamd-ssl"); +} + +#if (OPENSSL_VERSION_NUMBER >= 0x10100000L) && !defined(LIBRESSL_VERSION_NUMBER) +#ifndef X509_get_notBefore +#define X509_get_notBefore(x) X509_get0_notBefore(x) +#endif +#ifndef X509_get_notAfter +#define X509_get_notAfter(x) X509_get0_notAfter(x) +#endif +#ifndef ASN1_STRING_data +#define ASN1_STRING_data(x) ASN1_STRING_get0_data(x) +#endif +#endif + +/* $OpenBSD: tls_verify.c,v 1.14 2015/09/29 10:17:04 deraadt Exp $ */ +/* + * Copyright (c) 2014 Jeremie Courreges-Anglas <jca@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +static gboolean +rspamd_tls_match_name(const char *cert_name, const char *name) +{ + const char *cert_domain, *domain, *next_dot; + + if (g_ascii_strcasecmp(cert_name, name) == 0) { + return TRUE; + } + + /* Wildcard match? */ + if (cert_name[0] == '*') { + /* + * Valid wildcards: + * - "*.domain.tld" + * - "*.sub.domain.tld" + * - etc. + * Reject "*.tld". + * No attempt to prevent the use of eg. "*.co.uk". 
+ */ + cert_domain = &cert_name[1]; + /* Disallow "*" */ + if (cert_domain[0] == '\0') { + return FALSE; + } + + /* Disallow "*foo" */ + if (cert_domain[0] != '.') { + return FALSE; + } + /* Disallow "*.." */ + if (cert_domain[1] == '.') { + return FALSE; + } + next_dot = strchr(&cert_domain[1], '.'); + /* Disallow "*.bar" */ + if (next_dot == NULL) { + return FALSE; + } + /* Disallow "*.bar.." */ + if (next_dot[1] == '.') { + return FALSE; + } + + domain = strchr(name, '.'); + + /* No wildcard match against a name with no host part. */ + if (name[0] == '.') { + return FALSE; + } + /* No wildcard match against a name with no domain part. */ + if (domain == NULL || strlen(domain) == 1) { + return FALSE; + } + + if (g_ascii_strcasecmp(cert_domain, domain) == 0) { + return TRUE; + } + } + + return FALSE; +} + +/* See RFC 5280 section 4.2.1.6 for SubjectAltName details. */ +static gboolean +rspamd_tls_check_subject_altname(X509 *cert, const char *name) +{ + STACK_OF(GENERAL_NAME) *altname_stack = NULL; + int addrlen, type; + int count, i; + union { + struct in_addr ip4; + struct in6_addr ip6; + } addrbuf; + gboolean ret = FALSE; + + altname_stack = X509_get_ext_d2i(cert, NID_subject_alt_name, NULL, NULL); + + if (altname_stack == NULL) { + return FALSE; + } + + if (inet_pton(AF_INET, name, &addrbuf) == 1) { + type = GEN_IPADD; + addrlen = 4; + } + else if (inet_pton(AF_INET6, name, &addrbuf) == 1) { + type = GEN_IPADD; + addrlen = 16; + } + else { + type = GEN_DNS; + addrlen = 0; + } + + count = sk_GENERAL_NAME_num(altname_stack); + + for (i = 0; i < count; i++) { + GENERAL_NAME *altname; + + altname = sk_GENERAL_NAME_value(altname_stack, i); + + if (altname->type != type) { + continue; + } + + if (type == GEN_DNS) { + const char *data; + int format, len; + + format = ASN1_STRING_type(altname->d.dNSName); + + if (format == V_ASN1_IA5STRING) { + data = (const char *) ASN1_STRING_data(altname->d.dNSName); + len = ASN1_STRING_length(altname->d.dNSName); + + if (len < 0 || 
len != (gint) strlen(data)) { + ret = FALSE; + break; + } + + /* + * Per RFC 5280 section 4.2.1.6: + * " " is a legal domain name, but that + * dNSName must be rejected. + */ + if (strcmp(data, " ") == 0) { + ret = FALSE; + break; + } + + if (rspamd_tls_match_name(data, name)) { + ret = TRUE; + break; + } + } + } + else if (type == GEN_IPADD) { + const char *data; + int datalen; + + datalen = ASN1_STRING_length(altname->d.iPAddress); + data = (const char *) ASN1_STRING_data(altname->d.iPAddress); + + if (datalen < 0) { + ret = FALSE; + break; + } + + /* + * Per RFC 5280 section 4.2.1.6: + * IPv4 must use 4 octets and IPv6 must use 16 octets. + */ + if (datalen == addrlen && memcmp(data, &addrbuf, addrlen) == 0) { + ret = TRUE; + break; + } + } + } + + sk_GENERAL_NAME_pop_free(altname_stack, GENERAL_NAME_free); + return ret; +} + +static gboolean +rspamd_tls_check_common_name(X509 *cert, const char *name) +{ + X509_NAME *subject_name; + char *common_name = NULL; + union { + struct in_addr ip4; + struct in6_addr ip6; + } addrbuf; + int common_name_len; + gboolean ret = FALSE; + + subject_name = X509_get_subject_name(cert); + if (subject_name == NULL) { + goto out; + } + + common_name_len = X509_NAME_get_text_by_NID(subject_name, NID_commonName, NULL, 0); + + if (common_name_len < 0) { + goto out; + } + + common_name = g_malloc0(common_name_len + 1); + X509_NAME_get_text_by_NID(subject_name, NID_commonName, common_name, + common_name_len + 1); + + /* NUL bytes in CN? */ + if (common_name_len != (gint) strlen(common_name)) { + goto out; + } + + if (inet_pton(AF_INET, name, &addrbuf) == 1 || inet_pton(AF_INET6, name, &addrbuf) == 1) { + /* + * We don't want to attempt wildcard matching against IP + * addresses, so perform a simple comparison here. 
+ */ + if (strcmp(common_name, name) == 0) { + ret = TRUE; + } + else { + ret = FALSE; + } + + goto out; + } + + if (rspamd_tls_match_name(common_name, name)) { + ret = TRUE; + } + +out: + g_free(common_name); + + return ret; +} + +static gboolean +rspamd_tls_check_name(X509 *cert, const char *name) +{ + gboolean ret; + + ret = rspamd_tls_check_subject_altname(cert, name); + if (ret) { + return ret; + } + + return rspamd_tls_check_common_name(cert, name); +} + +static gboolean +rspamd_ssl_peer_verify(struct rspamd_ssl_connection *c) +{ + X509 *server_cert; + glong ver_err; + GError *err = NULL; + + ver_err = SSL_get_verify_result(c->ssl); + + if (ver_err != X509_V_OK) { + g_set_error(&err, rspamd_ssl_quark(), 400, "certificate validation " + "failed: %s", + X509_verify_cert_error_string(ver_err)); + c->err_handler(c->handler_data, err); + g_error_free(err); + + return FALSE; + } + + /* Get server's certificate */ + server_cert = SSL_get_peer_certificate(c->ssl); + if (server_cert == NULL) { + g_set_error(&err, rspamd_ssl_quark(), 401, "peer certificate is absent"); + c->err_handler(c->handler_data, err); + g_error_free(err); + + return FALSE; + } + + if (c->hostname) { + if (!rspamd_tls_check_name(server_cert, c->hostname)) { + X509_free(server_cert); + g_set_error(&err, rspamd_ssl_quark(), 403, "peer certificate fails " + "hostname verification for %s", + c->hostname); + c->err_handler(c->handler_data, err); + g_error_free(err); + + return FALSE; + } + } + + X509_free(server_cert); + + return TRUE; +} + +static void +rspamd_tls_set_error(gint retcode, const gchar *stage, GError **err) +{ + GString *reason; + gchar buf[120]; + gint err_code = 0; + + reason = g_string_sized_new(sizeof(buf)); + + if (retcode == SSL_ERROR_SYSCALL) { + rspamd_printf_gstring(reason, "syscall fail: %s", strerror(errno)); + err_code = 500; + } + else { + while ((err_code = ERR_get_error()) != 0) { + ERR_error_string(err_code, buf); + rspamd_printf_gstring(reason, "ssl error: %s,", buf); + 
} + + err_code = 400; + + if (reason->len > 0 && reason->str[reason->len - 1] == ',') { + reason->str[reason->len - 1] = '\0'; + reason->len--; + } + } + + g_set_error(err, rspamd_ssl_quark(), err_code, + "ssl %s error: %s", stage, reason->str); + g_string_free(reason, TRUE); +} + +static void +rspamd_ssl_connection_dtor(struct rspamd_ssl_connection *conn) +{ + msg_debug_ssl("closing SSL connection %p; %d sessions in the cache", + conn->ssl, rspamd_lru_hash_size(conn->ssl_ctx->sessions)); + SSL_free(conn->ssl); + + if (conn->hostname) { + g_free(conn->hostname); + } + + /* + * Try to workaround for the race between timeout and ssl error + */ + if (conn->shut_ev != conn->ev && ev_can_stop(&conn->ev->tm)) { + rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + } + + if (conn->shut_ev) { + rspamd_ev_watcher_stop(conn->event_loop, conn->shut_ev); + g_free(conn->shut_ev); + } + + close(conn->fd); + g_free(conn); +} + +static void +rspamd_ssl_shutdown(struct rspamd_ssl_connection *conn) +{ + gint ret = 0, nret, retries; + static const gint max_retries = 5; + + /* + * Fucking openssl... + * From the manual, 0 means: "The shutdown is not yet finished. + * Call SSL_shutdown() for a second time, + * if a bidirectional shutdown shall be performed. + * The output of SSL_get_error(3) may be misleading, + * as an erroneous SSL_ERROR_SYSCALL may be flagged + * even though no error occurred." + * + * What is `second`, what if `second` also returns 0? + * What a retarded behaviour! 
+ */ + for (retries = 0; retries < max_retries; retries++) { + ret = SSL_shutdown(conn->ssl); + + if (ret != 0) { + break; + } + } + + if (ret == 1) { + /* All done */ + msg_debug_ssl("ssl shutdown: all done"); + rspamd_ssl_connection_dtor(conn); + } + else if (ret < 0) { + short what; + + nret = SSL_get_error(conn->ssl, ret); + conn->state = ssl_next_shutdown; + + if (nret == SSL_ERROR_WANT_READ) { + msg_debug_ssl("ssl shutdown: need read"); + what = EV_READ; + } + else if (nret == SSL_ERROR_WANT_WRITE) { + msg_debug_ssl("ssl shutdown: need write"); + what = EV_WRITE; + } + else { + /* Cannot do anything else, fatal error */ + GError *err = NULL; + + rspamd_tls_set_error(nret, "final shutdown", &err); + msg_debug_ssl("ssl shutdown: fatal error: %e; retries=%d; ret=%d", + err, retries, ret); + g_error_free(err); + rspamd_ssl_connection_dtor(conn); + + return; + } + + /* As we own fd, we can try to perform shutdown one more time */ + /* BUGON: but we DO NOT own conn->ev, and it's a big issue */ + static const ev_tstamp shutdown_time = 5.0; + + if (conn->shut_ev == NULL) { + rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + conn->shut_ev = g_malloc0(sizeof(*conn->shut_ev)); + rspamd_ev_watcher_init(conn->shut_ev, conn->fd, what, + rspamd_ssl_event_handler, conn); + rspamd_ev_watcher_start(conn->event_loop, conn->shut_ev, shutdown_time); + /* XXX: can it be done safely ? */ + conn->ev = conn->shut_ev; + } + else { + rspamd_ev_watcher_reschedule(conn->event_loop, conn->shut_ev, what); + } + + conn->state = ssl_next_shutdown; + } + else if (ret == 0) { + /* What can we do here?? 
*/ + msg_debug_ssl("ssl shutdown: openssl failed to initiate shutdown after " + "%d attempts!", + max_retries); + rspamd_ssl_connection_dtor(conn); + } +} + +static void +rspamd_ssl_event_handler(gint fd, short what, gpointer ud) +{ + struct rspamd_ssl_connection *conn = ud; + gint ret; + GError *err = NULL; + + if (what == EV_TIMER) { + if (conn->state == ssl_next_shutdown) { + /* No way to restore, just terminate */ + rspamd_ssl_connection_dtor(conn); + } + else { + conn->shut = ssl_shut_unclean; + rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + g_set_error(&err, rspamd_ssl_quark(), 408, + "ssl connection timed out"); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + } + + return; + } + + msg_debug_ssl("ssl event; what=%d; c->state=%d", (int) what, + (int) conn->state); + + switch (conn->state) { + case ssl_conn_init: + /* Continue connection */ + ret = SSL_connect(conn->ssl); + + if (ret == 1) { + rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + /* Verify certificate */ + if ((!conn->verify_peer) || rspamd_ssl_peer_verify(conn)) { + msg_debug_ssl("ssl connect: connected"); + conn->state = ssl_conn_connected; + conn->handler(fd, EV_WRITE, conn->handler_data); + } + else { + return; + } + } + else { + ret = SSL_get_error(conn->ssl, ret); + + if (ret == SSL_ERROR_WANT_READ) { + msg_debug_ssl("ssl connect: need read"); + what = EV_READ; + } + else if (ret == SSL_ERROR_WANT_WRITE) { + msg_debug_ssl("ssl connect: need write"); + what = EV_WRITE; + } + else { + rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + rspamd_tls_set_error(ret, "connect", &err); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + return; + } + + rspamd_ev_watcher_reschedule(conn->event_loop, conn->ev, what); + } + break; + case ssl_next_read: + rspamd_ev_watcher_reschedule(conn->event_loop, conn->ev, EV_READ); + conn->state = ssl_conn_connected; + conn->handler(fd, EV_READ, conn->handler_data); + break; + case ssl_next_write: + 
rspamd_ev_watcher_reschedule(conn->event_loop, conn->ev, EV_WRITE); + conn->state = ssl_conn_connected; + conn->handler(fd, EV_WRITE, conn->handler_data); + break; + case ssl_conn_connected: + rspamd_ev_watcher_reschedule(conn->event_loop, conn->ev, what); + conn->state = ssl_conn_connected; + conn->handler(fd, what, conn->handler_data); + break; + case ssl_next_shutdown: + rspamd_ssl_shutdown(conn); + break; + default: + rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + g_set_error(&err, rspamd_ssl_quark(), 500, + "ssl bad state error: %d", conn->state); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + break; + } +} + +struct rspamd_ssl_connection * +rspamd_ssl_connection_new(gpointer ssl_ctx, struct ev_loop *ev_base, + gboolean verify_peer, const gchar *log_tag) +{ + struct rspamd_ssl_connection *conn; + struct rspamd_ssl_ctx *ctx = (struct rspamd_ssl_ctx *) ssl_ctx; + + g_assert(ssl_ctx != NULL); + conn = g_malloc0(sizeof(*conn)); + conn->ssl_ctx = ctx; + conn->event_loop = ev_base; + conn->verify_peer = verify_peer; + + if (log_tag) { + rspamd_strlcpy(conn->log_tag, log_tag, sizeof(conn->log_tag)); + } + else { + rspamd_random_hex(conn->log_tag, sizeof(log_tag) - 1); + conn->log_tag[sizeof(log_tag) - 1] = '\0'; + } + + return conn; +} + + +gboolean +rspamd_ssl_connect_fd(struct rspamd_ssl_connection *conn, gint fd, + const gchar *hostname, struct rspamd_io_ev *ev, ev_tstamp timeout, + rspamd_ssl_handler_t handler, rspamd_ssl_error_handler_t err_handler, + gpointer handler_data) +{ + gint ret; + SSL_SESSION *session = NULL; + + g_assert(conn != NULL); + + /* Ensure that we start from the empty SSL errors stack */ + ERR_clear_error(); + conn->ssl = SSL_new(conn->ssl_ctx->s); + + if (hostname) { + session = rspamd_lru_hash_lookup(conn->ssl_ctx->sessions, hostname, + ev_now(conn->event_loop)); + } + + if (session) { + SSL_set_session(conn->ssl, session); + } + + SSL_set_app_data(conn->ssl, conn); + msg_debug_ssl("new ssl connection %p; 
session reused=%s", + conn->ssl, SSL_session_reused(conn->ssl) ? "true" : "false"); + + if (conn->state != ssl_conn_reset) { + return FALSE; + } + + /* We dup fd to allow graceful closing */ + gint nfd = dup(fd); + + if (nfd == -1) { + return FALSE; + } + + conn->fd = nfd; + conn->ev = ev; + conn->handler = handler; + conn->err_handler = err_handler; + conn->handler_data = handler_data; + + if (SSL_set_fd(conn->ssl, conn->fd) != 1) { + close(conn->fd); + + return FALSE; + } + + if (hostname) { + conn->hostname = g_strdup(hostname); +#ifdef HAVE_SSL_TLSEXT_HOSTNAME + SSL_set_tlsext_host_name(conn->ssl, conn->hostname); +#endif + } + + conn->state = ssl_conn_init; + + ret = SSL_connect(conn->ssl); + + if (ret == 1) { + conn->state = ssl_conn_connected; + + msg_debug_ssl("connected, start write event"); + rspamd_ev_watcher_stop(conn->event_loop, ev); + rspamd_ev_watcher_init(ev, nfd, EV_WRITE, rspamd_ssl_event_handler, conn); + rspamd_ev_watcher_start(conn->event_loop, ev, timeout); + } + else { + ret = SSL_get_error(conn->ssl, ret); + + if (ret == SSL_ERROR_WANT_READ) { + msg_debug_ssl("not connected, want read"); + } + else if (ret == SSL_ERROR_WANT_WRITE) { + msg_debug_ssl("not connected, want write"); + } + else { + GError *err = NULL; + + conn->shut = ssl_shut_unclean; + rspamd_tls_set_error(ret, "initial connect", &err); + msg_debug_ssl("not connected, fatal error %e", err); + g_error_free(err); + + + return FALSE; + } + + rspamd_ev_watcher_stop(conn->event_loop, ev); + rspamd_ev_watcher_init(ev, nfd, EV_WRITE | EV_READ, + rspamd_ssl_event_handler, conn); + rspamd_ev_watcher_start(conn->event_loop, ev, timeout); + } + + return TRUE; +} + +void rspamd_ssl_connection_restore_handlers(struct rspamd_ssl_connection *conn, + rspamd_ssl_handler_t handler, + rspamd_ssl_error_handler_t err_handler, + gpointer handler_data, + short ev_what) +{ + conn->handler = handler; + conn->err_handler = err_handler; + conn->handler_data = handler_data; + + 
rspamd_ev_watcher_stop(conn->event_loop, conn->ev); + rspamd_ev_watcher_init(conn->ev, conn->fd, ev_what, rspamd_ssl_event_handler, conn); + rspamd_ev_watcher_start(conn->event_loop, conn->ev, conn->ev->timeout); +} + +gssize +rspamd_ssl_read(struct rspamd_ssl_connection *conn, gpointer buf, + gsize buflen) +{ + gint ret; + short what; + GError *err = NULL; + + g_assert(conn != NULL); + + if (conn->state != ssl_conn_connected && conn->state != ssl_next_read) { + errno = EINVAL; + g_set_error(&err, rspamd_ssl_quark(), 400, + "ssl state error: cannot read data"); + conn->shut = ssl_shut_unclean; + conn->err_handler(conn->handler_data, err); + g_error_free(err); + + return -1; + } + + ret = SSL_read(conn->ssl, buf, buflen); + msg_debug_ssl("ssl read: %d", ret); + + if (ret > 0) { + conn->state = ssl_conn_connected; + return ret; + } + else if (ret == 0) { + ret = SSL_get_error(conn->ssl, ret); + + if (ret == SSL_ERROR_ZERO_RETURN || ret == SSL_ERROR_SYSCALL) { + conn->state = ssl_conn_reset; + return 0; + } + else { + conn->shut = ssl_shut_unclean; + rspamd_tls_set_error(ret, "read", &err); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + errno = EINVAL; + + return -1; + } + } + else { + ret = SSL_get_error(conn->ssl, ret); + conn->state = ssl_next_read; + what = 0; + + if (ret == SSL_ERROR_WANT_READ) { + msg_debug_ssl("ssl read: need read"); + what |= EV_READ; + } + else if (ret == SSL_ERROR_WANT_WRITE) { + msg_debug_ssl("ssl read: need write"); + what |= EV_WRITE; + } + else { + conn->shut = ssl_shut_unclean; + rspamd_tls_set_error(ret, "read", &err); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + errno = EINVAL; + + return -1; + } + + rspamd_ev_watcher_reschedule(conn->event_loop, conn->ev, what); + errno = EAGAIN; + } + + return -1; +} + +gssize +rspamd_ssl_write(struct rspamd_ssl_connection *conn, gconstpointer buf, + gsize buflen) +{ + gint ret; + short what; + GError *err = NULL; + + g_assert(conn != NULL); + + if 
(conn->state != ssl_conn_connected && conn->state != ssl_next_write) { + errno = EINVAL; + return -1; + } + + ret = SSL_write(conn->ssl, buf, buflen); + msg_debug_ssl("ssl write: ret=%d, buflen=%z", ret, buflen); + + if (ret > 0) { + conn->state = ssl_conn_connected; + return ret; + } + else if (ret == 0) { + ret = SSL_get_error(conn->ssl, ret); + + if (ret == SSL_ERROR_ZERO_RETURN) { + rspamd_tls_set_error(ret, "write", &err); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + errno = ECONNRESET; + conn->state = ssl_conn_reset; + + return -1; + } + else { + conn->shut = ssl_shut_unclean; + rspamd_tls_set_error(ret, "write", &err); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + errno = EINVAL; + + return -1; + } + } + else { + ret = SSL_get_error(conn->ssl, ret); + conn->state = ssl_next_write; + + if (ret == SSL_ERROR_WANT_READ) { + msg_debug_ssl("ssl write: need read"); + what = EV_READ; + } + else if (ret == SSL_ERROR_WANT_WRITE) { + msg_debug_ssl("ssl write: need write"); + what = EV_WRITE; + } + else { + conn->shut = ssl_shut_unclean; + rspamd_tls_set_error(ret, "write", &err); + conn->err_handler(conn->handler_data, err); + g_error_free(err); + errno = EINVAL; + + return -1; + } + + rspamd_ev_watcher_reschedule(conn->event_loop, conn->ev, what); + errno = EAGAIN; + } + + return -1; +} + +gssize +rspamd_ssl_writev(struct rspamd_ssl_connection *conn, struct iovec *iov, + gsize iovlen) +{ + /* + * Static is needed to avoid issue: + * https://github.com/openssl/openssl/issues/6865 + */ + static guchar ssl_buf[16384]; + guchar *p; + struct iovec *cur; + gsize i, remain; + + remain = sizeof(ssl_buf); + p = ssl_buf; + + for (i = 0; i < iovlen; i++) { + cur = &iov[i]; + + if (cur->iov_len > 0) { + if (remain >= cur->iov_len) { + memcpy(p, cur->iov_base, cur->iov_len); + p += cur->iov_len; + remain -= cur->iov_len; + } + else { + memcpy(p, cur->iov_base, remain); + p += remain; + remain = 0; + break; + } + } + } + + return 
rspamd_ssl_write(conn, ssl_buf, p - ssl_buf); +} + +/** + * Removes connection data + * @param conn + */ +void rspamd_ssl_connection_free(struct rspamd_ssl_connection *conn) +{ + if (conn) { + if (conn->shut == ssl_shut_unclean) { + /* Ignore return result and close socket */ + msg_debug_ssl("unclean shutdown"); + SSL_set_quiet_shutdown(conn->ssl, 1); + (void) SSL_shutdown(conn->ssl); + rspamd_ssl_connection_dtor(conn); + } + else { + msg_debug_ssl("normal shutdown"); + rspamd_ssl_shutdown(conn); + } + } +} + +static int +rspamd_ssl_new_client_session(SSL *ssl, SSL_SESSION *sess) +{ + struct rspamd_ssl_connection *conn; + + conn = SSL_get_app_data(ssl); + + if (conn->hostname) { + rspamd_lru_hash_insert(conn->ssl_ctx->sessions, + g_strdup(conn->hostname), SSL_get1_session(ssl), + ev_now(conn->event_loop), SSL_CTX_get_timeout(conn->ssl_ctx->s)); + msg_debug_ssl("saved new session for %s: %p", conn->hostname, conn); + } + + return 0; +} + +static struct rspamd_ssl_ctx * +rspamd_init_ssl_ctx_common(void) +{ + struct rspamd_ssl_ctx *ret; + SSL_CTX *ssl_ctx; + gint ssl_options; + static const guint client_cache_size = 1024; + + rspamd_openssl_maybe_init(); + + ret = g_malloc0(sizeof(*ret)); + ssl_options = SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3; + ssl_ctx = SSL_CTX_new(SSLv23_method()); + +#ifdef SSL_OP_NO_COMPRESSION + ssl_options |= SSL_OP_NO_COMPRESSION; +#elif OPENSSL_VERSION_NUMBER >= 0x00908000L + sk_SSL_COMP_zero(SSL_COMP_get_compression_methods()); +#endif + + SSL_CTX_set_options(ssl_ctx, ssl_options); + +#ifdef TLS1_3_VERSION + SSL_CTX_set_min_proto_version(ssl_ctx, 0); + SSL_CTX_set_max_proto_version(ssl_ctx, TLS1_3_VERSION); +#endif + +#ifdef SSL_SESS_CACHE_CLIENT + SSL_CTX_set_session_cache_mode(ssl_ctx, SSL_SESS_CACHE_CLIENT | SSL_SESS_CACHE_NO_INTERNAL_STORE); +#endif + + ret->s = ssl_ctx; + ret->sessions = rspamd_lru_hash_new_full(client_cache_size, + g_free, (GDestroyNotify) SSL_SESSION_free, rspamd_str_hash, + rspamd_str_equal); + 
SSL_CTX_set_app_data(ssl_ctx, ret); + SSL_CTX_sess_set_new_cb(ssl_ctx, rspamd_ssl_new_client_session); + + return ret; +} + +gpointer +rspamd_init_ssl_ctx(void) +{ + struct rspamd_ssl_ctx *ssl_ctx = rspamd_init_ssl_ctx_common(); + + SSL_CTX_set_verify(ssl_ctx->s, SSL_VERIFY_PEER, NULL); + SSL_CTX_set_verify_depth(ssl_ctx->s, 4); + + return ssl_ctx; +} + +gpointer rspamd_init_ssl_ctx_noverify(void) +{ + struct rspamd_ssl_ctx *ssl_ctx_noverify = rspamd_init_ssl_ctx_common(); + + SSL_CTX_set_verify(ssl_ctx_noverify->s, SSL_VERIFY_NONE, NULL); + + return ssl_ctx_noverify; +} + +void rspamd_openssl_maybe_init(void) +{ + static gboolean openssl_initialized = FALSE; + + if (!openssl_initialized) { + ERR_load_crypto_strings(); + SSL_load_error_strings(); + + OpenSSL_add_all_algorithms(); + OpenSSL_add_all_digests(); + OpenSSL_add_all_ciphers(); + +#if OPENSSL_VERSION_NUMBER >= 0x1000104fL && OPENSSL_VERSION_NUMBER < 0x30000000L && !defined(LIBRESSL_VERSION_NUMBER) + ENGINE_load_builtin_engines(); +#endif +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + SSL_library_init(); +#else + OPENSSL_init_ssl(0, NULL); +#endif + +#if OPENSSL_VERSION_NUMBER < 0x10100000L || defined(LIBRESSL_VERSION_NUMBER) + OPENSSL_config(NULL); +#endif + if (RAND_status() == 0) { + guchar seed[128]; + + /* Try to use ottery to seed rand */ + ottery_rand_bytes(seed, sizeof(seed)); + RAND_seed(seed, sizeof(seed)); + rspamd_explicit_memzero(seed, sizeof(seed)); + } + + openssl_initialized = TRUE; + } +} + +void rspamd_ssl_ctx_config(struct rspamd_config *cfg, gpointer ssl_ctx) +{ + struct rspamd_ssl_ctx *ctx = (struct rspamd_ssl_ctx *) ssl_ctx; + static const char default_secure_ciphers[] = "HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4"; + + if (cfg->ssl_ca_path) { + if (SSL_CTX_load_verify_locations(ctx->s, cfg->ssl_ca_path, + NULL) != 1) { + msg_err_config("cannot load CA certs from %s: %s", + cfg->ssl_ca_path, + ERR_error_string(ERR_get_error(), NULL)); + } + } + else { + 
msg_debug_config("ssl_ca_path is not set, using default CA path"); + SSL_CTX_set_default_verify_paths(ctx->s); + } + + if (cfg->ssl_ciphers) { + if (SSL_CTX_set_cipher_list(ctx->s, cfg->ssl_ciphers) != 1) { + msg_err_config( + "cannot set ciphers set to %s: %s; fallback to %s", + cfg->ssl_ciphers, + ERR_error_string(ERR_get_error(), NULL), + default_secure_ciphers); + /* Default settings */ + SSL_CTX_set_cipher_list(ctx->s, default_secure_ciphers); + } + } +} + +void rspamd_ssl_ctx_free(gpointer ssl_ctx) +{ + struct rspamd_ssl_ctx *ctx = (struct rspamd_ssl_ctx *) ssl_ctx; + + rspamd_lru_hash_destroy(ctx->sessions); + SSL_CTX_free(ctx->s); + g_free(ssl_ctx); +}
\ No newline at end of file diff --git a/src/libserver/ssl_util.h b/src/libserver/ssl_util.h new file mode 100644 index 0000000..cde7d47 --- /dev/null +++ b/src/libserver/ssl_util.h @@ -0,0 +1,120 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef SRC_LIBUTIL_SSL_UTIL_H_ +#define SRC_LIBUTIL_SSL_UTIL_H_ + +#include "config.h" +#include "libutil/addr.h" +#include "libutil/libev_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_ssl_connection; + +typedef void (*rspamd_ssl_handler_t)(gint fd, short what, gpointer d); + +typedef void (*rspamd_ssl_error_handler_t)(gpointer d, GError *err); + +/** + * Creates a new ssl connection data structure + * @param ssl_ctx initialized SSL_CTX structure + * @return opaque connection data + */ +struct rspamd_ssl_connection *rspamd_ssl_connection_new(gpointer ssl_ctx, + struct ev_loop *ev_base, + gboolean verify_peer, + const gchar *log_tag); + +/** + * Connects SSL session using the specified (connected) FD + * @param conn connection + * @param fd fd to use + * @param hostname hostname for SNI + * @param ev event to use + * @param tv timeout for connection + * @param handler connected session handler + * @param handler_data opaque data + * @return TRUE if a session has been connected + */ +gboolean rspamd_ssl_connect_fd(struct rspamd_ssl_connection *conn, gint fd, + const gchar *hostname, struct rspamd_io_ev *ev, ev_tstamp timeout, + 
rspamd_ssl_handler_t handler, rspamd_ssl_error_handler_t err_handler, + gpointer handler_data); + +/** + * Restores SSL handlers for the existing ssl connection (e.g. after keepalive) + * @param conn + * @param handler + * @param err_handler + * @param handler_data + */ +void rspamd_ssl_connection_restore_handlers(struct rspamd_ssl_connection *conn, + rspamd_ssl_handler_t handler, + rspamd_ssl_error_handler_t err_handler, + gpointer handler_data, + short ev_what); + +/** + * Perform async read from SSL socket + * @param conn + * @param buf + * @param buflen + * @return + */ +gssize rspamd_ssl_read(struct rspamd_ssl_connection *conn, gpointer buf, + gsize buflen); + +/** + * Perform async write to ssl buffer + * @param conn + * @param buf + * @param buflen + * @param ev + * @param tv + * @return + */ +gssize rspamd_ssl_write(struct rspamd_ssl_connection *conn, gconstpointer buf, + gsize buflen); + +/** + * Emulate writev by copying iovec to a temporary buffer + * @param conn + * @param buf + * @param buflen + * @return + */ +gssize rspamd_ssl_writev(struct rspamd_ssl_connection *conn, struct iovec *iov, + gsize iovlen); + +/** + * Removes connection data + * @param conn + */ +void rspamd_ssl_connection_free(struct rspamd_ssl_connection *conn); + +gpointer rspamd_init_ssl_ctx(void); +gpointer rspamd_init_ssl_ctx_noverify(void); +void rspamd_ssl_ctx_config(struct rspamd_config *cfg, gpointer ssl_ctx); +void rspamd_ssl_ctx_free(gpointer ssl_ctx); +void rspamd_openssl_maybe_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* SRC_LIBUTIL_SSL_UTIL_H_ */ diff --git a/src/libserver/symcache/symcache_c.cxx b/src/libserver/symcache/symcache_c.cxx new file mode 100644 index 0000000..6a7e41c --- /dev/null +++ b/src/libserver/symcache/symcache_c.cxx @@ -0,0 +1,715 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "symcache_internal.hxx" +#include "symcache_periodic.hxx" +#include "symcache_item.hxx" +#include "symcache_runtime.hxx" + +/** + * C API for symcache + */ + +#define C_API_SYMCACHE(ptr) (reinterpret_cast<rspamd::symcache::symcache *>(ptr)) +#define C_API_SYMCACHE_RUNTIME(ptr) (reinterpret_cast<rspamd::symcache::symcache_runtime *>(ptr)) +#define C_API_SYMCACHE_ITEM(ptr) (reinterpret_cast<rspamd::symcache::cache_item *>(ptr)) +#define C_API_SYMCACHE_DYN_ITEM(ptr) (reinterpret_cast<rspamd::symcache::cache_dynamic_item *>(ptr)) + +void rspamd_symcache_destroy(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + delete real_cache; +} + +struct rspamd_symcache * +rspamd_symcache_new(struct rspamd_config *cfg) +{ + auto *ncache = new rspamd::symcache::symcache(cfg); + + return (struct rspamd_symcache *) ncache; +} + +gboolean +rspamd_symcache_init(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + return real_cache->init(); +} + +void rspamd_symcache_save(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->save_items(); +} + +gint rspamd_symcache_add_symbol(struct rspamd_symcache *cache, + const gchar *name, + gint priority, + symbol_func_t func, + gpointer user_data, + int type, + gint parent) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + /* Legacy stuff */ + if (name == nullptr) { + name = ""; + } + + if (parent == -1) { + return real_cache->add_symbol_with_callback(name, priority, func, user_data, type); + } + else { + return 
real_cache->add_virtual_symbol(name, parent, type); + } +} + +bool rspamd_symcache_add_symbol_augmentation(struct rspamd_symcache *cache, + int sym_id, + const char *augmentation, + const char *value) +{ + auto *real_cache = C_API_SYMCACHE(cache); + auto log_tag = [&]() { return real_cache->log_tag(); }; + + if (augmentation == nullptr) { + msg_err_cache("null augmentation is not allowed for item %d", sym_id); + return false; + } + + + auto *item = real_cache->get_item_by_id_mut(sym_id, false); + + if (item == nullptr) { + msg_err_cache("item %d is not found", sym_id); + return false; + } + + /* Handle empty or absent strings equally */ + if (value == nullptr || value[0] == '\0') { + return item->add_augmentation(*real_cache, augmentation, std::nullopt); + } + + return item->add_augmentation(*real_cache, augmentation, value); +} + +void rspamd_symcache_set_peak_callback(struct rspamd_symcache *cache, gint cbref) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->set_peak_cb(cbref); +} + +gboolean +rspamd_symcache_add_condition_delayed(struct rspamd_symcache *cache, + const gchar *sym, lua_State *L, gint cbref) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->add_delayed_condition(sym, cbref); + + return TRUE; +} + +gint rspamd_symcache_find_symbol(struct rspamd_symcache *cache, + const gchar *name) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + /* Legacy stuff but used */ + if (name == nullptr) { + return -1; + } + + auto sym_maybe = real_cache->get_item_by_name(name, false); + + if (sym_maybe != nullptr) { + return sym_maybe->id; + } + + return -1; +} + +gboolean +rspamd_symcache_stat_symbol(struct rspamd_symcache *cache, + const gchar *name, + gdouble *frequency, + gdouble *freq_stddev, + gdouble *tm, + guint *nhits) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto sym_maybe = real_cache->get_item_by_name(name, false); + + if (sym_maybe != nullptr) { + *frequency = sym_maybe->st->avg_frequency; + *freq_stddev = 
sqrt(sym_maybe->st->stddev_frequency); + *tm = sym_maybe->st->time_counter.mean; + + if (nhits) { + *nhits = sym_maybe->st->hits; + } + + return TRUE; + } + + return FALSE; +} + + +guint rspamd_symcache_stats_symbols_count(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + return real_cache->get_stats_symbols_count(); +} + +guint64 +rspamd_symcache_get_cksum(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + return real_cache->get_cksum(); +} + +gboolean +rspamd_symcache_validate(struct rspamd_symcache *cache, + struct rspamd_config *cfg, + gboolean strict) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + return real_cache->validate(strict); +} + +ucl_object_t * +rspamd_symcache_counters(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + return real_cache->counters(); +} + +void * +rspamd_symcache_start_refresh(struct rspamd_symcache *cache, + struct ev_loop *ev_base, struct rspamd_worker *w) +{ + auto *real_cache = C_API_SYMCACHE(cache); + return new rspamd::symcache::cache_refresh_cbdata{real_cache, ev_base, w}; +} + +void rspamd_symcache_inc_frequency(struct rspamd_symcache *cache, struct rspamd_symcache_item *item, + const char *sym_name) +{ + auto *real_item = C_API_SYMCACHE_ITEM(item); + auto *real_cache = C_API_SYMCACHE(cache); + + if (real_item) { + real_item->inc_frequency(sym_name, *real_cache); + } +} + +void rspamd_symcache_add_delayed_dependency(struct rspamd_symcache *cache, + const gchar *from, const gchar *to) +{ + auto *real_cache = C_API_SYMCACHE(cache); + real_cache->add_delayed_dependency(from, to); +} + +const gchar * +rspamd_symcache_get_parent(struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto *sym = real_cache->get_item_by_name(symbol, false); + + if (sym && sym->is_virtual()) { + auto *parent = sym->get_parent(*real_cache); + + if (parent) { + return parent->get_name().c_str(); + } + } + + 
return nullptr; +} + +const gchar * +rspamd_symcache_item_name(struct rspamd_symcache_item *item) +{ + auto *real_item = C_API_SYMCACHE_ITEM(item); + + if (real_item == nullptr) { + return nullptr; + } + + return real_item->get_name().c_str(); +} + +gint rspamd_symcache_item_flags(struct rspamd_symcache_item *item) +{ + auto *real_item = C_API_SYMCACHE_ITEM(item); + + if (real_item == nullptr) { + return 0; + } + + return real_item->get_flags(); +} + + +const gchar * +rspamd_symcache_dyn_item_name(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *dyn_item) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_dyn_item = C_API_SYMCACHE_DYN_ITEM(dyn_item); + + if (cache_runtime == nullptr || real_dyn_item == nullptr) { + return nullptr; + } + + auto static_item = cache_runtime->get_item_by_dynamic_item(real_dyn_item); + + return static_item->get_name().c_str(); +} + +gint rspamd_symcache_item_flags(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *dyn_item) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_dyn_item = C_API_SYMCACHE_DYN_ITEM(dyn_item); + + if (cache_runtime == nullptr || real_dyn_item == nullptr) { + return 0; + } + + auto static_item = cache_runtime->get_item_by_dynamic_item(real_dyn_item); + + return static_item->get_flags(); +} + +guint rspamd_symcache_get_symbol_flags(struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto *sym = real_cache->get_item_by_name(symbol, false); + + if (sym) { + return sym->get_flags(); + } + + return 0; +} + +const struct rspamd_symcache_item_stat * +rspamd_symcache_item_stat(struct rspamd_symcache_item *item) +{ + auto *real_item = C_API_SYMCACHE_ITEM(item); + return real_item->st; +} + +void rspamd_symcache_get_symbol_details(struct rspamd_symcache *cache, + const gchar *symbol, + ucl_object_t *this_sym_ucl) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto 
*sym = real_cache->get_item_by_name(symbol, false); + + if (sym) { + ucl_object_insert_key(this_sym_ucl, + ucl_object_fromstring(sym->get_type_str()), + "type", strlen("type"), false); + } +} + +void rspamd_symcache_foreach(struct rspamd_symcache *cache, + void (*func)(struct rspamd_symcache_item *item, gpointer /* userdata */), + gpointer ud) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->symbols_foreach([&](const rspamd::symcache::cache_item *item) { + func((struct rspamd_symcache_item *) item, ud); + }); +} + +void rspamd_symcache_process_settings_elt(struct rspamd_symcache *cache, + struct rspamd_config_settings_elt *elt) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->process_settings_elt(elt); +} + +bool rspamd_symcache_set_allowed_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + const guint32 *ids, + guint nids) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto *item = real_cache->get_item_by_name_mut(symbol, false); + + if (item == nullptr) { + return false; + } + + item->allowed_ids.set_ids(ids, nids); + return true; +} + +bool rspamd_symcache_set_forbidden_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + const guint32 *ids, + guint nids) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto *item = real_cache->get_item_by_name_mut(symbol, false); + + if (item == nullptr) { + return false; + } + + item->forbidden_ids.set_ids(ids, nids); + return true; +} + +const guint32 * +rspamd_symcache_get_allowed_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + guint *nids) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + const auto *item = real_cache->get_item_by_name(symbol, false); + return item->allowed_ids.get_ids(*nids); +} + +const guint32 * +rspamd_symcache_get_forbidden_settings_ids(struct rspamd_symcache *cache, + const gchar *symbol, + guint *nids) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + const auto *item = real_cache->get_item_by_name(symbol, 
false); + return item->forbidden_ids.get_ids(*nids); +} + +void rspamd_symcache_disable_all_symbols(struct rspamd_task *task, + struct rspamd_symcache *_cache, + guint skip_mask) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + + cache_runtime->disable_all_symbols(skip_mask); +} + +gboolean +rspamd_symcache_disable_symbol(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_cache = C_API_SYMCACHE(cache); + + if (cache_runtime == nullptr) { + return FALSE; + } + + return cache_runtime->disable_symbol(task, *real_cache, symbol); +} + +gboolean +rspamd_symcache_enable_symbol(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_cache = C_API_SYMCACHE(cache); + + if (cache_runtime == nullptr) { + return FALSE; + } + + return cache_runtime->enable_symbol(task, *real_cache, symbol); +} + +void rspamd_symcache_disable_symbol_static(struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->disable_symbol_delayed(symbol); +} + +void rspamd_symcache_enable_symbol_static(struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + real_cache->enable_symbol_delayed(symbol); +} + +/* A real structure to match C results without extra copying */ +struct rspamd_symcache_real_timeout_result { + struct rspamd_symcache_timeout_result c_api_result; + std::vector<std::pair<double, const rspamd::symcache::cache_item *>> elts; +}; + +struct rspamd_symcache_timeout_result * +rspamd_symcache_get_max_timeout(struct rspamd_symcache *cache) +{ + auto *real_cache = C_API_SYMCACHE(cache); + auto *res = new rspamd_symcache_real_timeout_result; + + res->c_api_result.max_timeout = real_cache->get_max_timeout(res->elts); + 
res->c_api_result.items = reinterpret_cast<struct rspamd_symcache_timeout_item *>(res->elts.data()); + res->c_api_result.nitems = res->elts.size(); + + return &res->c_api_result; +} + +void rspamd_symcache_timeout_result_free(struct rspamd_symcache_timeout_result *res) +{ + auto *real_result = reinterpret_cast<rspamd_symcache_real_timeout_result *>(res); + delete real_result; +} + +gboolean +rspamd_symcache_is_checked(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_cache = C_API_SYMCACHE(cache); + + if (cache_runtime == nullptr) { + return FALSE; + } + + return cache_runtime->is_symbol_checked(*real_cache, symbol); +} + +gboolean +rspamd_symcache_process_settings(struct rspamd_task *task, + struct rspamd_symcache *cache) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_cache = C_API_SYMCACHE(cache); + + if (cache_runtime == nullptr) { + return FALSE; + } + + return cache_runtime->process_settings(task, *real_cache); +} + +gboolean +rspamd_symcache_is_item_allowed(struct rspamd_task *task, + struct rspamd_symcache_item *item, + gboolean exec_only) +{ + auto *real_item = C_API_SYMCACHE_ITEM(item); + + if (real_item == nullptr) { + return TRUE; + } + + return real_item->is_allowed(task, exec_only); +} + +gboolean +rspamd_symcache_is_symbol_enabled(struct rspamd_task *task, + struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_cache = C_API_SYMCACHE(cache); + + if (!cache_runtime) { + return TRUE; + } + + return cache_runtime->is_symbol_enabled(task, *real_cache, symbol); +} + +struct rspamd_symcache_dynamic_item * +rspamd_symcache_get_cur_item(struct rspamd_task *task) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + + if (!cache_runtime) { + return nullptr; + } + + return (struct 
rspamd_symcache_dynamic_item *) cache_runtime->get_cur_item(); +} + +struct rspamd_symcache_dynamic_item * +rspamd_symcache_set_cur_item(struct rspamd_task *task, struct rspamd_symcache_dynamic_item *item) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_dyn_item = C_API_SYMCACHE_DYN_ITEM(item); + + if (!cache_runtime || !real_dyn_item) { + return nullptr; + } + + return (struct rspamd_symcache_dynamic_item *) cache_runtime->set_cur_item(real_dyn_item); +} + +void rspamd_symcache_enable_profile(struct rspamd_task *task) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + if (!cache_runtime) { + return; + } + + cache_runtime->set_profile_mode(true); +} + +guint rspamd_symcache_item_async_inc_full(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + const gchar *subsystem, + const gchar *loc) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_dyn_item = C_API_SYMCACHE_DYN_ITEM(item); + + auto *static_item = cache_runtime->get_item_by_dynamic_item(real_dyn_item); + msg_debug_cache_task("increase async events counter for %s(%d) = %d + 1; " + "subsystem %s (%s)", + static_item->symbol.c_str(), static_item->id, + real_dyn_item->async_events, subsystem, loc); + + return ++real_dyn_item->async_events; +} + +guint rspamd_symcache_item_async_dec_full(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + const gchar *subsystem, + const gchar *loc) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_dyn_item = C_API_SYMCACHE_DYN_ITEM(item); + + auto *static_item = cache_runtime->get_item_by_dynamic_item(real_dyn_item); + msg_debug_cache_task("decrease async events counter for %s(%d) = %d - 1; " + "subsystem %s (%s)", + static_item->symbol.c_str(), static_item->id, + real_dyn_item->async_events, subsystem, loc); + + if (G_UNLIKELY(real_dyn_item->async_events == 0)) { + msg_err_cache_task("INTERNAL ERROR: 
trying decrease async events counter for %s(%d) that is already zero; " + "subsystem %s (%s)", + static_item->symbol.c_str(), static_item->id, + real_dyn_item->async_events, subsystem, loc); + g_abort(); + g_assert_not_reached(); + } + + return --real_dyn_item->async_events; +} + +gboolean +rspamd_symcache_item_async_dec_check_full(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item, + const gchar *subsystem, + const gchar *loc) +{ + if (rspamd_symcache_item_async_dec_full(task, item, subsystem, loc) == 0) { + rspamd_symcache_finalize_item(task, item); + + return TRUE; + } + + return FALSE; +} + +struct rspamd_abstract_callback_data * +rspamd_symcache_get_cbdata(struct rspamd_symcache *cache, + const gchar *symbol) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + auto *item = real_cache->get_item_by_name(symbol, true); + + if (item) { + return (struct rspamd_abstract_callback_data *) item->get_cbdata(); + } + + return nullptr; +} + +void rspamd_symcache_composites_foreach(struct rspamd_task *task, + struct rspamd_symcache *cache, + GHFunc func, + gpointer fd) +{ + auto *real_cache = C_API_SYMCACHE(cache); + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + + real_cache->composites_foreach([&](const auto *item) { + auto *dyn_item = cache_runtime->get_dynamic_item(item->id); + + if (dyn_item && !dyn_item->started) { + auto *old_item = cache_runtime->set_cur_item(dyn_item); + func((void *) item->get_name().c_str(), item->get_cbdata(), fd); + dyn_item->finished = true; + cache_runtime->set_cur_item(old_item); + } + }); + + cache_runtime->set_cur_item(nullptr); +} + +gboolean +rspamd_symcache_process_symbols(struct rspamd_task *task, + struct rspamd_symcache *cache, + guint stage) +{ + auto *real_cache = C_API_SYMCACHE(cache); + + if (task->symcache_runtime == nullptr) { + task->symcache_runtime = rspamd::symcache::symcache_runtime::create(task, *real_cache); + } + + auto *cache_runtime = 
C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + return cache_runtime->process_symbols(task, *real_cache, stage); +} + +void rspamd_symcache_finalize_item(struct rspamd_task *task, + struct rspamd_symcache_dynamic_item *item) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + auto *real_dyn_item = C_API_SYMCACHE_DYN_ITEM(item); + + cache_runtime->finalize_item(task, real_dyn_item); +} + +void rspamd_symcache_runtime_destroy(struct rspamd_task *task) +{ + auto *cache_runtime = C_API_SYMCACHE_RUNTIME(task->symcache_runtime); + cache_runtime->savepoint_dtor(); +}
\ No newline at end of file diff --git a/src/libserver/symcache/symcache_id_list.hxx b/src/libserver/symcache/symcache_id_list.hxx new file mode 100644 index 0000000..bef4fa9 --- /dev/null +++ b/src/libserver/symcache/symcache_id_list.hxx @@ -0,0 +1,95 @@ +/*- + * Copyright 2022 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_SYMCACHE_ID_LIST_HXX +#define RSPAMD_SYMCACHE_ID_LIST_HXX +#pragma once + +#include <cstdint> +#include <cstring> // for memset +#include <algorithm>// for sort/bsearch + +#include "config.h" +#include "libutil/mem_pool.h" +#include "contrib/ankerl/svector.h" + +namespace rspamd::symcache { +/* + * This structure is optimised to store ids list: + * - If the first element is -1 then use dynamic part, else use static part + * There is no std::variant to save space + */ + +constexpr const auto id_capacity = 4; +constexpr const auto id_sort_threshold = 32; + +struct id_list { + ankerl::svector<std::uint32_t, id_capacity> data; + + id_list() = default; + + auto reset() + { + data.clear(); + } + + /** + * Returns ids from a compressed list, accepting a mutable reference for number of elements + * @param nids output of the number of elements + * @return + */ + auto get_ids(unsigned &nids) const -> const std::uint32_t * + { + nids = data.size(); + + return data.data(); + } + + auto add_id(std::uint32_t id) -> void + { + data.push_back(id); + + /* Check sort threshold */ + if (data.size() > 
id_sort_threshold) { + std::sort(data.begin(), data.end()); + } + } + + auto set_ids(const std::uint32_t *ids, std::size_t nids) -> void + { + data.resize(nids); + + for (auto &id: data) { + id = *ids++; + } + + if (data.size() > id_sort_threshold) { + std::sort(data.begin(), data.end()); + } + } + + auto check_id(unsigned int id) const -> bool + { + if (data.size() > id_sort_threshold) { + return std::binary_search(data.begin(), data.end(), id); + } + return std::find(data.begin(), data.end(), id) != data.end(); + } +}; + +}// namespace rspamd::symcache + +#endif//RSPAMD_SYMCACHE_ID_LIST_HXX diff --git a/src/libserver/symcache/symcache_impl.cxx b/src/libserver/symcache/symcache_impl.cxx new file mode 100644 index 0000000..93675ac --- /dev/null +++ b/src/libserver/symcache/symcache_impl.cxx @@ -0,0 +1,1316 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "lua/lua_common.h" +#include "symcache_internal.hxx" +#include "symcache_item.hxx" +#include "symcache_runtime.hxx" +#include "unix-std.h" +#include "libutil/cxx/file_util.hxx" +#include "libutil/cxx/util.hxx" +#include "fmt/core.h" +#include "contrib/t1ha/t1ha.h" + +#ifdef __has_include +#if __has_include(<version>) +#include <version> +#endif +#endif +#include <cmath> + +namespace rspamd::symcache { + +INIT_LOG_MODULE_PUBLIC(symcache) + +auto symcache::init() -> bool +{ + auto res = true; + reload_time = cfg->cache_reload_time; + + if (cfg->cache_filename != nullptr) { + msg_debug_cache("loading symcache saved data from %s", cfg->cache_filename); + load_items(); + } + + ankerl::unordered_dense::set<int> disabled_ids; + /* Process enabled/disabled symbols */ + for (const auto &[id, it]: items_by_id) { + if (disabled_symbols) { + /* + * Due to the ability to add patterns, this is now O(N^2), but it is done + * once on configuration and the amount of static patterns is usually low + * The possible optimization is to store non patterns in a different set to check it + * quickly. However, it is unlikely that this would be used to something really heavy. 
+ */ + for (const auto &disable_pat: *disabled_symbols) { + if (disable_pat.matches(it->get_name())) { + msg_debug_cache("symbol %s matches %*s disable pattern", it->get_name().c_str(), + (int) disable_pat.to_string_view().size(), disable_pat.to_string_view().data()); + auto need_disable = true; + + if (enabled_symbols) { + for (const auto &enable_pat: *enabled_symbols) { + if (enable_pat.matches(it->get_name())) { + msg_debug_cache("symbol %s matches %*s enable pattern; skip disabling", it->get_name().c_str(), + (int) enable_pat.to_string_view().size(), enable_pat.to_string_view().data()); + need_disable = false; + break; + } + } + } + + if (need_disable) { + disabled_ids.insert(it->id); + + if (it->is_virtual()) { + auto real_elt = it->get_parent(*this); + + if (real_elt) { + disabled_ids.insert(real_elt->id); + + const auto *children = real_elt->get_children(); + if (children != nullptr) { + for (const auto &cld: *children) { + msg_debug_cache("symbol %s is a virtual sibling of the disabled symbol %s", + cld->get_name().c_str(), it->get_name().c_str()); + disabled_ids.insert(cld->id); + } + } + } + } + else { + /* Also disable all virtual children of this element */ + const auto *children = it->get_children(); + + if (children != nullptr) { + for (const auto &cld: *children) { + msg_debug_cache("symbol %s is a virtual child of the disabled symbol %s", + cld->get_name().c_str(), it->get_name().c_str()); + disabled_ids.insert(cld->id); + } + } + } + } + } + } + } + } + + /* Deal with the delayed dependencies */ + msg_debug_cache("resolving delayed dependencies: %d in list", (int) delayed_deps->size()); + for (const auto &delayed_dep: *delayed_deps) { + auto virt_item = get_item_by_name(delayed_dep.from, false); + auto real_item = get_item_by_name(delayed_dep.from, true); + + if (virt_item == nullptr || real_item == nullptr) { + msg_err_cache("cannot register delayed dependency between %s and %s: " + "%s is missing", + delayed_dep.from.data(), + 
delayed_dep.to.data(), delayed_dep.from.data()); + } + else { + + if (!disabled_ids.contains(real_item->id)) { + msg_debug_cache("delayed between %s(%d:%d) -> %s", + delayed_dep.from.data(), + real_item->id, virt_item->id, + delayed_dep.to.data()); + add_dependency(real_item->id, delayed_dep.to, + virt_item != real_item ? virt_item->id : -1); + } + else { + msg_debug_cache("no delayed between %s(%d:%d) -> %s; %s is disabled", + delayed_dep.from.data(), + real_item->id, virt_item->id, + delayed_dep.to.data(), + delayed_dep.from.data()); + } + } + } + + /* Remove delayed dependencies, as they are no longer needed at this point */ + delayed_deps.reset(); + + /* Physically remove ids that are disabled statically */ + for (auto id_to_disable: disabled_ids) { + /* + * This erasure is inefficient, we can swap the last element with the removed id + * But in this way, our ids are still sorted by addition + */ + + /* Preserve refcount here */ + auto deleted_element_refcount = items_by_id[id_to_disable]; + items_by_id.erase(id_to_disable); + items_by_symbol.erase(deleted_element_refcount->get_name()); + + auto &additional_vec = get_item_specific_vector(*deleted_element_refcount); +#if defined(__cpp_lib_erase_if) + std::erase_if(additional_vec, [id_to_disable](cache_item *elt) { + return elt->id == id_to_disable; + }); +#else + auto it = std::remove_if(additional_vec.begin(), + additional_vec.end(), [id_to_disable](cache_item *elt) { + return elt->id == id_to_disable; + }); + additional_vec.erase(it, additional_vec.end()); +#endif + + /* Refcount is dropped, so the symbol should be freed, ensure that nothing else owns this symbol */ + g_assert(deleted_element_refcount.use_count() == 1); + } + + /* Remove no longer used stuff */ + enabled_symbols.reset(); + disabled_symbols.reset(); + + /* Deal with the delayed conditions */ + msg_debug_cache("resolving delayed conditions: %d in list", (int) delayed_conditions->size()); + for (const auto &delayed_cond: *delayed_conditions) { + 
auto it = get_item_by_name_mut(delayed_cond.sym, true); + + if (it == nullptr) { + msg_err_cache( + "cannot register delayed condition for %s", + delayed_cond.sym.c_str()); + luaL_unref(delayed_cond.L, LUA_REGISTRYINDEX, delayed_cond.cbref); + } + else { + if (!it->add_condition(delayed_cond.L, delayed_cond.cbref)) { + msg_err_cache( + "cannot register delayed condition for %s: virtual parent; qed", + delayed_cond.sym.c_str()); + g_abort(); + } + + msg_debug_cache("added a condition to the symbol %s", it->symbol.c_str()); + } + } + delayed_conditions.reset(); + + msg_debug_cache("process dependencies"); + for (const auto &[_id, it]: items_by_id) { + it->process_deps(*this); + } + + /* Sorting stuff */ + constexpr auto postfilters_cmp = [](const auto &it1, const auto &it2) -> bool { + return it1->priority < it2->priority; + }; + constexpr auto prefilters_cmp = [](const auto &it1, const auto &it2) -> bool { + return it1->priority > it2->priority; + }; + + msg_debug_cache("sorting stuff"); + std::stable_sort(std::begin(connfilters), std::end(connfilters), prefilters_cmp); + std::stable_sort(std::begin(prefilters), std::end(prefilters), prefilters_cmp); + std::stable_sort(std::begin(postfilters), std::end(postfilters), postfilters_cmp); + std::stable_sort(std::begin(idempotent), std::end(idempotent), postfilters_cmp); + + resort(); + + /* Connect metric symbols with symcache symbols */ + if (cfg->symbols) { + msg_debug_cache("connect metrics"); + g_hash_table_foreach(cfg->symbols, + symcache::metric_connect_cb, + (void *) this); + } + + return res; +} + +auto symcache::load_items() -> bool +{ + auto cached_map = util::raii_mmaped_file::mmap_shared(cfg->cache_filename, + O_RDONLY, PROT_READ); + + if (!cached_map.has_value()) { + if (cached_map.error().category == util::error_category::CRITICAL) { + msg_err_cache("%s", cached_map.error().error_message.data()); + } + else { + msg_info_cache("%s", cached_map.error().error_message.data()); + } + return false; + } + + + if 
(cached_map->get_size() < (gint) sizeof(symcache_header)) { + msg_info_cache("cannot use file %s, truncated: %z", cfg->cache_filename, + errno, strerror(errno)); + return false; + } + + const auto *hdr = (struct symcache_header *) cached_map->get_map(); + + if (memcmp(hdr->magic, symcache_magic, + sizeof(symcache_magic)) != 0) { + msg_info_cache("cannot use file %s, bad magic", cfg->cache_filename); + + return false; + } + + auto *parser = ucl_parser_new(0); + const auto *p = (const std::uint8_t *) (hdr + 1); + + if (!ucl_parser_add_chunk(parser, p, cached_map->get_size() - sizeof(*hdr))) { + msg_info_cache("cannot use file %s, cannot parse: %s", cfg->cache_filename, + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + + return false; + } + + auto *top = ucl_parser_get_object(parser); + ucl_parser_free(parser); + + if (top == nullptr || ucl_object_type(top) != UCL_OBJECT) { + msg_info_cache("cannot use file %s, bad object", cfg->cache_filename); + ucl_object_unref(top); + + return false; + } + + auto it = ucl_object_iterate_new(top); + const ucl_object_t *cur; + while ((cur = ucl_object_iterate_safe(it, true)) != nullptr) { + auto item_it = items_by_symbol.find(ucl_object_key(cur)); + + if (item_it != items_by_symbol.end()) { + auto item = item_it->second; + /* Copy saved info */ + /* + * XXX: don't save or load weight, it should be obtained from the + * metric + */ +#if 0 + elt = ucl_object_lookup (cur, "weight"); + + if (elt) { + w = ucl_object_todouble (elt); + if (w != 0) { + item->weight = w; + } + } +#endif + const auto *elt = ucl_object_lookup(cur, "time"); + if (elt) { + item->st->avg_time = ucl_object_todouble(elt); + } + + elt = ucl_object_lookup(cur, "count"); + if (elt) { + item->st->total_hits = ucl_object_toint(elt); + item->last_count = item->st->total_hits; + } + + elt = ucl_object_lookup(cur, "frequency"); + if (elt && ucl_object_type(elt) == UCL_OBJECT) { + const ucl_object_t *freq_elt; + + freq_elt = ucl_object_lookup(elt, "avg"); + + if 
(freq_elt) { + item->st->avg_frequency = ucl_object_todouble(freq_elt); + } + freq_elt = ucl_object_lookup(elt, "stddev"); + + if (freq_elt) { + item->st->stddev_frequency = ucl_object_todouble(freq_elt); + } + } + + if (item->is_virtual() && !item->is_ghost()) { + const auto &parent = item->get_parent(*this); + + if (parent) { + if (parent->st->weight < item->st->weight) { + parent->st->weight = item->st->weight; + } + } + /* + * We maintain avg_time for virtual symbols equal to the + * parent item avg_time + */ + item->st->avg_time = parent->st->avg_time; + } + + total_weight += fabs(item->st->weight); + total_hits += item->st->total_hits; + } + } + + ucl_object_iterate_free(it); + ucl_object_unref(top); + + return true; +} + +template<typename T> +static constexpr auto round_to_hundreds(T x) +{ + return (::floor(x) * 100.0) / 100.0; +} + +bool symcache::save_items() const +{ + if (cfg->cache_filename == nullptr) { + return false; + } + + auto file_sink = util::raii_file_sink::create(cfg->cache_filename, + O_WRONLY | O_TRUNC, 00644); + + if (!file_sink.has_value()) { + if (errno == EEXIST) { + /* Some other process is already writing data, give up silently */ + return false; + } + + msg_err_cache("%s", file_sink.error().error_message.data()); + + return false; + } + + struct symcache_header hdr; + memset(&hdr, 0, sizeof(hdr)); + memcpy(hdr.magic, symcache_magic, sizeof(symcache_magic)); + + if (write(file_sink->get_fd(), &hdr, sizeof(hdr)) == -1) { + msg_err_cache("cannot write to file %s, error %d, %s", cfg->cache_filename, + errno, strerror(errno)); + + return false; + } + + auto *top = ucl_object_typed_new(UCL_OBJECT); + + for (const auto &it: items_by_symbol) { + auto item = it.second; + auto elt = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(elt, + ucl_object_fromdouble(round_to_hundreds(item->st->weight)), + "weight", 0, false); + ucl_object_insert_key(elt, + ucl_object_fromdouble(round_to_hundreds(item->st->time_counter.mean)), + "time", 0, 
false); + ucl_object_insert_key(elt, ucl_object_fromint(item->st->total_hits), + "count", 0, false); + + auto *freq = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(freq, + ucl_object_fromdouble(round_to_hundreds(item->st->frequency_counter.mean)), + "avg", 0, false); + ucl_object_insert_key(freq, + ucl_object_fromdouble(round_to_hundreds(item->st->frequency_counter.stddev)), + "stddev", 0, false); + ucl_object_insert_key(elt, freq, "frequency", 0, false); + + ucl_object_insert_key(top, elt, it.first.data(), 0, true); + } + + auto fp = fdopen(file_sink->get_fd(), "a"); + auto *efunc = ucl_object_emit_file_funcs(fp); + auto ret = ucl_object_emit_full(top, UCL_EMIT_JSON_COMPACT, efunc, nullptr); + ucl_object_emit_funcs_free(efunc); + ucl_object_unref(top); + fclose(fp); + + return ret; +} + +auto symcache::metric_connect_cb(void *k, void *v, void *ud) -> void +{ + auto *cache = (symcache *) ud; + const auto *sym = (const char *) k; + auto *s = (struct rspamd_symbol *) v; + auto weight = *s->weight_ptr; + auto *item = cache->get_item_by_name_mut(sym, false); + + if (item) { + item->st->weight = weight; + s->cache_item = (void *) item; + } +} + + +auto symcache::get_item_by_id(int id, bool resolve_parent) const -> const cache_item * +{ + if (id < 0 || id >= items_by_id.size()) { + msg_err_cache("internal error: requested item with id %d, when we have just %d items in the cache", + id, (int) items_by_id.size()); + return nullptr; + } + + const auto &maybe_item = rspamd::find_map(items_by_id, id); + + if (!maybe_item.has_value()) { + msg_err_cache("internal error: requested item with id %d but it is empty; qed", + id); + return nullptr; + } + + const auto &item = maybe_item.value().get(); + + if (resolve_parent && item->is_virtual()) { + return item->get_parent(*this); + } + + return item.get(); +} + +auto symcache::get_item_by_id_mut(int id, bool resolve_parent) const -> cache_item * +{ + if (id < 0 || id >= items_by_id.size()) { + msg_err_cache("internal 
error: requested item with id %d, when we have just %d items in the cache", + id, (int) items_by_id.size()); + return nullptr; + } + + const auto &maybe_item = rspamd::find_map(items_by_id, id); + + if (!maybe_item.has_value()) { + msg_err_cache("internal error: requested item with id %d but it is empty; qed", + id); + return nullptr; + } + + const auto &item = maybe_item.value().get(); + + if (resolve_parent && item->is_virtual()) { + return const_cast<cache_item *>(item->get_parent(*this)); + } + + return item.get(); +} + +auto symcache::get_item_by_name(std::string_view name, bool resolve_parent) const -> const cache_item * +{ + auto it = items_by_symbol.find(name); + + if (it == items_by_symbol.end()) { + return nullptr; + } + + if (resolve_parent && it->second->is_virtual()) { + it->second->resolve_parent(*this); + return it->second->get_parent(*this); + } + + return it->second; +} + +auto symcache::get_item_by_name_mut(std::string_view name, bool resolve_parent) const -> cache_item * +{ + auto it = items_by_symbol.find(name); + + if (it == items_by_symbol.end()) { + return nullptr; + } + + if (resolve_parent && it->second->is_virtual()) { + return (cache_item *) it->second->get_parent(*this); + } + + return it->second; +} + +auto symcache::add_dependency(int id_from, std::string_view to, int virtual_id_from) -> void +{ + g_assert(id_from >= 0 && id_from < (gint) items_by_id.size()); + const auto &source = items_by_id[id_from]; + g_assert(source.get() != nullptr); + + source->deps.emplace_back(nullptr, + std::string(to), + id_from, + -1); + + + if (virtual_id_from >= 0) { + g_assert(virtual_id_from < (gint) items_by_id.size()); + /* We need that for settings id propagation */ + const auto &vsource = items_by_id[virtual_id_from]; + g_assert(vsource.get() != nullptr); + vsource->deps.emplace_back(nullptr, + std::string(to), + -1, + virtual_id_from); + } +} + +auto symcache::resort() -> void +{ + auto log_func = RSPAMD_LOG_FUNC; + auto ord = 
std::make_shared<order_generation>(filters.size() + + prefilters.size() + + composites.size() + + postfilters.size() + + idempotent.size() + + connfilters.size() + + classifiers.size(), + cur_order_gen); + + for (auto &it: filters) { + if (it) { + total_hits += it->st->total_hits; + /* Unmask topological order */ + it->order = 0; + ord->d.emplace_back(it->getptr()); + } + } + + enum class tsort_mask { + PERM, + TEMP + }; + + constexpr auto tsort_unmask = [](cache_item *it) -> auto { + return (it->order & ~((1u << 31) | (1u << 30))); + }; + + /* Recursive topological sort helper */ + const auto tsort_visit = [&](cache_item *it, unsigned cur_order, auto &&rec) { + constexpr auto tsort_mark = [](cache_item *it, tsort_mask how) { + switch (how) { + case tsort_mask::PERM: + it->order |= (1u << 31); + break; + case tsort_mask::TEMP: + it->order |= (1u << 30); + break; + } + }; + constexpr auto tsort_is_marked = [](cache_item *it, tsort_mask how) { + switch (how) { + case tsort_mask::PERM: + return (it->order & (1u << 31)); + case tsort_mask::TEMP: + return (it->order & (1u << 30)); + } + + return 100500u; /* Because fuck compilers, that's why */ + }; + + if (tsort_is_marked(it, tsort_mask::PERM)) { + if (cur_order > tsort_unmask(it)) { + /* Need to recalculate the whole chain */ + it->order = cur_order; /* That also removes all masking */ + } + else { + /* We are fine, stop DFS */ + return; + } + } + else if (tsort_is_marked(it, tsort_mask::TEMP)) { + msg_err_cache_lambda("cyclic dependencies found when checking '%s'!", + it->symbol.c_str()); + return; + } + + tsort_mark(it, tsort_mask::TEMP); + msg_debug_cache_lambda("visiting node: %s (%d)", it->symbol.c_str(), cur_order); + + for (const auto &dep: it->deps) { + msg_debug_cache_lambda("visiting dep: %s (%d)", dep.item->symbol.c_str(), cur_order + 1); + rec(dep.item, cur_order + 1, rec); + } + + it->order = cur_order; + tsort_mark(it, tsort_mask::PERM); + }; + /* + * Topological sort + */ + total_hits = 0; + auto 
used_items = ord->d.size(); + + for (const auto &it: ord->d) { + if (it->order == 0) { + tsort_visit(it.get(), 0, tsort_visit); + } + } + + + /* Main sorting comparator */ + constexpr auto score_functor = [](auto w, auto f, auto t) -> auto { + auto time_alpha = 1.0, weight_alpha = 0.1, freq_alpha = 0.01; + + return ((w > 0.0 ? w : weight_alpha) * (f > 0.0 ? f : freq_alpha) / + (t > time_alpha ? t : time_alpha)); + }; + + auto cache_order_cmp = [&](const auto &it1, const auto &it2) -> auto { + constexpr const auto topology_mult = 1e7, + priority_mult = 1e6, + augmentations1_mult = 1e5; + auto w1 = tsort_unmask(it1.get()) * topology_mult, + w2 = tsort_unmask(it2.get()) * topology_mult; + + w1 += it1->priority * priority_mult; + w2 += it2->priority * priority_mult; + w1 += it1->get_augmentation_weight() * augmentations1_mult; + w2 += it2->get_augmentation_weight() * augmentations1_mult; + + auto avg_freq = ((double) total_hits / used_items); + auto avg_weight = (total_weight / used_items); + auto f1 = (double) it1->st->total_hits / avg_freq; + auto f2 = (double) it2->st->total_hits / avg_freq; + auto weight1 = std::fabs(it1->st->weight) / avg_weight; + auto weight2 = std::fabs(it2->st->weight) / avg_weight; + auto t1 = it1->st->avg_time; + auto t2 = it2->st->avg_time; + w1 += score_functor(weight1, f1, t1); + w2 += score_functor(weight2, f2, t2); + + return w1 > w2; + }; + + std::stable_sort(std::begin(ord->d), std::end(ord->d), cache_order_cmp); + /* + * Here lives some ugly legacy! + * We have several filters classes, connfilters, prefilters, filters... etc + * + * Our order is meaningful merely for filters, but we have to add other classes + * to understand if those symbols are checked or disabled. + * We can disable symbols for almost everything but not for virtual symbols. 
+ * The rule of thumb is that if a symbol has explicit parent, then it is a + * virtual symbol that follows it's special rules + */ + + /* + * We enrich ord with all other symbol types without any sorting, + * as it is done in another place + */ + constexpr auto append_items_vec = [](const auto &vec, auto &out) { + for (const auto &it: vec) { + if (it) { + out.emplace_back(it->getptr()); + } + } + }; + + append_items_vec(connfilters, ord->d); + append_items_vec(prefilters, ord->d); + append_items_vec(postfilters, ord->d); + append_items_vec(idempotent, ord->d); + append_items_vec(composites, ord->d); + append_items_vec(classifiers, ord->d); + + /* After sorting is done, we can assign all elements in the by_symbol hash */ + for (const auto [i, it]: rspamd::enumerate(ord->d)) { + ord->by_symbol.emplace(it->get_name(), i); + ord->by_cache_id[it->id] = i; + } + /* Finally set the current order */ + std::swap(ord, items_by_order); +} + +auto symcache::add_symbol_with_callback(std::string_view name, + int priority, + symbol_func_t func, + void *user_data, + int flags_and_type) -> int +{ + auto real_type_pair_maybe = item_type_from_c(flags_and_type); + + if (!real_type_pair_maybe.has_value()) { + msg_err_cache("incompatible flags when adding %s: %s", name.data(), + real_type_pair_maybe.error().c_str()); + return -1; + } + + auto real_type_pair = real_type_pair_maybe.value(); + + if (real_type_pair.first != symcache_item_type::FILTER) { + real_type_pair.second |= SYMBOL_TYPE_NOSTAT; + } + if (real_type_pair.second & (SYMBOL_TYPE_GHOST | SYMBOL_TYPE_CALLBACK)) { + real_type_pair.second |= SYMBOL_TYPE_NOSTAT; + } + + if (real_type_pair.first == symcache_item_type::VIRTUAL) { + msg_err_cache("trying to add virtual symbol %s as real (no parent)", name.data()); + return -1; + } + + std::string static_string_name; + + if (name.empty()) { + static_string_name = fmt::format("AUTO_{}_{}", (void *) func, user_data); + msg_warn_cache("trying to add an empty symbol name, convert it to 
%s", + static_string_name.c_str()); + } + else { + static_string_name = name; + } + + if (real_type_pair.first == symcache_item_type::IDEMPOTENT && priority != 0) { + msg_warn_cache("priority has been set for idempotent symbol %s: %d", + static_string_name.c_str(), priority); + } + + if ((real_type_pair.second & SYMBOL_TYPE_FINE) && priority == 0) { + /* Adjust priority for negative weighted symbols */ + priority = 1; + } + + if (items_by_symbol.contains(static_string_name)) { + msg_err_cache("duplicate symbol name: %s", static_string_name.data()); + return -1; + } + + auto id = items_by_id.size(); + + auto item = cache_item::create_with_function(static_pool, id, + std::move(static_string_name), + priority, func, user_data, + real_type_pair.first, real_type_pair.second); + + items_by_symbol.emplace(item->get_name(), item.get()); + get_item_specific_vector(*item).push_back(item.get()); + items_by_id.emplace(id, std::move(item));// Takes ownership + + if (!(real_type_pair.second & SYMBOL_TYPE_NOSTAT)) { + cksum = t1ha(name.data(), name.size(), cksum); + stats_symbols_count++; + } + + return id; +} + +auto symcache::add_virtual_symbol(std::string_view name, int parent_id, int flags_and_type) -> int +{ + if (name.empty()) { + msg_err_cache("cannot register a virtual symbol with no name; qed"); + return -1; + } + + auto real_type_pair_maybe = item_type_from_c(flags_and_type); + + if (!real_type_pair_maybe.has_value()) { + msg_err_cache("incompatible flags when adding %s: %s", name.data(), + real_type_pair_maybe.error().c_str()); + return -1; + } + + auto real_type_pair = real_type_pair_maybe.value(); + + if (items_by_symbol.contains(name)) { + msg_err_cache("duplicate symbol name: %s", name.data()); + return -1; + } + + if (items_by_id.size() < parent_id) { + msg_err_cache("parent id %d is out of bounds for virtual symbol %s", parent_id, name.data()); + return -1; + } + + auto id = items_by_id.size(); + + auto item = cache_item::create_with_virtual(static_pool, + id, + 
std::string{name}, + parent_id, real_type_pair.first, real_type_pair.second); + const auto &parent = items_by_id[parent_id].get(); + parent->add_child(item.get()); + items_by_symbol.emplace(item->get_name(), item.get()); + get_item_specific_vector(*item).push_back(item.get()); + items_by_id.emplace(id, std::move(item));// Takes ownership + + return id; +} + +auto symcache::set_peak_cb(int cbref) -> void +{ + if (peak_cb != -1) { + luaL_unref(L, LUA_REGISTRYINDEX, peak_cb); + } + + peak_cb = cbref; + msg_info_cache("registered peak callback"); +} + +auto symcache::add_delayed_condition(std::string_view sym, int cbref) -> void +{ + delayed_conditions->emplace_back(sym, cbref, (lua_State *) cfg->lua_state); +} + +auto symcache::validate(bool strict) -> bool +{ + total_weight = 1.0; + + for (auto &pair: items_by_symbol) { + auto &item = pair.second; + auto ghost = item->st->weight == 0 ? true : false; + auto skipped = !ghost; + + if (item->is_scoreable() && g_hash_table_lookup(cfg->symbols, item->symbol.c_str()) == nullptr) { + if (!std::isnan(cfg->unknown_weight)) { + item->st->weight = cfg->unknown_weight; + auto *s = rspamd_mempool_alloc0_type(static_pool, + struct rspamd_symbol); + /* Legit as we actually never modify this data */ + s->name = (char *) item->symbol.c_str(); + s->weight_ptr = &item->st->weight; + g_hash_table_insert(cfg->symbols, (void *) s->name, (void *) s); + + msg_info_cache("adding unknown symbol %s with weight: %.2f", + item->symbol.c_str(), cfg->unknown_weight); + ghost = false; + skipped = false; + } + else { + skipped = true; + } + } + else { + skipped = false; + } + + if (!ghost && skipped) { + if (!(item->flags & SYMBOL_TYPE_SKIPPED)) { + item->flags |= SYMBOL_TYPE_SKIPPED; + msg_warn_cache("symbol %s has no score registered, skip its check", + item->symbol.c_str()); + } + } + + if (ghost) { + msg_debug_cache("symbol %s is registered as ghost symbol, it won't be inserted " + "to any metric", + item->symbol.c_str()); + } + + if 
(item->st->weight < 0 && item->priority == 0) { + item->priority++; + } + + if (item->is_virtual()) { + if (!(item->flags & SYMBOL_TYPE_GHOST)) { + auto *parent = const_cast<cache_item *>(item->get_parent(*this)); + + if (parent == nullptr) { + item->resolve_parent(*this); + parent = const_cast<cache_item *>(item->get_parent(*this)); + } + + if (::fabs(parent->st->weight) < ::fabs(item->st->weight)) { + parent->st->weight = item->st->weight; + } + + auto p1 = ::abs(item->priority); + auto p2 = ::abs(parent->priority); + + if (p1 != p2) { + parent->priority = MAX(p1, p2); + item->priority = parent->priority; + } + } + } + + total_weight += fabs(item->st->weight); + } + + /* Now check each metric item and find corresponding symbol in a cache */ + auto ret = true; + GHashTableIter it; + void *k, *v; + g_hash_table_iter_init(&it, cfg->symbols); + + while (g_hash_table_iter_next(&it, &k, &v)) { + auto ignore_symbol = false; + auto sym_def = (struct rspamd_symbol *) v; + + if (sym_def && (sym_def->flags & + (RSPAMD_SYMBOL_FLAG_IGNORE_METRIC | RSPAMD_SYMBOL_FLAG_DISABLED))) { + ignore_symbol = true; + } + + if (!ignore_symbol) { + if (!items_by_symbol.contains((const char *) k)) { + msg_debug_cache( + "symbol '%s' has its score defined but there is no " + "corresponding rule registered", + k); + } + } + else if (sym_def->flags & RSPAMD_SYMBOL_FLAG_DISABLED) { + auto item = get_item_by_name_mut((const char *) k, false); + + if (item) { + item->enabled = FALSE; + } + } + } + + return ret; +} + +auto symcache::counters() const -> ucl_object_t * +{ + auto *top = ucl_object_typed_new(UCL_ARRAY); + constexpr const auto round_float = [](const auto x, const int digits) -> auto { + const auto power10 = ::pow(10, digits); + return (::floor(x * power10) / power10); + }; + + for (auto &pair: items_by_symbol) { + auto &item = pair.second; + auto symbol = pair.first; + + auto *obj = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(obj, ucl_object_fromlstring(symbol.data(), 
symbol.size()), + "symbol", 0, false); + + if (item->is_virtual()) { + if (!(item->flags & SYMBOL_TYPE_GHOST)) { + const auto *parent = item->get_parent(*this); + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(item->st->weight, 3)), + "weight", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(parent->st->avg_frequency, 3)), + "frequency", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromint(parent->st->total_hits), + "hits", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(parent->st->avg_time, 3)), + "time", 0, false); + } + else { + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(item->st->weight, 3)), + "weight", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(0.0), + "frequency", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(0.0), + "hits", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(0.0), + "time", 0, false); + } + } + else { + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(item->st->weight, 3)), + "weight", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(item->st->avg_frequency, 3)), + "frequency", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromint(item->st->total_hits), + "hits", 0, false); + ucl_object_insert_key(obj, + ucl_object_fromdouble(round_float(item->st->avg_time, 3)), + "time", 0, false); + } + + ucl_array_append(top, obj); + } + + return top; +} + +auto symcache::periodic_resort(struct ev_loop *ev_loop, double cur_time, double last_resort) -> void +{ + for (const auto &item: filters) { + + if (item->update_counters_check_peak(L, ev_loop, cur_time, last_resort)) { + auto cur_value = (item->st->total_hits - item->last_count) / + (cur_time - last_resort); + auto cur_err = (item->st->avg_frequency - cur_value); + cur_err *= cur_err; + msg_debug_cache("peak found for %s is %.2f, avg: %.2f, " + "stddev: %.2f, error: %.2f, peaks: %d", + 
item->symbol.c_str(), cur_value, + item->st->avg_frequency, + item->st->stddev_frequency, + cur_err, + item->frequency_peaks); + + if (peak_cb != -1) { + struct ev_loop **pbase; + + lua_rawgeti(L, LUA_REGISTRYINDEX, peak_cb); + pbase = (struct ev_loop **) lua_newuserdata(L, sizeof(*pbase)); + *pbase = ev_loop; + rspamd_lua_setclass(L, "rspamd{ev_base}", -1); + lua_pushlstring(L, item->symbol.c_str(), item->symbol.size()); + lua_pushnumber(L, item->st->avg_frequency); + lua_pushnumber(L, ::sqrt(item->st->stddev_frequency)); + lua_pushnumber(L, cur_value); + lua_pushnumber(L, cur_err); + + if (lua_pcall(L, 6, 0, 0) != 0) { + msg_info_cache("call to peak function for %s failed: %s", + item->symbol.c_str(), lua_tostring(L, -1)); + lua_pop(L, 1); + } + } + } + } +} + +symcache::~symcache() +{ + if (peak_cb != -1) { + luaL_unref(L, LUA_REGISTRYINDEX, peak_cb); + } +} + +auto symcache::maybe_resort() -> bool +{ + if (items_by_order->generation_id != cur_order_gen) { + /* + * Cache has been modified, need to resort it + */ + msg_info_cache("symbols cache has been modified since last check:" + " old id: %ud, new id: %ud", + items_by_order->generation_id, cur_order_gen); + resort(); + + return true; + } + + return false; +} + +auto symcache::get_item_specific_vector(const cache_item &it) -> symcache::items_ptr_vec & +{ + switch (it.get_type()) { + case symcache_item_type::CONNFILTER: + return connfilters; + case symcache_item_type::FILTER: + return filters; + case symcache_item_type::IDEMPOTENT: + return idempotent; + case symcache_item_type::PREFILTER: + return prefilters; + case symcache_item_type::POSTFILTER: + return postfilters; + case symcache_item_type::COMPOSITE: + return composites; + case symcache_item_type::CLASSIFIER: + return classifiers; + case symcache_item_type::VIRTUAL: + return virtual_symbols; + } + + RSPAMD_UNREACHABLE; +} + +auto symcache::process_settings_elt(struct rspamd_config_settings_elt *elt) -> void +{ + + auto id = elt->id; + + if 
(elt->symbols_disabled) { + /* Process denied symbols */ + ucl_object_iter_t iter = nullptr; + const ucl_object_t *cur; + + while ((cur = ucl_object_iterate(elt->symbols_disabled, &iter, true)) != NULL) { + const auto *sym = ucl_object_key(cur); + auto *item = get_item_by_name_mut(sym, false); + + if (item != nullptr) { + if (item->is_virtual()) { + /* + * Virtual symbols are special: + * we ignore them in symcache but prevent them from being + * inserted. + */ + item->forbidden_ids.add_id(id); + msg_debug_cache("deny virtual symbol %s for settings %ud (%s); " + "parent can still be executed", + sym, id, elt->name); + } + else { + /* Normal symbol, disable it */ + item->forbidden_ids.add_id(id); + msg_debug_cache("deny symbol %s for settings %ud (%s)", + sym, id, elt->name); + } + } + else { + msg_warn_cache("cannot find a symbol to disable %s " + "when processing settings %ud (%s)", + sym, id, elt->name); + } + } + } + + if (elt->symbols_enabled) { + ucl_object_iter_t iter = nullptr; + const ucl_object_t *cur; + + while ((cur = ucl_object_iterate(elt->symbols_enabled, &iter, true)) != nullptr) { + /* Here, we resolve parent and explicitly allow it */ + const auto *sym = ucl_object_key(cur); + + auto *item = get_item_by_name_mut(sym, false); + + if (item != nullptr) { + if (item->is_virtual()) { + auto *parent = get_item_by_name_mut(sym, true); + + if (parent) { + if (elt->symbols_disabled && + ucl_object_lookup(elt->symbols_disabled, parent->symbol.data())) { + msg_err_cache("conflict in %s: cannot enable disabled symbol %s, " + "wanted to enable symbol %s", + elt->name, parent->symbol.data(), sym); + continue; + } + + parent->exec_only_ids.add_id(id); + msg_debug_cache("allow just execution of symbol %s for settings %ud (%s)", + parent->symbol.data(), id, elt->name); + } + } + + item->allowed_ids.add_id(id); + msg_debug_cache("allow execution of symbol %s for settings %ud (%s)", + sym, id, elt->name); + } + else { + msg_warn_cache("cannot find a symbol to enable 
%s " + "when processing settings %ud (%s)", + sym, id, elt->name); + } + } + } +} + +auto symcache::get_max_timeout(std::vector<std::pair<double, const cache_item *>> &elts) const -> double +{ + auto accumulated_timeout = 0.0; + auto log_func = RSPAMD_LOG_FUNC; + ankerl::unordered_dense::set<const cache_item *> seen_items; + + auto get_item_timeout = [](cache_item *it) { + return it->get_numeric_augmentation("timeout").value_or(0.0); + }; + + /* This function returns the timeout for an item and all it's dependencies */ + auto get_filter_timeout = [&](cache_item *it, auto self) -> double { + auto own_timeout = get_item_timeout(it); + auto max_child_timeout = 0.0; + + for (const auto &dep: it->deps) { + auto cld_timeout = self(dep.item, self); + + if (cld_timeout > max_child_timeout) { + max_child_timeout = cld_timeout; + } + } + + return own_timeout + max_child_timeout; + }; + + /* For prefilters and postfilters, we just care about priorities */ + auto pre_postfilter_iter = [&](const items_ptr_vec &vec) -> double { + auto saved_priority = -1; + auto max_timeout = 0.0, added_timeout = 0.0; + const cache_item *max_elt = nullptr; + for (const auto &it: vec) { + if (it->priority != saved_priority && max_elt != nullptr && max_timeout > 0) { + if (!seen_items.contains(max_elt)) { + accumulated_timeout += max_timeout; + added_timeout += max_timeout; + + msg_debug_cache_lambda("added %.2f to the timeout (%.2f) as the priority has changed (%d -> %d); " + "symbol: %s", + max_timeout, accumulated_timeout, saved_priority, it->priority, + max_elt->symbol.c_str()); + elts.emplace_back(max_timeout, max_elt); + seen_items.insert(max_elt); + } + max_timeout = 0; + saved_priority = it->priority; + max_elt = nullptr; + } + + auto timeout = get_item_timeout(it); + + if (timeout > max_timeout) { + max_timeout = timeout; + max_elt = it; + } + } + + if (max_elt != nullptr && max_timeout > 0) { + if (!seen_items.contains(max_elt)) { + accumulated_timeout += max_timeout; + added_timeout += 
max_timeout; + + msg_debug_cache_lambda("added %.2f to the timeout (%.2f) end of processing; " + "symbol: %s", + max_timeout, accumulated_timeout, + max_elt->symbol.c_str()); + elts.emplace_back(max_timeout, max_elt); + seen_items.insert(max_elt); + } + } + + return added_timeout; + }; + + auto prefilters_timeout = pre_postfilter_iter(this->prefilters); + + /* For normal filters, we check the maximum chain of the dependencies + * This function might have O(N^2) complexity if all symbols are in a single + * dependencies chain. But it is not the case in practice + */ + double max_filters_timeout = 0; + for (const auto &it: this->filters) { + auto timeout = get_filter_timeout(it, get_filter_timeout); + + if (timeout > max_filters_timeout) { + max_filters_timeout = timeout; + if (!seen_items.contains(it)) { + elts.emplace_back(timeout, it); + seen_items.insert(it); + } + } + } + + accumulated_timeout += max_filters_timeout; + + auto postfilters_timeout = pre_postfilter_iter(this->postfilters); + auto idempotent_timeout = pre_postfilter_iter(this->idempotent); + + /* Sort in decreasing order by timeout */ + std::stable_sort(std::begin(elts), std::end(elts), + [](const auto &p1, const auto &p2) { + return p1.first > p2.first; + }); + + msg_debug_cache("overall cache timeout: %.2f, %.2f from prefilters," + " %.2f from postfilters, %.2f from idempotent filters," + " %.2f from normal filters", + accumulated_timeout, prefilters_timeout, postfilters_timeout, + idempotent_timeout, max_filters_timeout); + + return accumulated_timeout; +} + +}// namespace rspamd::symcache
\ No newline at end of file diff --git a/src/libserver/symcache/symcache_internal.hxx b/src/libserver/symcache/symcache_internal.hxx new file mode 100644 index 0000000..255a4b1 --- /dev/null +++ b/src/libserver/symcache/symcache_internal.hxx @@ -0,0 +1,652 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Internal C++ structures and classes for symcache + */ + +#ifndef RSPAMD_SYMCACHE_INTERNAL_HXX +#define RSPAMD_SYMCACHE_INTERNAL_HXX +#pragma once + +#include <cmath> +#include <cstdlib> +#include <cstdint> +#include <utility> +#include <vector> +#include <string> +#include <string_view> +#include <memory> +#include <variant> + +#include "rspamd_symcache.h" +#include "contrib/libev/ev.h" +#include "contrib/ankerl/unordered_dense.h" +#include "contrib/expected/expected.hpp" +#include "cfg_file.h" + +#include "symcache_id_list.hxx" + +#define msg_err_cache(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "symcache", log_tag(), \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_err_cache_lambda(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "symcache", log_tag(), \ + log_func, \ + __VA_ARGS__) +#define msg_err_cache_task(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "symcache", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_cache(...) 
rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "symcache", log_tag(), \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_cache(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "symcache", log_tag(), \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_cache(...) rspamd_conditional_debug_fast(NULL, NULL, \ + ::rspamd::symcache::rspamd_symcache_log_id, "symcache", log_tag(), \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_cache_lambda(...) rspamd_conditional_debug_fast(NULL, NULL, \ + ::rspamd::symcache::rspamd_symcache_log_id, "symcache", log_tag(), \ + log_func, \ + __VA_ARGS__) +#define msg_debug_cache_task(...) rspamd_conditional_debug_fast(NULL, NULL, \ + ::rspamd::symcache::rspamd_symcache_log_id, "symcache", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_cache_task_lambda(...) rspamd_conditional_debug_fast(NULL, NULL, \ + ::rspamd::symcache::rspamd_symcache_log_id, "symcache", task->task_pool->tag.uid, \ + log_func, \ + __VA_ARGS__) + +struct lua_State; + +namespace rspamd::symcache { + +/* Defined in symcache_impl.cxx */ +extern int rspamd_symcache_log_id; + +static const std::uint8_t symcache_magic[8] = {'r', 's', 'c', 2, 0, 0, 0, 0}; + +struct symcache_header { + std::uint8_t magic[8]; + unsigned int nitems; + std::uint8_t checksum[64]; + std::uint8_t unused[128]; +}; + +struct cache_item; +using cache_item_ptr = std::shared_ptr<cache_item>; + +/** + * This structure is intended to keep the current ordering for all symbols + * It is designed to be shared among all tasks and keep references to the real + * symbols. + * If some symbol has been added or removed to the symbol cache, it will not affect + * the current order, and it will only be regenerated for the subsequent tasks. + * This allows safe and no copy sharing and keeping track of all symbols in the + * cache runtime. 
+ */ +struct order_generation { + /* All items ordered */ + std::vector<cache_item_ptr> d; + /* Mapping from symbol name to the position in the order array */ + ankerl::unordered_dense::map<std::string_view, unsigned int> by_symbol; + /* Mapping from symbol id to the position in the order array */ + ankerl::unordered_dense::map<unsigned int, unsigned int> by_cache_id; + /* It matches cache->generation_id; if not, a fresh ordering is required */ + unsigned int generation_id; + + explicit order_generation(std::size_t nelts, unsigned id) + : generation_id(id) + { + d.reserve(nelts); + by_symbol.reserve(nelts); + by_cache_id.reserve(nelts); + } + + auto size() const -> auto + { + return d.size(); + } +}; + +using order_generation_ptr = std::shared_ptr<order_generation>; + + +struct delayed_cache_dependency { + std::string from; + std::string to; + + delayed_cache_dependency(std::string_view _from, std::string_view _to) + : from(_from), to(_to) + { + } +}; + +struct delayed_cache_condition { + std::string sym; + int cbref; + lua_State *L; + +public: + delayed_cache_condition(std::string_view sym, int cbref, lua_State *L) + : sym(sym), cbref(cbref), L(L) + { + } +}; + +class delayed_symbol_elt { +private: + std::variant<std::string, rspamd_regexp_t *> content; + +public: + /* Disable copy */ + delayed_symbol_elt() = delete; + delayed_symbol_elt(const delayed_symbol_elt &) = delete; + delayed_symbol_elt &operator=(const delayed_symbol_elt &) = delete; + /* Enable move */ + delayed_symbol_elt(delayed_symbol_elt &&other) noexcept = default; + delayed_symbol_elt &operator=(delayed_symbol_elt &&other) noexcept = default; + + explicit delayed_symbol_elt(std::string_view elt) noexcept + { + if (!elt.empty() && elt[0] == '/') { + /* Possibly regexp */ + auto *re = rspamd_regexp_new_len(elt.data(), elt.size(), nullptr, nullptr); + + if (re != nullptr) { + std::get<rspamd_regexp_t *>(content) = re; + } + else { + std::get<std::string>(content) = elt; + } + } + else { + 
std::get<std::string>(content) = elt; + } + } + + ~delayed_symbol_elt() + { + if (std::holds_alternative<rspamd_regexp_t *>(content)) { + rspamd_regexp_unref(std::get<rspamd_regexp_t *>(content)); + } + } + + auto matches(std::string_view what) const -> bool + { + return std::visit([&](auto &elt) { + using T = typeof(elt); + if constexpr (std::is_same_v<T, rspamd_regexp_t *>) { + if (rspamd_regexp_match(elt, what.data(), what.size(), false)) { + return true; + } + } + else if constexpr (std::is_same_v<T, std::string>) { + return elt == what; + } + + return false; + }, + content); + } + + auto to_string_view() const -> std::string_view + { + return std::visit([&](auto &elt) { + using T = typeof(elt); + if constexpr (std::is_same_v<T, rspamd_regexp_t *>) { + return std::string_view{rspamd_regexp_get_pattern(elt)}; + } + else if constexpr (std::is_same_v<T, std::string>) { + return std::string_view{elt}; + } + + return std::string_view{}; + }, + content); + } +}; + +struct delayed_symbol_elt_equal { + using is_transparent = void; + auto operator()(const delayed_symbol_elt &a, const delayed_symbol_elt &b) const + { + return a.to_string_view() == b.to_string_view(); + } + auto operator()(const delayed_symbol_elt &a, const std::string_view &b) const + { + return a.to_string_view() == b; + } + auto operator()(const std::string_view &a, const delayed_symbol_elt &b) const + { + return a == b.to_string_view(); + } +}; + +struct delayed_symbol_elt_hash { + using is_transparent = void; + auto operator()(const delayed_symbol_elt &a) const + { + return ankerl::unordered_dense::hash<std::string_view>()(a.to_string_view()); + } + auto operator()(const std::string_view &a) const + { + return ankerl::unordered_dense::hash<std::string_view>()(a); + } +}; + +class symcache { +private: + using items_ptr_vec = std::vector<cache_item *>; + /* Map indexed by symbol name: all symbols must have unique names, so this map holds ownership */ + ankerl::unordered_dense::map<std::string_view, 
cache_item *> items_by_symbol; + ankerl::unordered_dense::map<int, cache_item_ptr> items_by_id; + + /* Items sorted into some order */ + order_generation_ptr items_by_order; + unsigned int cur_order_gen; + + /* Specific vectors for execution/iteration */ + items_ptr_vec connfilters; + items_ptr_vec prefilters; + items_ptr_vec filters; + items_ptr_vec postfilters; + items_ptr_vec composites; + items_ptr_vec idempotent; + items_ptr_vec classifiers; + items_ptr_vec virtual_symbols; + + /* These are stored within pointer to clean up after init */ + std::unique_ptr<std::vector<delayed_cache_dependency>> delayed_deps; + std::unique_ptr<std::vector<delayed_cache_condition>> delayed_conditions; + /* Delayed statically enabled or disabled symbols */ + using delayed_symbol_names = ankerl::unordered_dense::set<delayed_symbol_elt, + delayed_symbol_elt_hash, delayed_symbol_elt_equal>; + std::unique_ptr<delayed_symbol_names> disabled_symbols; + std::unique_ptr<delayed_symbol_names> enabled_symbols; + + rspamd_mempool_t *static_pool; + std::uint64_t cksum; + double total_weight; + std::size_t stats_symbols_count; + +private: + std::uint64_t total_hits; + + struct rspamd_config *cfg; + lua_State *L; + double reload_time; + double last_profile; + +private: + int peak_cb; + int cache_id; + +private: + /* Internal methods */ + auto load_items() -> bool; + auto resort() -> void; + auto get_item_specific_vector(const cache_item &) -> items_ptr_vec &; + /* Helper for g_hash_table_foreach */ + static auto metric_connect_cb(void *k, void *v, void *ud) -> void; + +public: + explicit symcache(struct rspamd_config *cfg) + : cfg(cfg) + { + /* XXX: do we need a special pool for symcache? 
I don't think so */ + static_pool = cfg->cfg_pool; + reload_time = cfg->cache_reload_time; + total_hits = 1; + total_weight = 1.0; + cksum = 0xdeadbabe; + peak_cb = -1; + cache_id = rspamd_random_uint64_fast(); + L = (lua_State *) cfg->lua_state; + delayed_conditions = std::make_unique<std::vector<delayed_cache_condition>>(); + delayed_deps = std::make_unique<std::vector<delayed_cache_dependency>>(); + } + + virtual ~symcache(); + + /** + * Saves items on disk (if possible) + * @return + */ + auto save_items() const -> bool; + + /** + * Get an item by ID + * @param id + * @param resolve_parent + * @return + */ + auto get_item_by_id(int id, bool resolve_parent) const -> const cache_item *; + auto get_item_by_id_mut(int id, bool resolve_parent) const -> cache_item *; + /** + * Get an item by it's name + * @param name + * @param resolve_parent + * @return + */ + auto get_item_by_name(std::string_view name, bool resolve_parent) const -> const cache_item *; + /** + * Get an item by it's name, mutable pointer + * @param name + * @param resolve_parent + * @return + */ + auto get_item_by_name_mut(std::string_view name, bool resolve_parent) const -> cache_item *; + + /** + * Add a direct dependency + * @param id_from + * @param to + * @param virtual_id_from + * @return + */ + auto add_dependency(int id_from, std::string_view to, int virtual_id_from) -> void; + + /** + * Add a delayed dependency between symbols that will be resolved on the init stage + * @param from + * @param to + */ + auto add_delayed_dependency(std::string_view from, std::string_view to) -> void + { + if (!delayed_deps) { + delayed_deps = std::make_unique<std::vector<delayed_cache_dependency>>(); + } + + delayed_deps->emplace_back(from, to); + } + + /** + * Adds a symbol to the list of the disabled symbols + * @param sym + * @return + */ + auto disable_symbol_delayed(std::string_view sym) -> bool + { + if (!disabled_symbols) { + disabled_symbols = std::make_unique<delayed_symbol_names>(); + } + + if 
(!disabled_symbols->contains(sym)) { + disabled_symbols->emplace(sym); + + return true; + } + + return false; + } + + /** + * Adds a symbol to the list of the enabled symbols + * @param sym + * @return + */ + auto enable_symbol_delayed(std::string_view sym) -> bool + { + if (!enabled_symbols) { + enabled_symbols = std::make_unique<delayed_symbol_names>(); + } + + if (!enabled_symbols->contains(sym)) { + enabled_symbols->emplace(sym); + + return true; + } + + return false; + } + + /** + * Initialises the symbols cache, must be called after all symbols are added + * and the config file is loaded + */ + auto init() -> bool; + + /** + * Log helper that returns cfg checksum + * @return + */ + auto log_tag() const -> const char * + { + return cfg->checksum; + } + + /** + * Helper to return a memory pool associated with the cache + * @return + */ + auto get_pool() const + { + return static_pool; + } + + /** + * A method to add a generic symbol with a callback to couple with C API + * @param name name of the symbol, unlike C API it must be "" for callback only (compat) symbols, in this case an automatic name is generated + * @param priority + * @param func + * @param user_data + * @param flags_and_type mix of flags and type in a messy C enum + * @return id of a new symbol or -1 in case of failure + */ + auto add_symbol_with_callback(std::string_view name, + int priority, + symbol_func_t func, + void *user_data, + int flags_and_type) -> int; + /** + * A method to add a generic virtual symbol with no function associated + * @param name must have some value, or a fatal error will strike you + * @param parent_id if this param is -1 then this symbol is associated with nothing + * @param flags_and_type mix of flags and type in a messy C enum + * @return id of a new symbol or -1 in case of failure + */ + auto add_virtual_symbol(std::string_view name, int parent_id, + int flags_and_type) -> int; + + /** + * Sets a lua callback to be called on peaks in execution time + * @param 
cbref + */ + auto set_peak_cb(int cbref) -> void; + + /** + * Add a delayed condition for a symbol that might not be registered yet + * @param sym + * @param cbref + */ + auto add_delayed_condition(std::string_view sym, int cbref) -> void; + + /** + * Returns number of symbols that needs to be checked in statistical algorithm + * @return + */ + auto get_stats_symbols_count() const + { + return stats_symbols_count; + } + + /** + * Returns a checksum for the cache + * @return + */ + auto get_cksum() const + { + return cksum; + } + + /** + * Validate symbols in the cache + * @param strict + * @return + */ + auto validate(bool strict) -> bool; + + /** + * Returns counters for the cache + * @return + */ + auto counters() const -> ucl_object_t *; + + /** + * Adjusts stats of the cache for the periodic counter + */ + auto periodic_resort(struct ev_loop *ev_loop, double cur_time, double last_resort) -> void; + + /** + * A simple helper to get the reload time + * @return + */ + auto get_reload_time() const + { + return reload_time; + }; + + /** + * Iterate over all symbols using a specific functor + * @tparam Functor + * @param f + */ + template<typename Functor> + auto symbols_foreach(Functor f) -> void + { + for (const auto &sym_it: items_by_symbol) { + f(sym_it.second); + } + } + + /** + * Iterate over all composites using a specific functor + * @tparam Functor + * @param f + */ + template<typename Functor> + auto composites_foreach(Functor f) -> void + { + for (const auto &sym_it: composites) { + f(sym_it); + } + } + + /** + * Iterate over all composites using a specific functor + * @tparam Functor + * @param f + */ + template<typename Functor> + auto connfilters_foreach(Functor f) -> bool + { + return std::all_of(std::begin(connfilters), std::end(connfilters), + [&](const auto &sym_it) { + return f(sym_it); + }); + } + template<typename Functor> + auto prefilters_foreach(Functor f) -> bool + { + return std::all_of(std::begin(prefilters), std::end(prefilters), + 
[&](const auto &sym_it) { + return f(sym_it); + }); + } + template<typename Functor> + auto postfilters_foreach(Functor f) -> bool + { + return std::all_of(std::begin(postfilters), std::end(postfilters), + [&](const auto &sym_it) { + return f(sym_it); + }); + } + template<typename Functor> + auto idempotent_foreach(Functor f) -> bool + { + return std::all_of(std::begin(idempotent), std::end(idempotent), + [&](const auto &sym_it) { + return f(sym_it); + }); + } + template<typename Functor> + auto filters_foreach(Functor f) -> bool + { + return std::all_of(std::begin(filters), std::end(filters), + [&](const auto &sym_it) { + return f(sym_it); + }); + } + + /** + * Resort cache if anything has been changed since last time + * @return + */ + auto maybe_resort() -> bool; + + /** + * Returns current set of items ordered for sharing ownership + * @return + */ + auto get_cache_order() const -> auto + { + return items_by_order; + } + + /** + * Get last profile timestamp + * @return + */ + auto get_last_profile() const -> auto + { + return last_profile; + } + + /** + * Sets last profile timestamp + * @param last_profile + * @return + */ + auto set_last_profile(double last_profile) + { + symcache::last_profile = last_profile; + } + + /** + * Process settings elt identified by id + * @param elt + */ + auto process_settings_elt(struct rspamd_config_settings_elt *elt) -> void; + + /** + * Returns maximum timeout that is requested by all rules + * @return + */ + auto get_max_timeout(std::vector<std::pair<double, const cache_item *>> &elts) const -> double; +}; + + +}// namespace rspamd::symcache + +#endif//RSPAMD_SYMCACHE_INTERNAL_HXX diff --git a/src/libserver/symcache/symcache_item.cxx b/src/libserver/symcache/symcache_item.cxx new file mode 100644 index 0000000..ac901f5 --- /dev/null +++ b/src/libserver/symcache/symcache_item.cxx @@ -0,0 +1,652 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lua/lua_common.h" +#include "symcache_internal.hxx" +#include "symcache_item.hxx" +#include "fmt/core.h" +#include "libserver/task.h" +#include "libutil/cxx/util.hxx" +#include <numeric> +#include <functional> + +namespace rspamd::symcache { + +enum class augmentation_value_type { + NO_VALUE, + STRING_VALUE, + NUMBER_VALUE, +}; + +struct augmentation_info { + int weight = 0; + int implied_flags = 0; + augmentation_value_type value_type = augmentation_value_type::NO_VALUE; +}; + +/* A list of internal augmentations that are known to Rspamd with their weight */ +static const auto known_augmentations = + ankerl::unordered_dense::map<std::string, augmentation_info, rspamd::smart_str_hash, rspamd::smart_str_equal>{ + {"passthrough", {.weight = 10, .implied_flags = SYMBOL_TYPE_IGNORE_PASSTHROUGH}}, + {"single_network", {.weight = 1, .implied_flags = 0}}, + {"no_network", {.weight = 0, .implied_flags = 0}}, + {"many_network", {.weight = 1, .implied_flags = 0}}, + {"important", {.weight = 5, .implied_flags = SYMBOL_TYPE_FINE}}, + {"timeout", { + .weight = 0, + .implied_flags = 0, + .value_type = augmentation_value_type::NUMBER_VALUE, + }}}; + +auto cache_item::get_parent(const symcache &cache) const -> const cache_item * +{ + if (is_virtual()) { + const auto &virtual_sp = std::get<virtual_item>(specific); + + return virtual_sp.get_parent(cache); + } + + return nullptr; +} + +auto cache_item::get_parent_mut(const symcache &cache) -> cache_item * +{ + if (is_virtual()) { + auto &virtual_sp = 
std::get<virtual_item>(specific); + + return virtual_sp.get_parent_mut(cache); + } + + return nullptr; +} + +auto cache_item::process_deps(const symcache &cache) -> void +{ + /* Allow logging macros to work */ + auto log_tag = [&]() { return cache.log_tag(); }; + + for (auto &dep: deps) { + msg_debug_cache("process real dependency %s on %s", symbol.c_str(), dep.sym.c_str()); + auto *dit = cache.get_item_by_name_mut(dep.sym, true); + + if (dep.vid >= 0) { + /* Case of the virtual symbol that depends on another (maybe virtual) symbol */ + const auto *vdit = cache.get_item_by_name(dep.sym, false); + + if (!vdit) { + if (dit) { + msg_err_cache("cannot add dependency from %s on %s: no dependency symbol registered", + dep.sym.c_str(), dit->symbol.c_str()); + } + } + else { + msg_debug_cache("process virtual dependency %s(%d) on %s(%d)", symbol.c_str(), + dep.vid, vdit->symbol.c_str(), vdit->id); + + unsigned nids = 0; + + /* Propagate ids */ + msg_debug_cache("check id propagation for dependency %s from %s", + symbol.c_str(), dit->symbol.c_str()); + + const auto *ids = dit->allowed_ids.get_ids(nids); + + if (nids > 0) { + msg_debug_cache("propagate allowed ids from %s to %s", + dit->symbol.c_str(), symbol.c_str()); + + allowed_ids.set_ids(ids, nids); + } + + ids = dit->forbidden_ids.get_ids(nids); + + if (nids > 0) { + msg_debug_cache("propagate forbidden ids from %s to %s", + dit->symbol.c_str(), symbol.c_str()); + + forbidden_ids.set_ids(ids, nids); + } + } + } + + if (dit != nullptr) { + if (!dit->is_filter()) { + /* + * Check sanity: + * - filters -> prefilter dependency is OK and always satisfied + * - postfilter -> (filter, prefilter) dep is ok + * - idempotent -> (any) dep is OK + * + * Otherwise, emit error + * However, even if everything is fine this dep is useless ¯\_(ツ)_/¯ + */ + auto ok_dep = false; + + if (dit->get_type() == type) { + ok_dep = true; + } + else if (type < dit->get_type()) { + ok_dep = true; + } + + if (!ok_dep) { + msg_err_cache("cannot add 
dependency from %s on %s: invalid symbol types", + dep.sym.c_str(), symbol.c_str()); + + continue; + } + } + else { + if (dit->id == id) { + msg_err_cache("cannot add dependency on self: %s -> %s " + "(resolved to %s)", + symbol.c_str(), dep.sym.c_str(), dit->symbol.c_str()); + } + else { + /* Create a reverse dep */ + if (is_virtual()) { + auto *parent = get_parent_mut(cache); + + if (parent) { + dit->rdeps.emplace_back(parent, parent->symbol, parent->id, -1); + dep.item = dit; + dep.id = dit->id; + + msg_debug_cache("added reverse dependency from %d on %d", parent->id, + dit->id); + } + } + else { + dep.item = dit; + dep.id = dit->id; + dit->rdeps.emplace_back(this, symbol, id, -1); + msg_debug_cache("added reverse dependency from %d on %d", id, + dit->id); + } + } + } + } + else if (dep.id >= 0) { + msg_err_cache("cannot find dependency on symbol %s for symbol %s", + dep.sym.c_str(), symbol.c_str()); + + continue; + } + } + + // Remove empty deps + deps.erase(std::remove_if(std::begin(deps), std::end(deps), + [](const auto &dep) { return !dep.item; }), + std::end(deps)); +} + +auto cache_item::resolve_parent(const symcache &cache) -> bool +{ + auto log_tag = [&]() { return cache.log_tag(); }; + + if (is_virtual()) { + auto &virt = std::get<virtual_item>(specific); + + if (virt.get_parent(cache)) { + msg_debug_cache("trying to resolve parent twice for %s", symbol.c_str()); + + return false; + } + + return virt.resolve_parent(cache); + } + else { + msg_warn_cache("trying to resolve a parent for non-virtual symbol %s", symbol.c_str()); + } + + return false; +} + +auto cache_item::update_counters_check_peak(lua_State *L, + struct ev_loop *ev_loop, + double cur_time, + double last_resort) -> bool +{ + auto ret = false; + static const double decay_rate = 0.25; + + st->total_hits += st->hits; + g_atomic_int_set(&st->hits, 0); + + if (last_count > 0) { + auto cur_value = (st->total_hits - last_count) / + (cur_time - last_resort); + 
rspamd_set_counter_ema(&st->frequency_counter, + cur_value, decay_rate); + st->avg_frequency = st->frequency_counter.mean; + st->stddev_frequency = st->frequency_counter.stddev; + + auto cur_err = (st->avg_frequency - cur_value); + cur_err *= cur_err; + + if (st->frequency_counter.number > 10 && + cur_err > ::sqrt(st->stddev_frequency) * 3) { + frequency_peaks++; + ret = true; + } + } + + last_count = st->total_hits; + + if (cd->number > 0) { + if (!is_virtual()) { + st->avg_time = cd->mean; + rspamd_set_counter_ema(&st->time_counter, + st->avg_time, decay_rate); + st->avg_time = st->time_counter.mean; + memset(cd, 0, sizeof(*cd)); + } + } + + return ret; +} + +auto cache_item::inc_frequency(const char *sym_name, symcache &cache) -> void +{ + if (sym_name && symbol != sym_name) { + if (is_filter()) { + const auto *children = get_children(); + if (children) { + /* Likely a callback symbol with some virtual symbol that needs to be adjusted */ + for (const auto &cld: *children) { + if (cld->get_name() == sym_name) { + cld->inc_frequency(sym_name, cache); + } + } + } + } + else { + /* Name not equal to symbol name, so we need to find the proper name */ + auto *another_item = cache.get_item_by_name_mut(sym_name, false); + if (another_item != nullptr) { + another_item->inc_frequency(sym_name, cache); + } + } + } + else { + /* Symbol and sym name are the same */ + g_atomic_int_inc(&st->hits); + } +} + +auto cache_item::get_type_str() const -> const char * +{ + switch (type) { + case symcache_item_type::CONNFILTER: + return "connfilter"; + case symcache_item_type::FILTER: + return "filter"; + case symcache_item_type::IDEMPOTENT: + return "idempotent"; + case symcache_item_type::PREFILTER: + return "prefilter"; + case symcache_item_type::POSTFILTER: + return "postfilter"; + case symcache_item_type::COMPOSITE: + return "composite"; + case symcache_item_type::CLASSIFIER: + return "classifier"; + case symcache_item_type::VIRTUAL: + return "virtual"; + } + + RSPAMD_UNREACHABLE; 
+} + +auto cache_item::is_allowed(struct rspamd_task *task, bool exec_only) const -> bool +{ + const auto *what = "execution"; + + if (!exec_only) { + what = "symbol insertion"; + } + + /* Static checks */ + if (!enabled || + (RSPAMD_TASK_IS_EMPTY(task) && !(flags & SYMBOL_TYPE_EMPTY)) || + (flags & SYMBOL_TYPE_MIME_ONLY && !RSPAMD_TASK_IS_MIME(task))) { + + if (!enabled) { + msg_debug_cache_task("skipping %s of %s as it is permanently disabled", + what, symbol.c_str()); + + return false; + } + else { + /* + * If we check merely execution (not insertion), then we disallow + * mime symbols for non mime tasks and vice versa + */ + if (exec_only) { + msg_debug_cache_task("skipping check of %s as it cannot be " + "executed for this task type", + symbol.c_str()); + + return FALSE; + } + } + } + + /* Settings checks */ + if (task->settings_elt != nullptr) { + if (forbidden_ids.check_id(task->settings_elt->id)) { + msg_debug_cache_task("deny %s of %s as it is forbidden for " + "settings id %ud", + what, + symbol.c_str(), + task->settings_elt->id); + + return false; + } + + if (!(flags & SYMBOL_TYPE_EXPLICIT_DISABLE)) { + if (!allowed_ids.check_id(task->settings_elt->id)) { + + if (task->settings_elt->policy == RSPAMD_SETTINGS_POLICY_IMPLICIT_ALLOW) { + msg_debug_cache_task("allow execution of %s settings id %ud " + "allows implicit execution of the symbols;", + symbol.c_str(), + id); + + return true; + } + + if (exec_only) { + /* + * Special case if any of our virtual children are enabled + */ + if (exec_only_ids.check_id(task->settings_elt->id)) { + return true; + } + } + + msg_debug_cache_task("deny %s of %s as it is not listed " + "as allowed for settings id %ud", + what, + symbol.c_str(), + task->settings_elt->id); + return false; + } + } + else { + msg_debug_cache_task("allow %s of %s for " + "settings id %ud as it can be only disabled explicitly", + what, + symbol.c_str(), + task->settings_elt->id); + } + } + else if (flags & SYMBOL_TYPE_EXPLICIT_ENABLE) { + 
msg_debug_cache_task("deny %s of %s as it must be explicitly enabled", + what, + symbol.c_str()); + return false; + } + + /* Allow all symbols with no settings id */ + return true; +} +
+/**
+ * Attach an augmentation (optionally with a value) to this item.
+ *
+ * Augmentations listed in `known_augmentations` contribute their weight, may imply extra
+ * symbol flags (OR-ed into `flags`) and define whether a value is forbidden (NO_VALUE),
+ * stored as a string (STRING_VALUE) or parsed as a number (NUMBER_VALUE). Unknown
+ * augmentations are stored with a zero weight.
+ *
+ * @param cache cache used for the logging tag
+ * @param augmentation augmentation name
+ * @param value optional value of the augmentation
+ * @return true if the augmentation has been stored; false on a duplicate, on a
+ * missing/unexpected value or when a numeric value cannot be parsed
+ */
+auto cache_item::add_augmentation(const symcache &cache, std::string_view augmentation,
+								  std::optional<std::string_view> value) -> bool
+{
+	auto log_tag = [&]() { return cache.log_tag(); };
+
+	if (augmentations.contains(augmentation)) {
+		msg_warn_cache("duplicate augmentation: %s", augmentation.data());
+
+		return false;
+	}
+
+	auto maybe_known = rspamd::find_map(known_augmentations, augmentation);
+
+	if (maybe_known.has_value()) {
+		auto &known_info = maybe_known.value().get();
+
+		if (known_info.implied_flags) {
+			if ((known_info.implied_flags & flags) == 0) {
+				msg_info_cache("added implied flags (%bd) for symbol %s as it has %s augmentation",
+							   known_info.implied_flags, symbol.data(), augmentation.data());
+				flags |= known_info.implied_flags;
+			}
+		}
+
+		if (known_info.value_type == augmentation_value_type::NO_VALUE) {
+			if (value.has_value()) {
+				msg_err_cache("value specified for augmentation %s, that has no value",
+							  augmentation.data());
+
+				return false;
+			}
+			return augmentations.try_emplace(augmentation, known_info.weight).second;
+		}
+		else {
+			if (!value.has_value()) {
+				msg_err_cache("value is not specified for augmentation %s, that requires explicit value",
+							  augmentation.data());
+
+				return false;
+			}
+
+			if (known_info.value_type == augmentation_value_type::STRING_VALUE) {
+				return augmentations.try_emplace(augmentation, std::string{value.value()},
+												 known_info.weight)
+					.second;
+			}
+			else if (known_info.value_type == augmentation_value_type::NUMBER_VALUE) {
+				/* I wish it was supported properly */
+				//auto conv_res = std::from_chars(value->data(), value->size(), num);
+				char numbuf[128], *endptr = nullptr;
+				/*
+				 * The value is a string_view, so it is not necessarily NUL terminated:
+				 * copy all of its bytes (bounded by the buffer, keeping one byte for the
+				 * terminator) and terminate explicitly, so that no digit is lost.
+				 */
+				const auto numlen = MIN(value->size(), sizeof(numbuf) - 1);
+				memcpy(numbuf, value->data(), numlen);
+				numbuf[numlen] = '\0';
+				auto num = g_ascii_strtod(numbuf, &endptr);
+
+				/* endptr == numbuf means that no conversion has been performed at all */
+				if (endptr == numbuf || fabs(num) >= G_MAXFLOAT || std::isnan(num)) {
+					msg_err_cache("value for augmentation %s is not numeric: %*s",
+								  augmentation.data(),
+								  (int) value->size(), value->data());
+					return false;
+				}
+
+				return augmentations.try_emplace(augmentation, num,
+												 known_info.weight)
+					.second;
+			}
+		}
+	}
+	else {
+		/* Exactly one argument per conversion: augmentation name, then symbol name */
+		msg_debug_cache("added unknown augmentation %s for symbol %s",
+						augmentation.data(), symbol.data());
+		return augmentations.try_emplace(augmentation, 0).second;
+	}
+
+	// Should not be reached
+	return false;
+}
+
+auto cache_item::get_augmentation_weight() const -> int +{ + return std::accumulate(std::begin(augmentations), std::end(augmentations), + 0, [](int acc, const auto &map_pair) { + return acc + map_pair.second.weight; + }); +} + +auto cache_item::get_numeric_augmentation(std::string_view name) const -> std::optional<double> +{ + const auto augmentation_value_maybe = rspamd::find_map(this->augmentations, name); + + if (augmentation_value_maybe.has_value()) { + const auto &augmentation = augmentation_value_maybe.value().get(); + + if (std::holds_alternative<double>(augmentation.value)) { + return std::get<double>(augmentation.value); + } + } + + return std::nullopt; +} + + +auto virtual_item::get_parent(const symcache &cache) const -> const cache_item * +{ + if (parent) { + return parent; + } + + return cache.get_item_by_id(parent_id, false); +} + +auto virtual_item::get_parent_mut(const symcache &cache) -> cache_item * +{ + if (parent) { + return parent; + } + + return const_cast<cache_item *>(cache.get_item_by_id(parent_id, false)); +} + +auto virtual_item::resolve_parent(const symcache &cache) -> bool +{ + if (parent) { + return false; + } + + auto item_ptr = cache.get_item_by_id(parent_id, true); + + if (item_ptr) { + parent = const_cast<cache_item *>(item_ptr); + + return true; + } + + return false; +} + +auto item_type_from_c(int type) -> tl::expected<std::pair<symcache_item_type, int>, std::string> +{ + constexpr const auto trivial_types = 
SYMBOL_TYPE_CONNFILTER | SYMBOL_TYPE_PREFILTER | SYMBOL_TYPE_POSTFILTER | SYMBOL_TYPE_IDEMPOTENT | SYMBOL_TYPE_COMPOSITE | SYMBOL_TYPE_CLASSIFIER | SYMBOL_TYPE_VIRTUAL; + + constexpr auto all_but_one_ty = [&](int type, int exclude_bit) -> auto { + return (type & trivial_types) & (trivial_types & ~exclude_bit); + }; + + if (type & trivial_types) { + auto check_trivial = [&](auto flag, + symcache_item_type ty) -> tl::expected<std::pair<symcache_item_type, int>, std::string> { + if (all_but_one_ty(type, flag)) { + return tl::make_unexpected(fmt::format("invalid flags for a symbol: {}", (int) type)); + } + + return std::make_pair(ty, type & ~flag); + }; + if (type & SYMBOL_TYPE_CONNFILTER) { + return check_trivial(SYMBOL_TYPE_CONNFILTER, symcache_item_type::CONNFILTER); + } + else if (type & SYMBOL_TYPE_PREFILTER) { + return check_trivial(SYMBOL_TYPE_PREFILTER, symcache_item_type::PREFILTER); + } + else if (type & SYMBOL_TYPE_POSTFILTER) { + return check_trivial(SYMBOL_TYPE_POSTFILTER, symcache_item_type::POSTFILTER); + } + else if (type & SYMBOL_TYPE_IDEMPOTENT) { + return check_trivial(SYMBOL_TYPE_IDEMPOTENT, symcache_item_type::IDEMPOTENT); + } + else if (type & SYMBOL_TYPE_COMPOSITE) { + return check_trivial(SYMBOL_TYPE_COMPOSITE, symcache_item_type::COMPOSITE); + } + else if (type & SYMBOL_TYPE_CLASSIFIER) { + return check_trivial(SYMBOL_TYPE_CLASSIFIER, symcache_item_type::CLASSIFIER); + } + else if (type & SYMBOL_TYPE_VIRTUAL) { + return check_trivial(SYMBOL_TYPE_VIRTUAL, symcache_item_type::VIRTUAL); + } + + return tl::make_unexpected(fmt::format("internal error: impossible flags combination: {}", (int) type)); + } + + /* Maybe check other flags combination here? 
*/ + return std::make_pair(symcache_item_type::FILTER, type); +} +
+/**
+ * Stage ordering of symbol types: `lhs < rhs` is true when `lhs` belongs to a later
+ * execution stage than `rhs` (earlier stages compare greater than later ones).
+ * The relation is irreflexive for the stage-bound types; the order involving the
+ * virtual kinds (classifier, composite, virtual) is not defined.
+ *
+ * @param lhs type of the dependent item
+ * @param rhs type of the item it is compared against
+ * @return true if `lhs` is a later stage than `rhs`
+ */
+bool operator<(symcache_item_type lhs, symcache_item_type rhs)
+{
+	auto ret = false;
+	switch (lhs) {
+	case symcache_item_type::CONNFILTER:
+		/* The earliest stage: never less than anything */
+		break;
+	case symcache_item_type::PREFILTER:
+		if (rhs == symcache_item_type::CONNFILTER) {
+			ret = true;
+		}
+		break;
+	case symcache_item_type::FILTER:
+		if (rhs == symcache_item_type::CONNFILTER || rhs == symcache_item_type::PREFILTER) {
+			ret = true;
+		}
+		break;
+	case symcache_item_type::POSTFILTER:
+		/* Later than everything but idempotent; must not compare less than itself */
+		if (rhs != symcache_item_type::IDEMPOTENT && rhs != symcache_item_type::POSTFILTER) {
+			ret = true;
+		}
+		break;
+	case symcache_item_type::IDEMPOTENT:
+		/* The latest stage: less than every other stage-bound type */
+		if (rhs == symcache_item_type::CONNFILTER || rhs == symcache_item_type::PREFILTER ||
+			rhs == symcache_item_type::FILTER || rhs == symcache_item_type::POSTFILTER) {
+			ret = true;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+item_condition::~item_condition() +{ + if (cb != -1 && L != nullptr) { + luaL_unref(L, LUA_REGISTRYINDEX, cb); + } +} + +auto item_condition::check(std::string_view sym_name, struct rspamd_task *task) const -> bool +{ + if (cb != -1 && L != nullptr) { + auto ret = false; + + lua_pushcfunction(L, &rspamd_lua_traceback); + auto err_idx = lua_gettop(L); + + lua_rawgeti(L, LUA_REGISTRYINDEX, cb); + rspamd_lua_task_push(L, task); + + if (lua_pcall(L, 1, 1, err_idx) != 0) { + msg_info_task("call to condition for %s failed: %s", + sym_name.data(), lua_tostring(L, -1)); + } + else { + ret = lua_toboolean(L, -1); + } + + lua_settop(L, err_idx - 1); + + return ret; + } + + return true; +} + +}// namespace rspamd::symcache diff --git a/src/libserver/symcache/symcache_item.hxx b/src/libserver/symcache/symcache_item.hxx new file mode 100644 index 0000000..a60213a --- /dev/null +++ b/src/libserver/symcache/symcache_item.hxx @@ -0,0 +1,561 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_SYMCACHE_ITEM_HXX +#define RSPAMD_SYMCACHE_ITEM_HXX + +#pragma once + +#include <utility> +#include <vector> +#include <string> +#include <string_view> +#include <memory> +#include <variant> +#include <algorithm> +#include <optional> + +#include "rspamd_symcache.h" +#include "symcache_id_list.hxx" +#include "contrib/expected/expected.hpp" +#include "contrib/libev/ev.h" +#include "symcache_runtime.hxx" +#include "libutil/cxx/hash_util.hxx" + +namespace rspamd::symcache { + +class symcache; +struct cache_item; +using cache_item_ptr = std::shared_ptr<cache_item>; + +enum class symcache_item_type { + CONNFILTER, /* Executed on connection stage */ + PREFILTER, /* Executed before all filters */ + FILTER, /* Normal symbol with a callback */ + POSTFILTER, /* Executed after all filters */ + IDEMPOTENT, /* Executed after postfilters, cannot change results */ + CLASSIFIER, /* A virtual classifier symbol */ + COMPOSITE, /* A virtual composite symbol */ + VIRTUAL, /* A virtual symbol... */ +}; + +/* + * Compare item types: earlier stages symbols are > than later stages symbols + * Order for virtual stuff is not defined. 
+ */ +bool operator<(symcache_item_type lhs, symcache_item_type rhs); + +constexpr static auto item_type_to_str(symcache_item_type t) -> const char * +{ + switch (t) { + case symcache_item_type::CONNFILTER: + return "connfilter"; + case symcache_item_type::PREFILTER: + return "prefilter"; + case symcache_item_type::FILTER: + return "filter"; + case symcache_item_type::POSTFILTER: + return "postfilter"; + case symcache_item_type::IDEMPOTENT: + return "idempotent"; + case symcache_item_type::CLASSIFIER: + return "classifier"; + case symcache_item_type::COMPOSITE: + return "composite"; + case symcache_item_type::VIRTUAL: + return "virtual"; + } +} + +/** + * This is a public helper to convert a legacy C type to a more static type + * @param type input type as a C enum + * @return pair of type safe symcache_item_type + the remaining flags or an error + */ +auto item_type_from_c(int type) -> tl::expected<std::pair<symcache_item_type, int>, std::string>; + +struct item_condition { +private: + lua_State *L = nullptr; + int cb = -1; + +public: + explicit item_condition(lua_State *L_, int cb_) noexcept + : L(L_), cb(cb_) + { + } + item_condition(item_condition &&other) noexcept + { + *this = std::move(other); + } + /* Make it move only */ + item_condition(const item_condition &) = delete; + item_condition &operator=(item_condition &&other) noexcept + { + std::swap(other.L, L); + std::swap(other.cb, cb); + return *this; + } + ~item_condition(); + + auto check(std::string_view sym_name, struct rspamd_task *task) const -> bool; +}; + +class normal_item { +private: + symbol_func_t func = nullptr; + void *user_data = nullptr; + std::vector<cache_item *> virtual_children; + std::vector<item_condition> conditions; + +public: + explicit normal_item(symbol_func_t _func, void *_user_data) + : func(_func), user_data(_user_data) + { + } + + auto add_condition(lua_State *L, int cbref) -> void + { + conditions.emplace_back(L, cbref); + } + + auto call(struct rspamd_task *task, struct 
rspamd_symcache_dynamic_item *item) const -> void + { + func(task, item, user_data); + } + + auto check_conditions(std::string_view sym_name, struct rspamd_task *task) const -> bool + { + return std::all_of(std::begin(conditions), std::end(conditions), + [&](const auto &cond) { return cond.check(sym_name, task); }); + } + + auto get_cbdata() const -> auto + { + return user_data; + } + + auto add_child(cache_item *ptr) -> void + { + virtual_children.push_back(ptr); + } + + auto get_childen() const -> const std::vector<cache_item *> & + { + return virtual_children; + } +}; + +class virtual_item { +private: + int parent_id = -1; + cache_item *parent = nullptr; + +public: + explicit virtual_item(int _parent_id) + : parent_id(_parent_id) + { + } + + auto get_parent(const symcache &cache) const -> const cache_item *; + auto get_parent_mut(const symcache &cache) -> cache_item *; + + auto resolve_parent(const symcache &cache) -> bool; +}; + +struct cache_dependency { + cache_item *item; /* Real dependency */ + std::string sym; /* Symbolic dep name */ + int id; /* Real from */ + int vid; /* Virtual from */ +public: + /* Default piecewise constructor */ + explicit cache_dependency(cache_item *_item, std::string _sym, int _id, int _vid) + : item(_item), sym(std::move(_sym)), id(_id), vid(_vid) + { + } +}; + +/* + * Used to store augmentation values + */ +struct item_augmentation { + std::variant<std::monostate, std::string, double> value; + int weight; + + explicit item_augmentation(int weight) + : value(std::monostate{}), weight(weight) + { + } + explicit item_augmentation(std::string str_value, int weight) + : value(str_value), weight(weight) + { + } + explicit item_augmentation(double double_value, int weight) + : value(double_value), weight(weight) + { + } +}; + +struct cache_item : std::enable_shared_from_this<cache_item> { + /* The following fields will live in shared memory */ + struct rspamd_symcache_item_stat *st = nullptr; + struct rspamd_counter_data *cd = nullptr; 
+ + /* Unique id - counter */ + int id; + std::uint64_t last_count = 0; + std::string symbol; + symcache_item_type type; + int flags; + + /* Condition of execution */ + bool enabled = true; + + /* Priority */ + int priority = 0; + /* Topological order */ + unsigned int order = 0; + int frequency_peaks = 0; + + /* Specific data for virtual and callback symbols */ + std::variant<normal_item, virtual_item> specific; + + /* Settings ids */ + id_list allowed_ids; + /* Allows execution but not symbols insertion */ + id_list exec_only_ids; + id_list forbidden_ids; + + /* Set of augmentations */ + ankerl::unordered_dense::map<std::string, item_augmentation, + rspamd::smart_str_hash, rspamd::smart_str_equal> + augmentations; + + /* Dependencies */ + std::vector<cache_dependency> deps; + /* Reverse dependencies */ + std::vector<cache_dependency> rdeps; + +public: + /** + * Create a normal item with a callback + * @param name + * @param priority + * @param func + * @param user_data + * @param type + * @param flags + * @return + */ + template<typename T> + static auto create_with_function(rspamd_mempool_t *pool, + int id, + T &&name, + int priority, + symbol_func_t func, + void *user_data, + symcache_item_type type, + int flags) -> cache_item_ptr + { + return std::shared_ptr<cache_item>(new cache_item(pool, + id, std::forward<T>(name), priority, + func, user_data, + type, flags)); + } + + /** + * Create a virtual item + * @param name + * @param priority + * @param parent + * @param type + * @param flags + * @return + */ + template<typename T> + static auto create_with_virtual(rspamd_mempool_t *pool, + int id, + T &&name, + int parent, + symcache_item_type type, + int flags) -> cache_item_ptr + { + return std::shared_ptr<cache_item>(new cache_item(pool, id, std::forward<T>(name), + parent, type, flags)); + } + + /** + * Share ownership on the item + * @return + */ + auto getptr() -> cache_item_ptr + { + return shared_from_this(); + } + + /** + * Process and resolve dependencies 
for the item + * @param cache + */ + auto process_deps(const symcache &cache) -> void; + + auto is_virtual() const -> bool + { + return std::holds_alternative<virtual_item>(specific); + } + + auto is_filter() const -> bool + { + return std::holds_alternative<normal_item>(specific) && + (type == symcache_item_type::FILTER); + } + + /** + * Returns true if a symbol should have some score defined + * @return + */ + auto is_scoreable() const -> bool + { + return !(flags & SYMBOL_TYPE_CALLBACK) && + ((type == symcache_item_type::FILTER) || + is_virtual() || + (type == symcache_item_type::COMPOSITE) || + (type == symcache_item_type::CLASSIFIER)); + } + + auto is_ghost() const -> bool + { + return flags & SYMBOL_TYPE_GHOST; + } + + auto get_parent(const symcache &cache) const -> const cache_item *; + auto get_parent_mut(const symcache &cache) -> cache_item *; + + auto resolve_parent(const symcache &cache) -> bool; + + auto get_type() const -> auto + { + return type; + } + + auto get_type_str() const -> const char *; + + auto get_name() const -> const std::string & + { + return symbol; + } + + auto get_flags() const -> auto + { + return flags; + }; + + auto add_condition(lua_State *L, int cbref) -> bool + { + if (!is_virtual()) { + auto &normal = std::get<normal_item>(specific); + normal.add_condition(L, cbref); + + return true; + } + + return false; + } + + auto update_counters_check_peak(lua_State *L, + struct ev_loop *ev_loop, + double cur_time, + double last_resort) -> bool; + + /** + * Increase frequency for a symbol + */ + auto inc_frequency(const char *sym_name, symcache &cache) -> void; + + /** + * Check if an item is allowed to be executed not checking item conditions + * @param task + * @param exec_only + * @return + */ + auto is_allowed(struct rspamd_task *task, bool exec_only) const -> bool; + + /** + * Returns callback data + * @return + */ + auto get_cbdata() const -> void * + { + if (std::holds_alternative<normal_item>(specific)) { + const auto &filter_data 
= std::get<normal_item>(specific); + + return filter_data.get_cbdata(); + } + + return nullptr; + } + + /** + * Check all conditions for an item + * @param task + * @return + */ + auto check_conditions(struct rspamd_task *task) const -> auto + { + if (std::holds_alternative<normal_item>(specific)) { + const auto &filter_data = std::get<normal_item>(specific); + + return filter_data.check_conditions(symbol, task); + } + + return false; + } + + auto call(struct rspamd_task *task, cache_dynamic_item *dyn_item) const -> void + { + if (std::holds_alternative<normal_item>(specific)) { + const auto &filter_data = std::get<normal_item>(specific); + + filter_data.call(task, (struct rspamd_symcache_dynamic_item *) dyn_item); + } + } + + /** + * Add an augmentation to the item, returns `true` if augmentation is known and unique, false otherwise + * @param augmentation + * @return + */ + auto add_augmentation(const symcache &cache, std::string_view augmentation, + std::optional<std::string_view> value) -> bool; + + /** + * Return sum weight of all known augmentations + * @return + */ + auto get_augmentation_weight() const -> int; + + /** + * Returns numeric augmentation value + * @param name + * @return + */ + auto get_numeric_augmentation(std::string_view name) const -> std::optional<double>; + + /** + * Returns string augmentation value + * @param name + * @return + */ + auto get_string_augmentation(std::string_view name) const -> std::optional<std::string_view>; + + /** + * Add a virtual symbol as a child of some normal symbol + * @param ptr + */ + auto add_child(cache_item *ptr) -> void + { + if (std::holds_alternative<normal_item>(specific)) { + auto &filter_data = std::get<normal_item>(specific); + + filter_data.add_child(ptr); + } + else { + g_assert("add child is called for a virtual symbol!"); + } + } + + /** + * Returns virtual children for a normal item + * @param ptr + * @return + */ + auto get_children() const -> const std::vector<cache_item *> * + { + if 
(std::holds_alternative<normal_item>(specific)) { + const auto &filter_data = std::get<normal_item>(specific); + + return &filter_data.get_childen(); + } + + return nullptr; + } + +private: + /** + * Constructor for a normal symbols with callback + * @param name + * @param _priority + * @param func + * @param user_data + * @param _type + * @param _flags + */ + cache_item(rspamd_mempool_t *pool, + int _id, + std::string &&name, + int _priority, + symbol_func_t func, + void *user_data, + symcache_item_type _type, + int _flags) + : id(_id), + symbol(std::move(name)), + type(_type), + flags(_flags), + priority(_priority), + specific(normal_item{func, user_data}) + { + /* These structures are kept trivial, so they need to be explicitly reset */ + forbidden_ids.reset(); + allowed_ids.reset(); + exec_only_ids.reset(); + st = rspamd_mempool_alloc0_shared_type(pool, std::remove_pointer_t<decltype(st)>); + cd = rspamd_mempool_alloc0_shared_type(pool, std::remove_pointer_t<decltype(cd)>); + } + + /** + * Constructor for a virtual symbol + * @param name + * @param _priority + * @param parent + * @param _type + * @param _flags + */ + cache_item(rspamd_mempool_t *pool, + int _id, + std::string &&name, + int parent, + symcache_item_type _type, + int _flags) + : id(_id), + symbol(std::move(name)), + type(_type), + flags(_flags), + specific(virtual_item{parent}) + { + /* These structures are kept trivial, so they need to be explicitly reset */ + forbidden_ids.reset(); + allowed_ids.reset(); + exec_only_ids.reset(); + st = rspamd_mempool_alloc0_shared_type(pool, std::remove_pointer_t<decltype(st)>); + cd = rspamd_mempool_alloc0_shared_type(pool, std::remove_pointer_t<decltype(cd)>); + } +}; + +}// namespace rspamd::symcache + +#endif//RSPAMD_SYMCACHE_ITEM_HXX diff --git a/src/libserver/symcache/symcache_periodic.hxx b/src/libserver/symcache/symcache_periodic.hxx new file mode 100644 index 0000000..535956b --- /dev/null +++ b/src/libserver/symcache/symcache_periodic.hxx @@ -0,0 +1,89 
@@ +/*- + * Copyright 2022 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef RSPAMD_SYMCACHE_PERIODIC_HXX +#define RSPAMD_SYMCACHE_PERIODIC_HXX + +#pragma once + +#include "config.h" +#include "contrib/libev/ev.h" +#include "symcache_internal.hxx" +#include "worker_util.h" + +namespace rspamd::symcache { +struct cache_refresh_cbdata { +private: + symcache *cache; + struct ev_loop *event_loop; + struct rspamd_worker *w; + double reload_time; + double last_resort; + ev_timer resort_ev; + +public: + explicit cache_refresh_cbdata(symcache *_cache, + struct ev_loop *_ev_base, + struct rspamd_worker *_w) + : cache(_cache), event_loop(_ev_base), w(_w) + { + auto log_tag = [&]() { return cache->log_tag(); }; + last_resort = rspamd_get_ticks(TRUE); + reload_time = cache->get_reload_time(); + auto tm = rspamd_time_jitter(reload_time, 0); + msg_debug_cache("next reload in %.2f seconds", tm); + ev_timer_init(&resort_ev, cache_refresh_cbdata::resort_cb, + tm, tm); + resort_ev.data = (void *) this; + ev_timer_start(event_loop, &resort_ev); + rspamd_mempool_add_destructor(cache->get_pool(), + cache_refresh_cbdata::refresh_dtor, (void *) this); + } + + static void refresh_dtor(void *d) + { + auto *cbdata = (struct cache_refresh_cbdata *) d; + delete cbdata; + } + + static void resort_cb(EV_P_ ev_timer *w, int _revents) + { + auto *cbdata = (struct cache_refresh_cbdata *) w->data; + + auto log_tag = [&]() { return cbdata->cache->log_tag(); 
}; + + if (rspamd_worker_is_primary_controller(cbdata->w)) { + /* Plan new event */ + auto tm = rspamd_time_jitter(cbdata->reload_time, 0); + msg_debug_cache("resort symbols cache, next reload in %.2f seconds", tm); + cbdata->resort_ev.repeat = tm; + ev_timer_again(EV_A_ w); + auto cur_time = rspamd_get_ticks(FALSE); + cbdata->cache->periodic_resort(cbdata->event_loop, cur_time, cbdata->last_resort); + cbdata->last_resort = cur_time; + } + } + +private: + ~cache_refresh_cbdata() + { + ev_timer_stop(event_loop, &resort_ev); + } +}; +}// namespace rspamd::symcache + +#endif//RSPAMD_SYMCACHE_PERIODIC_HXX diff --git a/src/libserver/symcache/symcache_runtime.cxx b/src/libserver/symcache/symcache_runtime.cxx new file mode 100644 index 0000000..d9622d8 --- /dev/null +++ b/src/libserver/symcache/symcache_runtime.cxx @@ -0,0 +1,823 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "symcache_internal.hxx" +#include "symcache_item.hxx" +#include "symcache_runtime.hxx" +#include "libutil/cxx/util.hxx" +#include "libserver/task.h" +#include "libmime/scan_result.h" +#include "utlist.h" +#include "libserver/worker_util.h" +#include <limits> +#include <cmath> + +namespace rspamd::symcache { + +/* At least once per minute */ +constexpr static const auto PROFILE_MAX_TIME = 60.0; +/* For messages larger than 2Mb enable profiling */ +constexpr static const auto PROFILE_MESSAGE_SIZE_THRESHOLD = 1024ul * 1024 * 2; +/* Enable profile at least once per this amount of messages processed */ +constexpr static const auto PROFILE_PROBABILITY = 0.01; + +auto symcache_runtime::create(struct rspamd_task *task, symcache &cache) -> symcache_runtime * +{ + cache.maybe_resort(); + + auto &&cur_order = cache.get_cache_order(); + auto *checkpoint = (symcache_runtime *) rspamd_mempool_alloc0(task->task_pool, + sizeof(symcache_runtime) + + sizeof(struct cache_dynamic_item) * cur_order->size()); + + checkpoint->order = cache.get_cache_order(); + + /* Calculate profile probability */ + ev_now_update_if_cheap(task->event_loop); + ev_tstamp now = ev_now(task->event_loop); + checkpoint->profile_start = now; + checkpoint->lim = rspamd_task_get_required_score(task, task->result); + + if ((cache.get_last_profile() == 0.0 || now > cache.get_last_profile() + PROFILE_MAX_TIME) || + (task->msg.len >= PROFILE_MESSAGE_SIZE_THRESHOLD) || + (rspamd_random_double_fast() >= (1 - PROFILE_PROBABILITY))) { + msg_debug_cache_task("enable profiling of symbols for task"); + checkpoint->profile = true; + cache.set_last_profile(now); + } + + task->symcache_runtime = (void *) checkpoint; + + return checkpoint; +} + +auto symcache_runtime::process_settings(struct rspamd_task *task, const symcache &cache) -> bool +{ + if (!task->settings) { + msg_err_task("`process_settings` is called with no settings"); + return false; + } + + const auto *wl = ucl_object_lookup(task->settings, 
"whitelist"); + + if (wl != nullptr) { + msg_info_task("task is whitelisted"); + task->flags |= RSPAMD_TASK_FLAG_SKIP; + return true; + } + + auto already_disabled = false; + + auto process_group = [&](const ucl_object_t *gr_obj, auto functor) -> void { + ucl_object_iter_t it = nullptr; + const ucl_object_t *cur; + + if (gr_obj) { + while ((cur = ucl_iterate_object(gr_obj, &it, true)) != nullptr) { + if (ucl_object_type(cur) == UCL_STRING) { + auto *gr = (struct rspamd_symbols_group *) + g_hash_table_lookup(task->cfg->groups, + ucl_object_tostring(cur)); + + if (gr) { + GHashTableIter gr_it; + void *k, *v; + g_hash_table_iter_init(&gr_it, gr->symbols); + + while (g_hash_table_iter_next(&gr_it, &k, &v)) { + functor((const char *) k); + } + } + } + } + } + }; + + ucl_object_iter_t it = nullptr; + const ucl_object_t *cur; + + const auto *enabled = ucl_object_lookup(task->settings, "symbols_enabled"); + + if (enabled) { + msg_debug_cache_task("disable all symbols as `symbols_enabled` is found"); + /* Disable all symbols but selected */ + disable_all_symbols(SYMBOL_TYPE_EXPLICIT_DISABLE); + already_disabled = true; + it = nullptr; + + while ((cur = ucl_iterate_object(enabled, &it, true)) != nullptr) { + enable_symbol(task, cache, ucl_object_tostring(cur)); + } + } + + /* Enable groups of symbols */ + enabled = ucl_object_lookup(task->settings, "groups_enabled"); + if (enabled && !already_disabled) { + disable_all_symbols(SYMBOL_TYPE_EXPLICIT_DISABLE); + } + process_group(enabled, [&](const char *sym) { + enable_symbol(task, cache, sym); + }); + + const auto *disabled = ucl_object_lookup(task->settings, "symbols_disabled"); + + if (disabled) { + it = nullptr; + + while ((cur = ucl_iterate_object(disabled, &it, true)) != nullptr) { + disable_symbol(task, cache, ucl_object_tostring(cur)); + } + } + + /* Disable groups of symbols */ + disabled = ucl_object_lookup(task->settings, "groups_disabled"); + process_group(disabled, [&](const char *sym) { + disable_symbol(task, 
cache, sym); + }); + + /* Update required limit */ + lim = rspamd_task_get_required_score(task, task->result); + + return false; +} + +auto symcache_runtime::disable_all_symbols(int skip_mask) -> void +{ + for (auto [i, item]: rspamd::enumerate(order->d)) { + auto *dyn_item = &dynamic_items[i]; + + if (!(item->get_flags() & skip_mask)) { + dyn_item->finished = true; + dyn_item->started = true; + } + } +} + +auto symcache_runtime::disable_symbol(struct rspamd_task *task, const symcache &cache, std::string_view name) -> bool +{ + const auto *item = cache.get_item_by_name(name, true); + + if (item != nullptr) { + + auto *dyn_item = get_dynamic_item(item->id); + + if (dyn_item) { + dyn_item->finished = true; + dyn_item->started = true; + msg_debug_cache_task("disable execution of %s", name.data()); + + return true; + } + else { + msg_debug_cache_task("cannot disable %s: id not found %d", name.data(), item->id); + } + } + else { + msg_debug_cache_task("cannot disable %s: symbol not found", name.data()); + } + + return false; +} + +auto symcache_runtime::enable_symbol(struct rspamd_task *task, const symcache &cache, std::string_view name) -> bool +{ + const auto *item = cache.get_item_by_name(name, true); + + if (item != nullptr) { + + auto *dyn_item = get_dynamic_item(item->id); + + if (dyn_item) { + dyn_item->finished = false; + dyn_item->started = false; + msg_debug_cache_task("enable execution of %s", name.data()); + + return true; + } + else { + msg_debug_cache_task("cannot enable %s: id not found %d", name.data(), item->id); + } + } + else { + msg_debug_cache_task("cannot enable %s: symbol not found", name.data()); + } + + return false; +} + +auto symcache_runtime::is_symbol_checked(const symcache &cache, std::string_view name) -> bool +{ + const auto *item = cache.get_item_by_name(name, true); + + if (item != nullptr) { + + auto *dyn_item = get_dynamic_item(item->id); + + if (dyn_item) { + return dyn_item->started; + } + } + + return false; +} + +auto 
symcache_runtime::is_symbol_enabled(struct rspamd_task *task, const symcache &cache, std::string_view name) -> bool +{ + + const auto *item = cache.get_item_by_name(name, true); + if (item) { + + if (!item->is_allowed(task, true)) { + return false; + } + else { + auto *dyn_item = get_dynamic_item(item->id); + + if (dyn_item) { + if (dyn_item->started) { + /* Already started */ + return false; + } + + if (!item->is_virtual()) { + return std::get<normal_item>(item->specific).check_conditions(item->symbol, task); + } + } + else { + /* Unknown item */ + msg_debug_cache_task("cannot enable %s: symbol not found", name.data()); + } + } + } + + return true; +} + +auto symcache_runtime::get_dynamic_item(int id) const -> cache_dynamic_item * +{ + + /* Not found in the cache, do a hash lookup */ + auto our_id_maybe = rspamd::find_map(order->by_cache_id, id); + + if (our_id_maybe) { + return &dynamic_items[our_id_maybe.value()]; + } + + return nullptr; +} + +auto symcache_runtime::process_symbols(struct rspamd_task *task, symcache &cache, unsigned int stage) -> bool +{ + msg_debug_cache_task("symbols processing stage at pass: %d", stage); + + if (RSPAMD_TASK_IS_SKIPPED(task)) { + return true; + } + + switch (stage) { + case RSPAMD_TASK_STAGE_CONNFILTERS: + case RSPAMD_TASK_STAGE_PRE_FILTERS: + case RSPAMD_TASK_STAGE_POST_FILTERS: + case RSPAMD_TASK_STAGE_IDEMPOTENT: + return process_pre_postfilters(task, cache, + rspamd_session_events_pending(task->s), stage); + break; + + case RSPAMD_TASK_STAGE_FILTERS: + return process_filters(task, cache, rspamd_session_events_pending(task->s)); + break; + + default: + g_assert_not_reached(); + } +} + +auto symcache_runtime::process_pre_postfilters(struct rspamd_task *task, + symcache &cache, + int start_events, + unsigned int stage) -> bool +{ + auto saved_priority = std::numeric_limits<int>::min(); + auto all_done = true; + auto log_func = RSPAMD_LOG_FUNC; + auto compare_functor = +[](int a, int b) { return a < b; }; + + auto proc_func = 
[&](cache_item *item) { + /* + * We can safely ignore all pre/postfilters except idempotent ones and + * those that are marked as ignore passthrough result + */ + if (stage != RSPAMD_TASK_STAGE_IDEMPOTENT && + !(item->flags & SYMBOL_TYPE_IGNORE_PASSTHROUGH)) { + if (check_metric_limit(task)) { + msg_debug_cache_task_lambda("task has already the result being set, ignore further checks"); + + return true; + } + } + + auto dyn_item = get_dynamic_item(item->id); + + if (!dyn_item->started && !dyn_item->finished) { + if (has_slow) { + /* Delay */ + has_slow = false; + + return false; + } + + if (saved_priority == std::numeric_limits<int>::min()) { + saved_priority = item->priority; + } + else { + if (compare_functor(item->priority, saved_priority) && + rspamd_session_events_pending(task->s) > start_events) { + /* + * Delay further checks as we have higher + * priority filters to be processed + */ + return false; + } + } + + return process_symbol(task, cache, item, dyn_item); + } + + /* Continue processing */ + return true; + }; + + switch (stage) { + case RSPAMD_TASK_STAGE_CONNFILTERS: + all_done = cache.connfilters_foreach(proc_func); + break; + case RSPAMD_TASK_STAGE_PRE_FILTERS: + all_done = cache.prefilters_foreach(proc_func); + break; + case RSPAMD_TASK_STAGE_POST_FILTERS: + compare_functor = +[](int a, int b) { return a > b; }; + all_done = cache.postfilters_foreach(proc_func); + break; + case RSPAMD_TASK_STAGE_IDEMPOTENT: + compare_functor = +[](int a, int b) { return a > b; }; + all_done = cache.idempotent_foreach(proc_func); + break; + default: + g_error("invalid invocation"); + break; + } + + return all_done; +} + +auto symcache_runtime::process_filters(struct rspamd_task *task, symcache &cache, int start_events) -> bool +{ + auto all_done = true; + auto log_func = RSPAMD_LOG_FUNC; + auto has_passtrough = false; + + for (const auto [idx, item]: rspamd::enumerate(order->d)) { + /* Exclude all non filters */ + if (item->type != symcache_item_type::FILTER) { + /* 
+ * We use breaking the loop as we append non-filters to the end of the list + * so, it is safe to stop processing immediately + */ + break; + } + + if (!(item->flags & (SYMBOL_TYPE_FINE | SYMBOL_TYPE_IGNORE_PASSTHROUGH))) { + if (has_passtrough || check_metric_limit(task)) { + msg_debug_cache_task_lambda("task has already the result being set, ignore further checks"); + has_passtrough = true; + /* Skip this item */ + continue; + } + } + + auto dyn_item = &dynamic_items[idx]; + + if (!dyn_item->started) { + all_done = false; + + if (!check_item_deps(task, cache, item.get(), + dyn_item, false)) { + msg_debug_cache_task("blocked execution of %d(%s) unless deps are " + "resolved", + item->id, item->symbol.c_str()); + + continue; + } + + process_symbol(task, cache, item.get(), dyn_item); + + if (has_slow) { + /* Delay */ + has_slow = false; + + return false; + } + } + } + + return all_done; +} + +auto symcache_runtime::process_symbol(struct rspamd_task *task, symcache &cache, cache_item *item, + cache_dynamic_item *dyn_item) -> bool +{ + if (item->type == symcache_item_type::CLASSIFIER || item->type == symcache_item_type::COMPOSITE) { + /* Classifiers are special :( */ + return true; + } + + if (rspamd_session_blocked(task->s)) { + /* + * We cannot add new events as session is either destroyed or + * being cleaned up. 
+ */ + return true; + } + + g_assert(!item->is_virtual()); + if (dyn_item->started) { + /* + * This can actually happen when deps span over different layers + */ + return dyn_item->finished; + } + + /* Check has been started */ + dyn_item->started = true; + auto check = true; + + if (!item->is_allowed(task, true) || !item->check_conditions(task)) { + check = false; + } + + if (check) { + msg_debug_cache_task("execute %s, %d; symbol type = %s", item->symbol.data(), + item->id, item_type_to_str(item->type)); + + if (profile) { + ev_now_update_if_cheap(task->event_loop); + dyn_item->start_msec = (ev_now(task->event_loop) - + profile_start) * + 1e3; + } + dyn_item->async_events = 0; + cur_item = dyn_item; + items_inflight++; + /* Callback now must finalize itself */ + item->call(task, dyn_item); + cur_item = nullptr; + + if (items_inflight == 0) { + return true; + } + + if (dyn_item->async_events == 0 && !dyn_item->finished) { + msg_err_cache_task("critical error: item %s has no async events pending, " + "but it is not finalised", + item->symbol.data()); + g_assert_not_reached(); + } + + return false; + } + else { + dyn_item->finished = true; + } + + return true; +} + +auto symcache_runtime::check_metric_limit(struct rspamd_task *task) -> bool +{ + if (task->flags & RSPAMD_TASK_FLAG_PASS_ALL) { + return false; + } + + /* Check score limit */ + if (!std::isnan(lim)) { + if (task->result->score > lim) { + return true; + } + } + + if (task->result->passthrough_result != nullptr) { + /* We also need to check passthrough results */ + auto *pr = task->result->passthrough_result; + DL_FOREACH(task->result->passthrough_result, pr) + { + struct rspamd_action_config *act_config = + rspamd_find_action_config_for_action(task->result, pr->action); + + /* Skip least results */ + if (pr->flags & RSPAMD_PASSTHROUGH_LEAST) { + continue; + } + + /* Skip disabled actions */ + if (act_config && (act_config->flags & RSPAMD_ACTION_RESULT_DISABLED)) { + continue; + } + + /* Immediately stop 
on non least passthrough action */ + return true; + } + } + + return false; +} + +auto symcache_runtime::check_item_deps(struct rspamd_task *task, symcache &cache, cache_item *item, + cache_dynamic_item *dyn_item, bool check_only) -> bool +{ + constexpr const auto max_recursion = 20; + auto log_func = RSPAMD_LOG_FUNC; + + auto inner_functor = [&](int recursion, cache_item *item, cache_dynamic_item *dyn_item, auto rec_functor) -> bool { + if (recursion > max_recursion) { + msg_err_task_lambda("cyclic dependencies: maximum check level %ud exceed when " + "checking dependencies for %s", + max_recursion, item->symbol.c_str()); + + return true; + } + + auto ret = true; + + for (const auto &dep: item->deps) { + if (!dep.item) { + /* Assume invalid deps as done */ + msg_debug_cache_task_lambda("symbol %d(%s) has invalid dependencies on %d(%s)", + item->id, item->symbol.c_str(), dep.id, dep.sym.c_str()); + continue; + } + + auto *dep_dyn_item = get_dynamic_item(dep.item->id); + + if (!dep_dyn_item->finished) { + if (!dep_dyn_item->started) { + /* Not started */ + if (!check_only) { + if (!rec_functor(recursion + 1, + dep.item, + dep_dyn_item, + rec_functor)) { + + ret = false; + msg_debug_cache_task_lambda("delayed dependency %d(%s) for " + "symbol %d(%s)", + dep.id, dep.sym.c_str(), item->id, item->symbol.c_str()); + } + else if (!process_symbol(task, cache, dep.item, dep_dyn_item)) { + /* Now started, but has events pending */ + ret = false; + msg_debug_cache_task_lambda("started check of %d(%s) symbol " + "as dep for " + "%d(%s)", + dep.id, dep.sym.c_str(), item->id, item->symbol.c_str()); + } + else { + msg_debug_cache_task_lambda("dependency %d(%s) for symbol %d(%s) is " + "already processed", + dep.id, dep.sym.c_str(), item->id, item->symbol.c_str()); + } + } + else { + msg_debug_cache_task_lambda("dependency %d(%s) for symbol %d(%s) " + "cannot be started now", + dep.id, dep.sym.c_str(), item->id, item->symbol.c_str()); + ret = false; + } + } + else { + /* Started 
but not finished */ + msg_debug_cache_task_lambda("dependency %d(%s) for symbol %d(%s) is " + "still executing", + dep.id, dep.sym.c_str(), item->id, item->symbol.c_str()); + ret = false; + } + } + else { + msg_debug_cache_task_lambda("dependency %d(%s) for symbol %d(%s) is already " + "checked", + dep.id, dep.sym.c_str(), item->id, item->symbol.c_str()); + } + } + + return ret; + }; + + return inner_functor(0, item, dyn_item, inner_functor); +} + + +struct rspamd_symcache_delayed_cbdata { + cache_item *item; + struct rspamd_task *task; + symcache_runtime *runtime; + struct rspamd_async_event *event; + struct ev_timer tm; +}; + +static void +rspamd_symcache_delayed_item_fin(gpointer ud) +{ + auto *cbd = (struct rspamd_symcache_delayed_cbdata *) ud; + + cbd->event = nullptr; + cbd->runtime->unset_slow(); + ev_timer_stop(cbd->task->event_loop, &cbd->tm); +} + +static void +rspamd_symcache_delayed_item_cb(EV_P_ ev_timer *w, int what) +{ + auto *cbd = (struct rspamd_symcache_delayed_cbdata *) w->data; + + if (cbd->event) { + cbd->event = nullptr; + + /* Timer will be stopped here */ + rspamd_session_remove_event(cbd->task->s, + rspamd_symcache_delayed_item_fin, cbd); + + cbd->runtime->process_item_rdeps(cbd->task, cbd->item); + } +} + +static void +rspamd_delayed_timer_dtor(gpointer d) +{ + auto *cbd = (struct rspamd_symcache_delayed_cbdata *) d; + + if (cbd->event) { + /* Event has not been executed, this will also stop a timer */ + rspamd_session_remove_event(cbd->task->s, + rspamd_symcache_delayed_item_fin, cbd); + cbd->event = nullptr; + } +} + +auto symcache_runtime::finalize_item(struct rspamd_task *task, cache_dynamic_item *dyn_item) -> void +{ + /* Limit to consider a rule as slow (in milliseconds) */ + constexpr const gdouble slow_diff_limit = 300; + auto *item = get_item_by_dynamic_item(dyn_item); + /* Sanity checks */ + g_assert(items_inflight > 0); + g_assert(item != nullptr); + + if (dyn_item->async_events > 0) { + /* + * XXX: Race condition + * + * It is 
possible that some async event is still in flight, but we + * already know its result, however, it is the responsibility of that + * event to decrease async events count and call this function + * one more time + */ + msg_debug_cache_task("postpone finalisation of %s(%d) as there are %d " + "async events pending", + item->symbol.c_str(), item->id, dyn_item->async_events); + + return; + } + + msg_debug_cache_task("process finalize for item %s(%d)", item->symbol.c_str(), item->id); + dyn_item->finished = true; + items_inflight--; + cur_item = nullptr; + + auto enable_slow_timer = [&]() -> bool { + auto *cbd = rspamd_mempool_alloc0_type(task->task_pool, rspamd_symcache_delayed_cbdata); + /* Add timer to allow something else to be executed */ + ev_timer *tm = &cbd->tm; + + cbd->event = rspamd_session_add_event(task->s, + rspamd_symcache_delayed_item_fin, cbd, + "symcache"); + cbd->runtime = this; + + /* + * If no event could be added, then we are already in the destruction + * phase. So the main issue is to deal with has slow here + */ + if (cbd->event) { + ev_timer_init(tm, rspamd_symcache_delayed_item_cb, 0.1, 0.0); + ev_set_priority(tm, EV_MINPRI); + rspamd_mempool_add_destructor(task->task_pool, + rspamd_delayed_timer_dtor, cbd); + + cbd->task = task; + cbd->item = item; + tm->data = cbd; + ev_timer_start(task->event_loop, tm); + } + else { + /* Just reset as no timer is added */ + has_slow = FALSE; + return false; + } + + return true; + }; + + if (profile) { + ev_now_update_if_cheap(task->event_loop); + auto diff = ((ev_now(task->event_loop) - profile_start) * 1e3 - + dyn_item->start_msec); + + if (diff > slow_diff_limit) { + + if (!has_slow) { + has_slow = true; + + msg_info_task("slow rule: %s(%d): %.2f ms; enable slow timer delay", + item->symbol.c_str(), item->id, + diff); + + if (enable_slow_timer()) { + /* Allow network execution */ + return; + } + } + else { + msg_info_task("slow rule: %s(%d): %.2f ms", + item->symbol.c_str(), item->id, + diff); + } + } + + 
if (G_UNLIKELY(RSPAMD_TASK_IS_PROFILING(task))) { + rspamd_task_profile_set(task, item->symbol.c_str(), diff); + } + + if (rspamd_worker_is_scanner(task->worker)) { + rspamd_set_counter(item->cd, diff); + } + } + + process_item_rdeps(task, item); +} + +auto symcache_runtime::process_item_rdeps(struct rspamd_task *task, cache_item *item) -> void +{ + auto *cache_ptr = reinterpret_cast<symcache *>(task->cfg->cache); + + // Avoid race condition with the runtime destruction and the delay timer + if (!order) { + return; + } + + for (const auto &rdep: item->rdeps) { + if (rdep.item) { + auto *dyn_item = get_dynamic_item(rdep.item->id); + if (!dyn_item->started) { + msg_debug_cache_task("check item %d(%s) rdep of %s ", + rdep.item->id, rdep.item->symbol.c_str(), item->symbol.c_str()); + + if (!check_item_deps(task, *cache_ptr, rdep.item, dyn_item, false)) { + msg_debug_cache_task("blocked execution of %d(%s) rdep of %s " + "unless deps are resolved", + rdep.item->id, rdep.item->symbol.c_str(), item->symbol.c_str()); + } + else { + process_symbol(task, *cache_ptr, rdep.item, + dyn_item); + } + } + } + } +} + +auto symcache_runtime::get_item_by_dynamic_item(cache_dynamic_item *dyn_item) const -> cache_item * +{ + auto idx = dyn_item - dynamic_items; + + if (idx >= 0 && idx < order->size()) { + return order->d[idx].get(); + } + + msg_err("internal error: invalid index to get: %d", (int) idx); + + return nullptr; +} + +}// namespace rspamd::symcache diff --git a/src/libserver/symcache/symcache_runtime.hxx b/src/libserver/symcache/symcache_runtime.hxx new file mode 100644 index 0000000..aa8f66c --- /dev/null +++ b/src/libserver/symcache/symcache_runtime.hxx @@ -0,0 +1,209 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Symcache runtime is produced for each task and it consists of symbols + * being executed, being dynamically disabled/enabled and it also captures + * the current order of the symbols (produced by resort periodic) + */ + +#ifndef RSPAMD_SYMCACHE_RUNTIME_HXX +#define RSPAMD_SYMCACHE_RUNTIME_HXX +#pragma once + +#include "symcache_internal.hxx" + +struct rspamd_scan_result; + +namespace rspamd::symcache { +/** + * These items are saved within task structure and are used to track + * symbols execution. + * Each symcache item occupies a single dynamic item, that currently has 8 bytes + * length + */ +struct cache_dynamic_item { + std::uint16_t start_msec; /* Relative to task time */ + bool started; + bool finished; + std::uint32_t async_events; +}; + +static_assert(sizeof(cache_dynamic_item) == sizeof(std::uint64_t)); +static_assert(std::is_trivial_v<cache_dynamic_item>); + +class symcache_runtime { + unsigned items_inflight; + bool profile; + bool has_slow; + + double profile_start; + double lim; + + struct cache_dynamic_item *cur_item; + order_generation_ptr order; + /* Dynamically expanded as needed */ + mutable struct cache_dynamic_item dynamic_items[]; + /* We allocate this structure merely in memory pool, so destructor is absent */ + ~symcache_runtime() = delete; + + auto process_symbol(struct rspamd_task *task, symcache &cache, cache_item *item, + cache_dynamic_item *dyn_item) -> bool; + /* Specific stages of the processing */ + auto process_pre_postfilters(struct rspamd_task *task, symcache &cache, int start_events, unsigned 
int stage) -> bool; + auto process_filters(struct rspamd_task *task, symcache &cache, int start_events) -> bool; + auto check_metric_limit(struct rspamd_task *task) -> bool; + auto check_item_deps(struct rspamd_task *task, symcache &cache, cache_item *item, + cache_dynamic_item *dyn_item, bool check_only) -> bool; + +public: + /* Dropper for a shared ownership */ + auto savepoint_dtor() -> void + { + + /* Drop shared ownership */ + order.reset(); + } + /** + * Creates a cache runtime using task mempool + * @param task + * @param cache + * @return + */ + static auto create(struct rspamd_task *task, symcache &cache) -> symcache_runtime *; + /** + * Process task settings + * @param task + * @return + */ + auto process_settings(struct rspamd_task *task, const symcache &cache) -> bool; + + /** + * Disable all symbols but not touching ones that are in the specific mask + * @param skip_mask + */ + auto disable_all_symbols(int skip_mask) -> void; + + /** + * Disable a symbol (or it's parent) + * @param name + * @return + */ + auto disable_symbol(struct rspamd_task *task, const symcache &cache, std::string_view name) -> bool; + + /** + * Enable a symbol (or it's parent) + * @param name + * @return + */ + auto enable_symbol(struct rspamd_task *task, const symcache &cache, std::string_view name) -> bool; + + /** + * Checks if an item has been checked/disabled + * @param cache + * @param name + * @return + */ + auto is_symbol_checked(const symcache &cache, std::string_view name) -> bool; + + /** + * Checks if a symbol is enabled for execution, checking all pending conditions + * @param task + * @param cache + * @param name + * @return + */ + auto is_symbol_enabled(struct rspamd_task *task, const symcache &cache, std::string_view name) -> bool; + + /** + * Get the current processed item + * @return + */ + auto get_cur_item() const -> auto + { + return cur_item; + } + + /** + * Set the current processed item + * @param item + * @return + */ + auto set_cur_item(cache_dynamic_item 
*item) -> auto + { + std::swap(item, cur_item); + return item; + } + + /** + * Set profile mode for the runtime + * @param enable + * @return + */ + auto set_profile_mode(bool enable) -> auto + { + std::swap(profile, enable); + return enable; + } + + /** + * Returns the dynamic item by static item id + * @param id + * @return + */ + auto get_dynamic_item(int id) const -> cache_dynamic_item *; + + /** + * Returns static cache item by dynamic cache item + * @return + */ + auto get_item_by_dynamic_item(cache_dynamic_item *) const -> cache_item *; + + /** + * Process symbols in the cache + * @param task + * @param cache + * @param stage + * @return + */ + auto process_symbols(struct rspamd_task *task, symcache &cache, unsigned int stage) -> bool; + + /** + * Finalize execution of some item in the cache + * @param task + * @param item + */ + auto finalize_item(struct rspamd_task *task, cache_dynamic_item *item) -> void; + + /** + * Process unblocked reverse dependencies of the specific item + * @param task + * @param item + */ + auto process_item_rdeps(struct rspamd_task *task, cache_item *item) -> void; + + /* XXX: a helper to allow hiding internal implementation of the slow timer structure */ + auto unset_slow() -> void + { + has_slow = false; + } +}; + + +}// namespace rspamd::symcache + +#endif//RSPAMD_SYMCACHE_RUNTIME_HXX diff --git a/src/libserver/task.c b/src/libserver/task.c new file mode 100644 index 0000000..9763d1e --- /dev/null +++ b/src/libserver/task.c @@ -0,0 +1,1975 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "task.h" +#include "rspamd.h" +#include "scan_result.h" +#include "libserver/protocol.h" +#include "libserver/protocol_internal.h" +#include "message.h" +#include "lua/lua_common.h" +#include "email_addr.h" +#include "src/libserver/composites/composites.h" +#include "stat_api.h" +#include "unix-std.h" +#include "utlist.h" +#include "libserver/mempool_vars_internal.h" +#include "libserver/cfg_file_private.h" +#include "libmime/lang_detection.h" +#include "libmime/scan_result_private.h" + +#ifdef WITH_JEMALLOC +#include <jemalloc/jemalloc.h> +#else +#if defined(__GLIBC__) && defined(_GNU_SOURCE) +#include <malloc.h> +#endif +#endif + +#include <math.h> + +#ifdef SYS_ZSTD +#include "zstd.h" +#else +#include "contrib/zstd/zstd.h" +#endif + +__KHASH_IMPL(rspamd_req_headers_hash, static inline, + rspamd_ftok_t *, struct rspamd_request_header_chain *, 1, + rspamd_ftok_icase_hash, rspamd_ftok_icase_equal) + +static GQuark +rspamd_task_quark(void) +{ + return g_quark_from_static_string("task-error"); +} + +/* + * Create new task + */ +struct rspamd_task * +rspamd_task_new(struct rspamd_worker *worker, + struct rspamd_config *cfg, + rspamd_mempool_t *pool, + struct rspamd_lang_detector *lang_det, + struct ev_loop *event_loop, + gboolean debug_mem) +{ + struct rspamd_task *new_task; + rspamd_mempool_t *task_pool; + guint flags = 0; + + if (pool == NULL) { + task_pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), + "task", debug_mem ? 
RSPAMD_MEMPOOL_DEBUG : 0); + flags |= RSPAMD_TASK_FLAG_OWN_POOL; + } + else { + task_pool = pool; + } + + new_task = rspamd_mempool_alloc0(task_pool, sizeof(struct rspamd_task)); + new_task->task_pool = task_pool; + new_task->flags = flags; + new_task->worker = worker; + new_task->lang_det = lang_det; + + if (cfg) { + new_task->cfg = cfg; + REF_RETAIN(cfg); + + if (cfg->check_all_filters) { + new_task->flags |= RSPAMD_TASK_FLAG_PASS_ALL; + } + + + if (cfg->re_cache) { + new_task->re_rt = rspamd_re_cache_runtime_new(cfg->re_cache); + } + + if (new_task->lang_det == NULL && cfg->lang_det != NULL) { + new_task->lang_det = cfg->lang_det; + } + } + + new_task->event_loop = event_loop; + new_task->task_timestamp = ev_time(); + new_task->time_real_finish = NAN; + + new_task->request_headers = kh_init(rspamd_req_headers_hash); + new_task->sock = -1; + new_task->flags |= (RSPAMD_TASK_FLAG_MIME); + /* Default results chain */ + rspamd_create_metric_result(new_task, NULL, -1); + + new_task->queue_id = "undef"; + new_task->messages = ucl_object_typed_new(UCL_OBJECT); + kh_static_init(rspamd_task_lua_cache, &new_task->lua_cache); + + return new_task; +} + + +static void +rspamd_task_reply(struct rspamd_task *task) +{ + const ev_tstamp write_timeout = 5.0; + + if (task->fin_callback) { + task->fin_callback(task, task->fin_arg); + } + else { + if (!(task->processed_stages & RSPAMD_TASK_STAGE_REPLIED)) { + rspamd_protocol_write_reply(task, write_timeout); + } + } +} + +/* + * Called if all filters are processed + * @return TRUE if session should be terminated + */ +gboolean +rspamd_task_fin(void *arg) +{ + struct rspamd_task *task = (struct rspamd_task *) arg; + + /* Task is already finished or skipped */ + if (RSPAMD_TASK_IS_PROCESSED(task)) { + rspamd_task_reply(task); + return TRUE; + } + + if (!rspamd_task_process(task, RSPAMD_TASK_PROCESS_ALL)) { + rspamd_task_reply(task); + return TRUE; + } + + if (RSPAMD_TASK_IS_PROCESSED(task)) { + rspamd_task_reply(task); + return TRUE; + 
} + + /* One more iteration */ + return FALSE; +} + +/* + * Free all structures of worker_task + */ +void rspamd_task_free(struct rspamd_task *task) +{ + struct rspamd_email_address *addr; + static guint free_iters = 0; + guint i; + + if (task) { + debug_task("free pointer %p", task); + + if (task->rcpt_envelope) { + for (i = 0; i < task->rcpt_envelope->len; i++) { + addr = g_ptr_array_index(task->rcpt_envelope, i); + rspamd_email_address_free(addr); + } + + g_ptr_array_free(task->rcpt_envelope, TRUE); + } + + if (task->from_envelope) { + rspamd_email_address_free(task->from_envelope); + } + + if (task->from_envelope_orig) { + rspamd_email_address_free(task->from_envelope_orig); + } + + if (task->meta_words) { + g_array_free(task->meta_words, TRUE); + } + + ucl_object_unref(task->messages); + + if (task->re_rt) { + rspamd_re_cache_runtime_destroy(task->re_rt); + } + + if (task->http_conn != NULL) { + rspamd_http_connection_reset(task->http_conn); + rspamd_http_connection_unref(task->http_conn); + } + + if (task->settings != NULL) { + ucl_object_unref(task->settings); + } + + if (task->settings_elt != NULL) { + REF_RELEASE(task->settings_elt); + } + + if (task->client_addr) { + rspamd_inet_address_free(task->client_addr); + } + + if (task->from_addr) { + rspamd_inet_address_free(task->from_addr); + } + + if (task->err) { + g_error_free(task->err); + } + + ev_timer_stop(task->event_loop, &task->timeout_ev); + ev_io_stop(task->event_loop, &task->guard_ev); + + if (task->sock != -1) { + close(task->sock); + } + + if (task->cfg) { + + + struct rspamd_lua_cached_entry entry; + + kh_foreach_value(&task->lua_cache, entry, { + luaL_unref(task->cfg->lua_state, + LUA_REGISTRYINDEX, entry.ref); + }); + kh_static_destroy(rspamd_task_lua_cache, &task->lua_cache); + + if (task->cfg->full_gc_iters && (++free_iters > task->cfg->full_gc_iters)) { + /* Perform more expensive cleanup cycle */ + gsize allocated = 0, active = 0, metadata = 0, + resident = 0, mapped = 0, old_lua_mem = 0; 
+ gdouble t1, t2; + + old_lua_mem = lua_gc(task->cfg->lua_state, LUA_GCCOUNT, 0); + t1 = rspamd_get_ticks(FALSE); + +#ifdef WITH_JEMALLOC + gsize sz = sizeof(gsize); + mallctl("stats.allocated", &allocated, &sz, NULL, 0); + mallctl("stats.active", &active, &sz, NULL, 0); + mallctl("stats.metadata", &metadata, &sz, NULL, 0); + mallctl("stats.resident", &resident, &sz, NULL, 0); + mallctl("stats.mapped", &mapped, &sz, NULL, 0); +#else +#if defined(__GLIBC__) && defined(_GNU_SOURCE) + malloc_trim(0); +#endif +#endif + lua_gc(task->cfg->lua_state, LUA_GCCOLLECT, 0); + t2 = rspamd_get_ticks(FALSE); + + msg_notice_task("perform full gc cycle; memory stats: " + "%Hz allocated, %Hz active, %Hz metadata, %Hz resident, %Hz mapped;" + " lua memory: %z kb -> %d kb; %f ms for gc iter", + allocated, active, metadata, resident, mapped, + old_lua_mem, lua_gc(task->cfg->lua_state, LUA_GCCOUNT, 0), + (t2 - t1) * 1000.0); + free_iters = rspamd_time_jitter(0, + (gdouble) task->cfg->full_gc_iters / 2); + } + + REF_RELEASE(task->cfg); + } + + kh_destroy(rspamd_req_headers_hash, task->request_headers); + rspamd_message_unref(task->message); + + if (task->flags & RSPAMD_TASK_FLAG_OWN_POOL) { + rspamd_mempool_destructors_enforce(task->task_pool); + + if (task->symcache_runtime) { + rspamd_symcache_runtime_destroy(task); + } + + rspamd_mempool_delete(task->task_pool); + } + else if (task->symcache_runtime) { + rspamd_symcache_runtime_destroy(task); + } + } +} + +struct rspamd_task_map { + gpointer begin; + gulong len; + gint fd; +}; + +static void +rspamd_task_unmapper(gpointer ud) +{ + struct rspamd_task_map *m = ud; + + munmap(m->begin, m->len); + close(m->fd); +} + +gboolean +rspamd_task_load_message(struct rspamd_task *task, + struct rspamd_http_message *msg, const gchar *start, gsize len) +{ + guint control_len, r; + struct ucl_parser *parser; + ucl_object_t *control_obj; + gchar filepath[PATH_MAX], *fp; + gint fd, flen; + gulong offset = 0, shmem_size = 0; + rspamd_ftok_t *tok; + 
gpointer map; + struct stat st; + struct rspamd_task_map *m; + const gchar *ft; + +#ifdef HAVE_SANE_SHMEM + ft = "shm"; +#else + ft = "file"; +#endif + + if (msg) { + rspamd_protocol_handle_headers(task, msg); + } + + tok = rspamd_task_get_request_header(task, "shm"); + + if (tok) { + /* Shared memory part */ + r = rspamd_strlcpy(filepath, tok->begin, + MIN(sizeof(filepath), tok->len + 1)); + + rspamd_url_decode(filepath, filepath, r + 1); + flen = strlen(filepath); + + if (filepath[0] == '"' && flen > 2) { + /* We need to unquote filepath */ + fp = &filepath[1]; + fp[flen - 2] = '\0'; + } + else { + fp = &filepath[0]; + } +#ifdef HAVE_SANE_SHMEM + fd = shm_open(fp, O_RDONLY, 00600); +#else + fd = open(fp, O_RDONLY, 00600); +#endif + if (fd == -1) { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Cannot open %s segment (%s): %s", ft, fp, strerror(errno)); + return FALSE; + } + + if (fstat(fd, &st) == -1) { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Cannot stat %s segment (%s): %s", ft, fp, strerror(errno)); + close(fd); + + return FALSE; + } + + map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + + if (map == MAP_FAILED) { + close(fd); + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Cannot mmap %s (%s): %s", ft, fp, strerror(errno)); + return FALSE; + } + + tok = rspamd_task_get_request_header(task, "shm-offset"); + + if (tok) { + rspamd_strtoul(tok->begin, tok->len, &offset); + + if (offset > (gulong) st.st_size) { + msg_err_task("invalid offset %ul (%ul available) for shm " + "segment %s", + offset, (gulong) st.st_size, fp); + munmap(map, st.st_size); + close(fd); + + return FALSE; + } + } + + tok = rspamd_task_get_request_header(task, "shm-length"); + shmem_size = st.st_size; + + + if (tok) { + rspamd_strtoul(tok->begin, tok->len, &shmem_size); + + if (shmem_size > (gulong) st.st_size) { + msg_err_task("invalid length %ul (%ul available) for %s " + "segment %s", + 
shmem_size, (gulong) st.st_size, ft, fp); + munmap(map, st.st_size); + close(fd); + + return FALSE; + } + } + + task->msg.begin = ((guchar *) map) + offset; + task->msg.len = shmem_size; + m = rspamd_mempool_alloc(task->task_pool, sizeof(*m)); + m->begin = map; + m->len = st.st_size; + m->fd = fd; + + msg_info_task("loaded message from shared memory %s (%ul size, %ul offset), fd=%d", + fp, shmem_size, offset, fd); + + rspamd_mempool_add_destructor(task->task_pool, rspamd_task_unmapper, m); + + return TRUE; + } + + tok = rspamd_task_get_request_header(task, "file"); + + if (tok == NULL) { + tok = rspamd_task_get_request_header(task, "path"); + } + + if (tok) { + debug_task("want to scan file %T", tok); + + r = rspamd_strlcpy(filepath, tok->begin, + MIN(sizeof(filepath), tok->len + 1)); + + rspamd_url_decode(filepath, filepath, r + 1); + flen = strlen(filepath); + + if (filepath[0] == '"' && flen > 2) { + /* We need to unquote filepath */ + fp = &filepath[1]; + fp[flen - 2] = '\0'; + } + else { + fp = &filepath[0]; + } + + if (stat(fp, &st) == -1) { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Invalid file (%s): %s", fp, strerror(errno)); + return FALSE; + } + + if (G_UNLIKELY(st.st_size == 0)) { + /* Empty file */ + task->flags |= RSPAMD_TASK_FLAG_EMPTY; + task->msg.begin = rspamd_mempool_strdup(task->task_pool, ""); + task->msg.len = 0; + } + else { + fd = open(fp, O_RDONLY); + + if (fd == -1) { + g_set_error(&task->err, rspamd_task_quark(), + RSPAMD_PROTOCOL_ERROR, + "Cannot open file (%s): %s", fp, strerror(errno)); + return FALSE; + } + + map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + + + if (map == MAP_FAILED) { + close(fd); + g_set_error(&task->err, rspamd_task_quark(), + RSPAMD_PROTOCOL_ERROR, + "Cannot mmap file (%s): %s", fp, strerror(errno)); + return FALSE; + } + + task->msg.begin = map; + task->msg.len = st.st_size; + m = rspamd_mempool_alloc(task->task_pool, sizeof(*m)); + m->begin = map; + m->len = st.st_size; 
+ m->fd = fd; + + rspamd_mempool_add_destructor(task->task_pool, rspamd_task_unmapper, m); + } + + task->msg.fpath = rspamd_mempool_strdup(task->task_pool, fp); + task->flags |= RSPAMD_TASK_FLAG_FILE; + + msg_info_task("loaded message from file %s", fp); + + return TRUE; + } + + /* Plain data */ + debug_task("got input of length %z", task->msg.len); + + /* Check compression */ + tok = rspamd_task_get_request_header(task, "compression"); + + if (tok) { + /* Need to uncompress */ + rspamd_ftok_t t; + + t.begin = "zstd"; + t.len = 4; + + if (rspamd_ftok_casecmp(tok, &t) == 0) { + ZSTD_DStream *zstream; + ZSTD_inBuffer zin; + ZSTD_outBuffer zout; + guchar *out; + gsize outlen, r; + gulong dict_id; + + if (!rspamd_libs_reset_decompression(task->cfg->libs_ctx)) { + g_set_error(&task->err, rspamd_task_quark(), + RSPAMD_PROTOCOL_ERROR, + "Cannot decompress, decompressor init failed"); + + return FALSE; + } + + tok = rspamd_task_get_request_header(task, "dictionary"); + + if (tok != NULL) { + /* We need to use custom dictionary */ + if (!rspamd_strtoul(tok->begin, tok->len, &dict_id)) { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Non numeric dictionary"); + + return FALSE; + } + + if (!task->cfg->libs_ctx->in_dict) { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Unknown dictionary, undefined locally"); + + return FALSE; + } + + if (task->cfg->libs_ctx->in_dict->id != dict_id) { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Unknown dictionary, invalid dictionary id"); + + return FALSE; + } + } + + zstream = task->cfg->libs_ctx->in_zstream; + + zin.pos = 0; + zin.src = start; + zin.size = len; + + if ((outlen = ZSTD_getDecompressedSize(start, len)) == 0) { + outlen = ZSTD_DStreamOutSize(); + } + + out = g_malloc(outlen); + zout.dst = out; + zout.pos = 0; + zout.size = outlen; + + while (zin.pos < zin.size) { + r = ZSTD_decompressStream(zstream, &zout, &zin); + + if (ZSTD_isError(r)) { + 
g_set_error(&task->err, rspamd_task_quark(), + RSPAMD_PROTOCOL_ERROR, + "Decompression error: %s", ZSTD_getErrorName(r)); + + return FALSE; + } + + if (zout.pos == zout.size) { + /* We need to extend output buffer */ + zout.size = zout.size * 2 + 1; + zout.dst = g_realloc(zout.dst, zout.size); + } + } + + rspamd_mempool_add_destructor(task->task_pool, g_free, zout.dst); + task->msg.begin = zout.dst; + task->msg.len = zout.pos; + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_COMPRESSED; + + msg_info_task("loaded message from zstd compressed stream; " + "compressed: %ul; uncompressed: %ul", + (gulong) zin.size, (gulong) zout.pos); + } + else { + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Invalid compression method"); + return FALSE; + } + } + else { + task->msg.begin = start; + task->msg.len = len; + } + + if (task->msg.len == 0) { + task->flags |= RSPAMD_TASK_FLAG_EMPTY; + } + + if (task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_HAS_CONTROL) { + rspamd_ftok_t *hv = rspamd_task_get_request_header(task, MLEN_HEADER); + gulong message_len = 0; + + if (!hv || !rspamd_strtoul(hv->begin, hv->len, &message_len) || + task->msg.len < message_len) { + msg_warn_task("message has invalid message length: %ul and total len: %ul", + message_len, task->msg.len); + g_set_error(&task->err, rspamd_task_quark(), RSPAMD_PROTOCOL_ERROR, + "Invalid length"); + return FALSE; + } + + control_len = task->msg.len - message_len; + + if (control_len > 0) { + parser = ucl_parser_new(UCL_PARSER_KEY_LOWERCASE); + + if (!ucl_parser_add_chunk(parser, task->msg.begin, control_len)) { + msg_warn_task("processing of control chunk failed: %s", + ucl_parser_get_error(parser)); + ucl_parser_free(parser); + } + else { + control_obj = ucl_parser_get_object(parser); + ucl_parser_free(parser); + rspamd_protocol_handle_control(task, control_obj); + ucl_object_unref(control_obj); + } + + task->msg.begin += control_len; + task->msg.len -= control_len; + } + } + + return TRUE; +} 
+ +static guint +rspamd_task_select_processing_stage(struct rspamd_task *task, guint stages) +{ + guint st, mask; + + mask = task->processed_stages; + + if (mask == 0) { + st = 0; + } + else { + for (st = 1; mask != 1; st++) { + mask = mask >> 1u; + } + } + + st = 1 << st; + + if (stages & st) { + return st; + } + else if (st < RSPAMD_TASK_STAGE_DONE) { + /* We assume that the stage that was not requested is done */ + task->processed_stages |= st; + return rspamd_task_select_processing_stage(task, stages); + } + + /* We are done */ + return RSPAMD_TASK_STAGE_DONE; +} + +gboolean +rspamd_task_process(struct rspamd_task *task, guint stages) +{ + guint st; + gboolean ret = TRUE, all_done = TRUE; + GError *stat_error = NULL; + + /* Avoid nested calls */ + if (task->flags & RSPAMD_TASK_FLAG_PROCESSING) { + return TRUE; + } + + if (RSPAMD_TASK_IS_PROCESSED(task)) { + return TRUE; + } + + task->flags |= RSPAMD_TASK_FLAG_PROCESSING; + + st = rspamd_task_select_processing_stage(task, stages); + + switch (st) { + case RSPAMD_TASK_STAGE_CONNFILTERS: + all_done = rspamd_symcache_process_symbols(task, task->cfg->cache, st); + break; + + case RSPAMD_TASK_STAGE_READ_MESSAGE: + if (!rspamd_message_parse(task)) { + ret = FALSE; + } + break; + + case RSPAMD_TASK_STAGE_PROCESS_MESSAGE: + if (!(task->flags & RSPAMD_TASK_FLAG_SKIP_PROCESS)) { + rspamd_message_process(task); + } + break; + + case RSPAMD_TASK_STAGE_PRE_FILTERS: + case RSPAMD_TASK_STAGE_FILTERS: + all_done = rspamd_symcache_process_symbols(task, task->cfg->cache, st); + break; + + case RSPAMD_TASK_STAGE_CLASSIFIERS: + case RSPAMD_TASK_STAGE_CLASSIFIERS_PRE: + case RSPAMD_TASK_STAGE_CLASSIFIERS_POST: + if (!RSPAMD_TASK_IS_EMPTY(task)) { + if (rspamd_stat_classify(task, task->cfg->lua_state, st, &stat_error) == + RSPAMD_STAT_PROCESS_ERROR) { + msg_err_task("classify error: %e", stat_error); + g_error_free(stat_error); + } + } + break; + + case RSPAMD_TASK_STAGE_COMPOSITES: + rspamd_composites_process_task(task); + 
task->result->nresults_postfilters = task->result->nresults; + break; + + case RSPAMD_TASK_STAGE_POST_FILTERS: + all_done = rspamd_symcache_process_symbols(task, task->cfg->cache, + st); + + if (all_done && (task->flags & RSPAMD_TASK_FLAG_LEARN_AUTO) && + !RSPAMD_TASK_IS_EMPTY(task) && + !(task->flags & (RSPAMD_TASK_FLAG_LEARN_SPAM | RSPAMD_TASK_FLAG_LEARN_HAM))) { + rspamd_stat_check_autolearn(task); + } + break; + + case RSPAMD_TASK_STAGE_LEARN: + case RSPAMD_TASK_STAGE_LEARN_PRE: + case RSPAMD_TASK_STAGE_LEARN_POST: + if (task->flags & (RSPAMD_TASK_FLAG_LEARN_SPAM | RSPAMD_TASK_FLAG_LEARN_HAM)) { + if (task->err == NULL) { + if (!rspamd_stat_learn(task, + task->flags & RSPAMD_TASK_FLAG_LEARN_SPAM, + task->cfg->lua_state, task->classifier, + st, &stat_error)) { + + if (stat_error == NULL) { + g_set_error(&stat_error, + g_quark_from_static_string("stat"), 500, + "Unknown statistics error, found on stage %s;" + " classifier: %s", + rspamd_task_stage_name(st), task->classifier); + } + + if (stat_error->code >= 400) { + msg_err_task("learn error: %e", stat_error); + } + else { + msg_notice_task("skip learning: %e", stat_error); + } + + if (!(task->flags & RSPAMD_TASK_FLAG_LEARN_AUTO)) { + task->err = stat_error; + task->processed_stages |= RSPAMD_TASK_STAGE_DONE; + } + else { + /* Do not skip idempotent in case of learn error */ + if (stat_error) { + g_error_free(stat_error); + } + + task->processed_stages |= RSPAMD_TASK_STAGE_LEARN | + RSPAMD_TASK_STAGE_LEARN_PRE | + RSPAMD_TASK_STAGE_LEARN_POST; + } + } + } + } + break; + case RSPAMD_TASK_STAGE_COMPOSITES_POST: + /* Second run of composites processing before idempotent filters (if needed) */ + if (task->result->nresults_postfilters != task->result->nresults) { + rspamd_composites_process_task(task); + } + else { + msg_debug_task("skip second run of composites as the result has not been changed"); + } + break; + + case RSPAMD_TASK_STAGE_IDEMPOTENT: + /* Stop task timeout */ + if (ev_can_stop(&task->timeout_ev)) { + 
ev_timer_stop(task->event_loop, &task->timeout_ev); + } + + all_done = rspamd_symcache_process_symbols(task, task->cfg->cache, st); + break; + + case RSPAMD_TASK_STAGE_DONE: + task->processed_stages |= RSPAMD_TASK_STAGE_DONE; + break; + + default: + /* TODO: not implemented stage */ + break; + } + + if (RSPAMD_TASK_IS_SKIPPED(task)) { + /* Set all bits except idempotent filters */ + task->processed_stages |= 0x7FFF; + } + + task->flags &= ~RSPAMD_TASK_FLAG_PROCESSING; + + if (!ret || RSPAMD_TASK_IS_PROCESSED(task)) { + if (!ret) { + /* Set processed flags */ + task->processed_stages |= RSPAMD_TASK_STAGE_DONE; + } + + msg_debug_task("task is processed"); + + return ret; + } + + if (ret) { + if (rspamd_session_events_pending(task->s) != 0) { + /* We have events pending, so we consider this stage as incomplete */ + msg_debug_task("need more work on stage %d", st); + } + else { + if (all_done) { + /* Mark the current stage as done and go to the next stage */ + msg_debug_task("completed stage %d", st); + task->processed_stages |= st; + } + else { + msg_debug_task("need more processing on stage %d", st); + } + + /* Tail recursion */ + return rspamd_task_process(task, stages); + } + } + + return ret; +} + +struct rspamd_email_address * +rspamd_task_get_sender(struct rspamd_task *task) +{ + return task->from_envelope; +} + +static const gchar * +rspamd_task_cache_principal_recipient(struct rspamd_task *task, + const gchar *rcpt, gsize len) +{ + gchar *rcpt_lc; + + if (rcpt == NULL) { + return NULL; + } + + rcpt_lc = rspamd_mempool_alloc(task->task_pool, len + 1); + rspamd_strlcpy(rcpt_lc, rcpt, len + 1); + rspamd_str_lc(rcpt_lc, len); + + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_PRINCIPAL_RECIPIENT, rcpt_lc, NULL); + + return rcpt_lc; +} + +const gchar * +rspamd_task_get_principal_recipient(struct rspamd_task *task) +{ + const gchar *val; + struct rspamd_email_address *addr; + guint i; + + val = rspamd_mempool_get_variable(task->task_pool, + 
RSPAMD_MEMPOOL_PRINCIPAL_RECIPIENT); + + if (val) { + return val; + } + + if (task->deliver_to) { + return rspamd_task_cache_principal_recipient(task, task->deliver_to, + strlen(task->deliver_to)); + } + if (task->rcpt_envelope != NULL) { + + PTR_ARRAY_FOREACH(task->rcpt_envelope, i, addr) + { + if (addr->addr && !(addr->flags & RSPAMD_EMAIL_ADDR_ORIGINAL)) { + return rspamd_task_cache_principal_recipient(task, addr->addr, + addr->addr_len); + } + } + } + + GPtrArray *rcpt_mime = MESSAGE_FIELD_CHECK(task, rcpt_mime); + if (rcpt_mime != NULL && rcpt_mime->len > 0) { + PTR_ARRAY_FOREACH(rcpt_mime, i, addr) + { + if (addr->addr && !(addr->flags & RSPAMD_EMAIL_ADDR_ORIGINAL)) { + return rspamd_task_cache_principal_recipient(task, addr->addr, + addr->addr_len); + } + } + } + + return NULL; +} + +gboolean +rspamd_learn_task_spam(struct rspamd_task *task, + gboolean is_spam, + const gchar *classifier, + GError **err) +{ + if (is_spam) { + task->flags |= RSPAMD_TASK_FLAG_LEARN_SPAM; + } + else { + task->flags |= RSPAMD_TASK_FLAG_LEARN_HAM; + } + + task->classifier = classifier; + + return TRUE; +} + +static gboolean +rspamd_task_log_check_condition(struct rspamd_task *task, + struct rspamd_log_format *lf) +{ + gboolean ret = FALSE; + + switch (lf->type) { + case RSPAMD_LOG_MID: + if (MESSAGE_FIELD_CHECK(task, message_id) && + strcmp(MESSAGE_FIELD(task, message_id), "undef") != 0) { + ret = TRUE; + } + break; + case RSPAMD_LOG_QID: + if (task->queue_id && strcmp(task->queue_id, "undef") != 0) { + ret = TRUE; + } + break; + case RSPAMD_LOG_USER: + if (task->auth_user) { + ret = TRUE; + } + break; + case RSPAMD_LOG_IP: + if (task->from_addr && rspamd_ip_is_valid(task->from_addr)) { + ret = TRUE; + } + break; + case RSPAMD_LOG_SMTP_RCPT: + case RSPAMD_LOG_SMTP_RCPTS: + if (task->rcpt_envelope && task->rcpt_envelope->len > 0) { + ret = TRUE; + } + break; + case RSPAMD_LOG_MIME_RCPT: + case RSPAMD_LOG_MIME_RCPTS: + if (MESSAGE_FIELD_CHECK(task, rcpt_mime) && + 
MESSAGE_FIELD(task, rcpt_mime)->len > 0) { + ret = TRUE; + } + break; + case RSPAMD_LOG_SMTP_FROM: + if (task->from_envelope) { + ret = TRUE; + } + break; + case RSPAMD_LOG_MIME_FROM: + if (MESSAGE_FIELD_CHECK(task, from_mime) && + MESSAGE_FIELD(task, from_mime)->len > 0) { + ret = TRUE; + } + break; + case RSPAMD_LOG_FILENAME: + if (task->msg.fpath) { + ret = TRUE; + } + break; + case RSPAMD_LOG_FORCED_ACTION: + if (task->result->passthrough_result) { + ret = TRUE; + } + break; + case RSPAMD_LOG_SETTINGS_ID: + if (task->settings_elt) { + ret = TRUE; + } + break; + default: + ret = TRUE; + break; + } + + return ret; +} + +/* + * Sort by symbol's score -> name + */ +static gint +rspamd_task_compare_log_sym(gconstpointer a, gconstpointer b) +{ + const struct rspamd_symbol_result *s1 = *(const struct rspamd_symbol_result **) a, + *s2 = *(const struct rspamd_symbol_result **) b; + gdouble w1, w2; + + + w1 = fabs(s1->score); + w2 = fabs(s2->score); + + if (w1 == w2 && s1->name && s2->name) { + return strcmp(s1->name, s2->name); + } + + return (w2 - w1) * 1000.0; +} + +static gint +rspamd_task_compare_log_group(gconstpointer a, gconstpointer b) +{ + const struct rspamd_symbols_group *s1 = *(const struct rspamd_symbols_group **) a, + *s2 = *(const struct rspamd_symbols_group **) b; + + return strcmp(s1->name, s2->name); +} + + +static rspamd_ftok_t +rspamd_task_log_metric_res(struct rspamd_task *task, + struct rspamd_log_format *lf) +{ + static gchar scorebuf[32]; + rspamd_ftok_t res = {.begin = NULL, .len = 0}; + struct rspamd_scan_result *mres; + gboolean first = TRUE; + rspamd_fstring_t *symbuf; + struct rspamd_symbol_result *sym; + GPtrArray *sorted_symbols; + struct rspamd_action *act; + struct rspamd_symbols_group *gr; + guint i, j; + khiter_t k; + guint max_log_elts = task->cfg->log_task_max_elts; + + mres = task->result; + act = rspamd_check_action_metric(task, NULL, NULL); + + if (mres != NULL) { + switch (lf->type) { + case RSPAMD_LOG_ISSPAM: + if 
(RSPAMD_TASK_IS_SKIPPED(task)) { + res.begin = "S"; + } + else if (!(act->flags & RSPAMD_ACTION_HAM)) { + res.begin = "T"; + } + else { + res.begin = "F"; + } + + res.len = 1; + break; + case RSPAMD_LOG_ACTION: + res.begin = act->name; + res.len = strlen(res.begin); + break; + case RSPAMD_LOG_SCORES: + res.len = rspamd_snprintf(scorebuf, sizeof(scorebuf), "%.2f/%.2f", + mres->score, rspamd_task_get_required_score(task, mres)); + res.begin = scorebuf; + break; + case RSPAMD_LOG_SYMBOLS: + symbuf = rspamd_fstring_sized_new(128); + sorted_symbols = g_ptr_array_sized_new(kh_size(mres->symbols)); + + kh_foreach_value(mres->symbols, sym, { + if (!(sym->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) { + g_ptr_array_add(sorted_symbols, (gpointer) sym); + } + }); + + g_ptr_array_sort(sorted_symbols, rspamd_task_compare_log_sym); + + for (i = 0; i < sorted_symbols->len; i++) { + sym = g_ptr_array_index(sorted_symbols, i); + + if (first) { + rspamd_printf_fstring(&symbuf, "%s", sym->name); + } + else { + rspamd_printf_fstring(&symbuf, ",%s", sym->name); + } + + if (lf->flags & RSPAMD_LOG_FMT_FLAG_SYMBOLS_SCORES) { + rspamd_printf_fstring(&symbuf, "(%.2f)", sym->score); + } + + if (lf->flags & RSPAMD_LOG_FMT_FLAG_SYMBOLS_PARAMS) { + rspamd_printf_fstring(&symbuf, "{"); + + if (sym->options) { + struct rspamd_symbol_option *opt; + + j = 0; + + DL_FOREACH(sym->opts_head, opt) + { + rspamd_printf_fstring(&symbuf, "%*s;", + (gint) opt->optlen, opt->option); + + if (j >= max_log_elts && opt->next) { + rspamd_printf_fstring(&symbuf, "...;"); + break; + } + + j++; + } + } + + rspamd_printf_fstring(&symbuf, "}"); + } + + first = FALSE; + } + + g_ptr_array_free(sorted_symbols, TRUE); + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, + symbuf); + rspamd_mempool_notify_alloc(task->task_pool, symbuf->len); + res.begin = symbuf->str; + res.len = symbuf->len; + break; + + case RSPAMD_LOG_GROUPS: + case RSPAMD_LOG_PUBLIC_GROUPS: + + symbuf = 
rspamd_fstring_sized_new(128); + sorted_symbols = g_ptr_array_sized_new(kh_size(mres->sym_groups)); + + kh_foreach_key(mres->sym_groups, gr, { + if (!(gr->flags & RSPAMD_SYMBOL_GROUP_PUBLIC)) { + if (lf->type == RSPAMD_LOG_PUBLIC_GROUPS) { + continue; + } + } + + g_ptr_array_add(sorted_symbols, gr); + }); + + g_ptr_array_sort(sorted_symbols, rspamd_task_compare_log_group); + + for (i = 0; i < sorted_symbols->len; i++) { + gr = g_ptr_array_index(sorted_symbols, i); + + if (first) { + rspamd_printf_fstring(&symbuf, "%s", gr->name); + } + else { + rspamd_printf_fstring(&symbuf, ",%s", gr->name); + } + + k = kh_get(rspamd_symbols_group_hash, mres->sym_groups, gr); + + rspamd_printf_fstring(&symbuf, "(%.2f)", + kh_value(mres->sym_groups, k)); + + first = FALSE; + } + + g_ptr_array_free(sorted_symbols, TRUE); + + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) rspamd_fstring_free, + symbuf); + rspamd_mempool_notify_alloc(task->task_pool, symbuf->len); + res.begin = symbuf->str; + res.len = symbuf->len; + break; + default: + break; + } + } + + return res; +} + +static rspamd_fstring_t * +rspamd_task_log_write_var(struct rspamd_task *task, rspamd_fstring_t *logbuf, + const rspamd_ftok_t *var, const rspamd_ftok_t *content) +{ + rspamd_fstring_t *res = logbuf; + const gchar *p, *c, *end; + + if (content == NULL) { + /* Just output variable */ + res = rspamd_fstring_append(res, var->begin, var->len); + } + else { + /* Replace $ with variable value */ + p = content->begin; + c = p; + end = p + content->len; + + while (p < end) { + if (*p == '$') { + if (p > c) { + res = rspamd_fstring_append(res, c, p - c); + } + + res = rspamd_fstring_append(res, var->begin, var->len); + p++; + c = p; + } + else { + p++; + } + } + + if (p > c) { + res = rspamd_fstring_append(res, c, p - c); + } + } + + return res; +} + +static rspamd_fstring_t * +rspamd_task_write_ialist(struct rspamd_task *task, + GPtrArray *addrs, gint lim, + struct rspamd_log_format *lf, + 
rspamd_fstring_t *logbuf) +{ + rspamd_fstring_t *res = logbuf, *varbuf; + rspamd_ftok_t var = {.begin = NULL, .len = 0}; + struct rspamd_email_address *addr; + gint i, nchars = 0, wr = 0, cur_chars; + gboolean has_orig = FALSE; + guint max_log_elts = task->cfg->log_task_max_elts; + + if (addrs && lim <= 0) { + lim = addrs->len; + } + + PTR_ARRAY_FOREACH(addrs, i, addr) + { + if (addr->flags & RSPAMD_EMAIL_ADDR_ORIGINAL) { + has_orig = TRUE; + break; + } + } + + varbuf = rspamd_fstring_new(); + + PTR_ARRAY_FOREACH(addrs, i, addr) + { + if (wr >= lim) { + break; + } + + if (has_orig) { + /* Report merely original addresses */ + if (!(addr->flags & RSPAMD_EMAIL_ADDR_ORIGINAL)) { + continue; + } + } + + bool last = i == lim - 1; + + cur_chars = addr->addr_len; + varbuf = rspamd_fstring_append(varbuf, addr->addr, + cur_chars); + nchars += cur_chars; + wr++; + + if (varbuf->len > 0 && !last) { + varbuf = rspamd_fstring_append(varbuf, ",", 1); + } + + if (!last && (wr >= max_log_elts || nchars >= max_log_elts * 16)) { + varbuf = rspamd_fstring_append(varbuf, "...", 3); + break; + } + } + + if (varbuf->len > 0) { + var.begin = varbuf->str; + var.len = varbuf->len; + res = rspamd_task_log_write_var(task, logbuf, + &var, (const rspamd_ftok_t *) lf->data); + } + + rspamd_fstring_free(varbuf); + + return res; +} + +/* + * Joins up to `lim` entries of `addrs` with commas (all entries when lim <= 0) + * and appends the result to `logbuf` via rspamd_task_log_write_var(); the list + * is cut with "..." once task->cfg->log_task_max_elts entries have been passed. + * `lim` is clamped to addrs->len so an empty or short array is never indexed + * out of bounds. Returns the (possibly reallocated) log buffer. + */ +static rspamd_fstring_t * +rspamd_task_write_addr_list(struct rspamd_task *task, + GPtrArray *addrs, gint lim, + struct rspamd_log_format *lf, + rspamd_fstring_t *logbuf) +{ + rspamd_fstring_t *res = logbuf, *varbuf; + rspamd_ftok_t var = {.begin = NULL, .len = 0}; + struct rspamd_email_address *addr; + guint max_log_elts = task->cfg->log_task_max_elts; + guint i; + + /* Callers may ask for one address while the array is empty: clamp */ + if (lim <= 0 || (guint) lim > addrs->len) { + lim = addrs->len; + } + + varbuf = rspamd_fstring_new(); + + for (i = 0; i < lim; i++) { + addr = g_ptr_array_index(addrs, i); + bool last = i == lim - 1; + + if (addr->addr) { + varbuf = rspamd_fstring_append(varbuf, addr->addr, addr->addr_len); + } + + if (varbuf->len > 0 && !last)
{ + varbuf = rspamd_fstring_append(varbuf, ",", 1); + } + + if (!last && i >= max_log_elts) { + varbuf = rspamd_fstring_append(varbuf, "...", 3); + break; + } + } + + if (varbuf->len > 0) { + var.begin = varbuf->str; + var.len = varbuf->len; + res = rspamd_task_log_write_var(task, logbuf, + &var, (const rspamd_ftok_t *) lf->data); + } + + rspamd_fstring_free(varbuf); + + return res; +} + +static rspamd_fstring_t * +rspamd_task_log_variable(struct rspamd_task *task, + struct rspamd_log_format *lf, rspamd_fstring_t *logbuf) +{ + rspamd_fstring_t *res = logbuf; + rspamd_ftok_t var = {.begin = NULL, .len = 0}; + static gchar numbuf[128]; + static const gchar undef[] = "undef"; + + switch (lf->type) { + /* String vars */ + case RSPAMD_LOG_MID: + if (MESSAGE_FIELD_CHECK(task, message_id)) { + var.begin = MESSAGE_FIELD(task, message_id); + var.len = strlen(var.begin); + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_QID: + if (task->queue_id) { + var.begin = task->queue_id; + var.len = strlen(var.begin); + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_USER: + if (task->auth_user) { + var.begin = task->auth_user; + var.len = strlen(var.begin); + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_IP: + if (task->from_addr && rspamd_ip_is_valid(task->from_addr)) { + var.begin = rspamd_inet_address_to_string(task->from_addr); + var.len = strlen(var.begin); + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + /* Numeric vars */ + case RSPAMD_LOG_LEN: + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), "%uz", + task->msg.len); + var.begin = numbuf; + break; + case RSPAMD_LOG_DNS_REQ: + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), "%uD", + task->dns_requests); + var.begin = numbuf; + break; + case RSPAMD_LOG_TIME_REAL: + case RSPAMD_LOG_TIME_VIRTUAL: + var.begin = rspamd_log_check_time(task->task_timestamp, +
task->time_real_finish, + task->cfg->clock_res); + var.len = strlen(var.begin); + break; + /* InternetAddress vars */ + case RSPAMD_LOG_SMTP_FROM: + if (task->from_envelope) { + var.begin = task->from_envelope->addr; + var.len = task->from_envelope->addr_len; + } + break; + case RSPAMD_LOG_MIME_FROM: + if (MESSAGE_FIELD_CHECK(task, from_mime)) { + return rspamd_task_write_ialist(task, + MESSAGE_FIELD(task, from_mime), + 1, + lf, + logbuf); + } + break; + case RSPAMD_LOG_SMTP_RCPT: + if (task->rcpt_envelope) { + return rspamd_task_write_addr_list(task, task->rcpt_envelope, 1, lf, + logbuf); + } + break; + case RSPAMD_LOG_MIME_RCPT: + if (MESSAGE_FIELD_CHECK(task, rcpt_mime)) { + return rspamd_task_write_ialist(task, + MESSAGE_FIELD(task, rcpt_mime), + 1, + lf, + logbuf); + } + break; + case RSPAMD_LOG_SMTP_RCPTS: + if (task->rcpt_envelope) { + return rspamd_task_write_addr_list(task, task->rcpt_envelope, -1, lf, + logbuf); + } + break; + case RSPAMD_LOG_MIME_RCPTS: + if (MESSAGE_FIELD_CHECK(task, rcpt_mime)) { + return rspamd_task_write_ialist(task, + MESSAGE_FIELD(task, rcpt_mime), + -1, /* All addresses */ + lf, + logbuf); + } + break; + case RSPAMD_LOG_DIGEST: + if (task->message) { + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), "%*xs", + (gint) sizeof(MESSAGE_FIELD(task, digest)), + MESSAGE_FIELD(task, digest)); + var.begin = numbuf; + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_FILENAME: + if (task->msg.fpath) { + var.len = strlen(task->msg.fpath); + var.begin = task->msg.fpath; + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_FORCED_ACTION: + if (task->result->passthrough_result) { + struct rspamd_passthrough_result *pr = task->result->passthrough_result; + + if (!isnan(pr->target_score)) { + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), + "%s \"%s\"; score=%.2f (set by %s)", + pr->action->name, + pr->message, + pr->target_score, + pr->module); + } + else 
{ + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), + "%s \"%s\"; score=nan (set by %s)", + pr->action->name, + pr->message, + pr->module); + } + var.begin = numbuf; + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_SETTINGS_ID: + if (task->settings_elt) { + var.begin = task->settings_elt->name; + var.len = strlen(task->settings_elt->name); + } + else { + var.begin = undef; + var.len = sizeof(undef) - 1; + } + break; + case RSPAMD_LOG_MEMPOOL_SIZE: + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), + "%Hz", + rspamd_mempool_get_used_size(task->task_pool)); + var.begin = numbuf; + break; + case RSPAMD_LOG_MEMPOOL_WASTE: + var.len = rspamd_snprintf(numbuf, sizeof(numbuf), + "%Hz", + rspamd_mempool_get_wasted_size(task->task_pool)); + var.begin = numbuf; + break; + default: + var = rspamd_task_log_metric_res(task, lf); + break; + } + + if (var.len > 0) { + res = rspamd_task_log_write_var(task, logbuf, + &var, (const rspamd_ftok_t *) lf->data); + } + + return res; +} + +void rspamd_task_write_log(struct rspamd_task *task) +{ + rspamd_fstring_t *logbuf; + struct rspamd_log_format *lf; + struct rspamd_task **ptask; + const gchar *lua_str; + gsize lua_str_len; + lua_State *L; + + g_assert(task != NULL); + + if (task->cfg->log_format == NULL || + (task->flags & RSPAMD_TASK_FLAG_NO_LOG)) { + msg_debug_task("skip logging due to no log flag"); + return; + } + + logbuf = rspamd_fstring_sized_new(1000); + + DL_FOREACH(task->cfg->log_format, lf) + { + switch (lf->type) { + case RSPAMD_LOG_STRING: + logbuf = rspamd_fstring_append(logbuf, lf->data, lf->len); + break; + case RSPAMD_LOG_LUA: + L = task->cfg->lua_state; + lua_rawgeti(L, LUA_REGISTRYINDEX, GPOINTER_TO_INT(lf->data)); + ptask = lua_newuserdata(L, sizeof(*ptask)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall(L, 1, 1, 0) != 0) { + msg_err_task("call to log function failed: %s", + lua_tostring(L, -1)); + lua_pop(L, 1); + } + else { + 
lua_str = lua_tolstring(L, -1, &lua_str_len); + + if (lua_str != NULL) { + logbuf = rspamd_fstring_append(logbuf, lua_str, lua_str_len); + } + lua_pop(L, 1); + } + break; + default: + /* We have a variable in log format */ + if (lf->flags & RSPAMD_LOG_FMT_FLAG_CONDITION) { + if (!rspamd_task_log_check_condition(task, lf)) { + continue; + } + } + + logbuf = rspamd_task_log_variable(task, lf, logbuf); + break; + } + } + + msg_notice_task("%V", logbuf); + + rspamd_fstring_free(logbuf); +} + +gdouble +rspamd_task_get_required_score(struct rspamd_task *task, struct rspamd_scan_result *m) +{ + if (m == NULL) { + m = task->result; + + if (m == NULL) { + return NAN; + } + } + + for (guint i = m->nactions; i-- > 0;) { + struct rspamd_action_config *action_lim = &m->actions_config[i]; + + + if (!isnan(action_lim->cur_limit) && + !(action_lim->action->flags & (RSPAMD_ACTION_NO_THRESHOLD | RSPAMD_ACTION_HAM))) { + return m->actions_config[i].cur_limit; + } + } + + return NAN; +} + +rspamd_ftok_t * +rspamd_task_get_request_header(struct rspamd_task *task, + const gchar *name) +{ + struct rspamd_request_header_chain *ret = + rspamd_task_get_request_header_multiple(task, name); + + if (ret) { + return ret->hdr; + } + + return NULL; +} + +struct rspamd_request_header_chain * +rspamd_task_get_request_header_multiple(struct rspamd_task *task, + const gchar *name) +{ + struct rspamd_request_header_chain *ret = NULL; + rspamd_ftok_t srch; + khiter_t k; + + srch.begin = (gchar *) name; + srch.len = strlen(name); + + k = kh_get(rspamd_req_headers_hash, task->request_headers, + &srch); + + if (k != kh_end(task->request_headers)) { + ret = kh_value(task->request_headers, k); + } + + return ret; +} + + +void rspamd_task_add_request_header(struct rspamd_task *task, + rspamd_ftok_t *name, rspamd_ftok_t *value) +{ + + khiter_t k; + gint res; + struct rspamd_request_header_chain *chain, *nchain; + + k = kh_put(rspamd_req_headers_hash, task->request_headers, + name, &res); + + if (res == 0) { + 
/* Existing name */ + nchain = rspamd_mempool_alloc(task->task_pool, sizeof(*nchain)); + nchain->hdr = value; + nchain->next = NULL; + chain = kh_value(task->request_headers, k); + + /* Slow but OK here */ + LL_APPEND(chain, nchain); + } + else { + nchain = rspamd_mempool_alloc(task->task_pool, sizeof(*nchain)); + nchain->hdr = value; + nchain->next = NULL; + + kh_value(task->request_headers, k) = nchain; + } +} + + +void rspamd_task_profile_set(struct rspamd_task *task, const gchar *key, + gdouble value) +{ + GHashTable *tbl; + gdouble *pval; + + if (key == NULL) { + return; + } + + tbl = rspamd_mempool_get_variable(task->task_pool, RSPAMD_MEMPOOL_PROFILE); + + if (tbl == NULL) { + tbl = g_hash_table_new(rspamd_str_hash, rspamd_str_equal); + rspamd_mempool_set_variable(task->task_pool, RSPAMD_MEMPOOL_PROFILE, + tbl, (rspamd_mempool_destruct_t) g_hash_table_unref); + } + + pval = g_hash_table_lookup(tbl, key); + + if (pval == NULL) { + pval = rspamd_mempool_alloc(task->task_pool, sizeof(*pval)); + *pval = value; + g_hash_table_insert(tbl, (void *) key, pval); + } + else { + *pval = value; + } +} + +gdouble * +rspamd_task_profile_get(struct rspamd_task *task, const gchar *key) +{ + GHashTable *tbl; + gdouble *pval = NULL; + + tbl = rspamd_mempool_get_variable(task->task_pool, RSPAMD_MEMPOOL_PROFILE); + + if (tbl != NULL) { + pval = g_hash_table_lookup(tbl, key); + } + + return pval; +} + + +gboolean +rspamd_task_set_finish_time(struct rspamd_task *task) +{ + if (isnan(task->time_real_finish)) { + task->time_real_finish = ev_time(); + + return TRUE; + } + + return FALSE; +} + +const gchar * +rspamd_task_stage_name(enum rspamd_task_stage stg) +{ + const gchar *ret = "unknown stage"; + + switch (stg) { + case RSPAMD_TASK_STAGE_CONNECT: + ret = "connect"; + break; + case RSPAMD_TASK_STAGE_CONNFILTERS: + ret = "connection_filter"; + break; + case RSPAMD_TASK_STAGE_READ_MESSAGE: + ret = "read_message"; + break; + case RSPAMD_TASK_STAGE_PRE_FILTERS: + ret = "prefilters"; + 
break; + case RSPAMD_TASK_STAGE_PROCESS_MESSAGE: + ret = "process_message"; + break; + case RSPAMD_TASK_STAGE_FILTERS: + ret = "filters"; + break; + case RSPAMD_TASK_STAGE_CLASSIFIERS_PRE: + ret = "classifiers_pre"; + break; + case RSPAMD_TASK_STAGE_CLASSIFIERS: + ret = "classifiers"; + break; + case RSPAMD_TASK_STAGE_CLASSIFIERS_POST: + ret = "classifiers_post"; + break; + case RSPAMD_TASK_STAGE_COMPOSITES: + ret = "composites"; + break; + case RSPAMD_TASK_STAGE_POST_FILTERS: + ret = "postfilters"; + break; + case RSPAMD_TASK_STAGE_LEARN_PRE: + ret = "learn_pre"; + break; + case RSPAMD_TASK_STAGE_LEARN: + ret = "learn"; + break; + case RSPAMD_TASK_STAGE_LEARN_POST: + ret = "learn_post"; + break; + case RSPAMD_TASK_STAGE_COMPOSITES_POST: + ret = "composites_post"; + break; + case RSPAMD_TASK_STAGE_IDEMPOTENT: + ret = "idempotent"; + break; + case RSPAMD_TASK_STAGE_DONE: + ret = "done"; + break; + case RSPAMD_TASK_STAGE_REPLIED: + ret = "replied"; + break; + default: + break; + } + + return ret; +} + +void rspamd_task_timeout(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_task *task = (struct rspamd_task *) w->data; + + if (!(task->processed_stages & RSPAMD_TASK_STAGE_FILTERS)) { + ev_now_update_if_cheap(task->event_loop); + msg_info_task("processing of task time out: %.1fs spent; %.1fs limit; " + "forced processing", + ev_now(task->event_loop) - task->task_timestamp, + w->repeat); + + if (task->cfg->soft_reject_on_timeout) { + struct rspamd_action *action, *soft_reject; + + action = rspamd_check_action_metric(task, NULL, NULL); + + if (action->action_type != METRIC_ACTION_REJECT) { + soft_reject = rspamd_config_get_action_by_type(task->cfg, + METRIC_ACTION_SOFT_REJECT); + rspamd_add_passthrough_result(task, + soft_reject, + 0, + NAN, + "timeout processing message", + "task timeout", + 0, NULL); + } + } + + ev_timer_again(EV_A_ w); + task->processed_stages |= RSPAMD_TASK_STAGE_FILTERS; + rspamd_session_cleanup(task->s, true); + rspamd_task_process(task, 
RSPAMD_TASK_PROCESS_ALL); + rspamd_session_pending(task->s); + } + else { + /* Postprocessing timeout */ + msg_info_task("post-processing of task time out: %.1f second spent; forced processing", + ev_now(task->event_loop) - task->task_timestamp); + + if (task->cfg->soft_reject_on_timeout) { + struct rspamd_action *action, *soft_reject; + + action = rspamd_check_action_metric(task, NULL, NULL); + + if (action->action_type != METRIC_ACTION_REJECT) { + soft_reject = rspamd_config_get_action_by_type(task->cfg, + METRIC_ACTION_SOFT_REJECT); + rspamd_add_passthrough_result(task, + soft_reject, + 0, + NAN, + "timeout post-processing message", + "task timeout", + 0, NULL); + } + } + + ev_timer_stop(EV_A_ w); + task->processed_stages |= RSPAMD_TASK_STAGE_DONE; + rspamd_session_cleanup(task->s, true); + rspamd_task_process(task, RSPAMD_TASK_PROCESS_ALL); + rspamd_session_pending(task->s); + } +} + +void rspamd_worker_guard_handler(EV_P_ ev_io *w, int revents) +{ + struct rspamd_task *task = (struct rspamd_task *) w->data; + gchar fake_buf[1024]; + gssize r; + + r = read(w->fd, fake_buf, sizeof(fake_buf)); + + if (r > 0) { + msg_warn_task("received extra data after task is loaded, ignoring"); + } + else { + if (r == 0) { + /* + * Poor man approach, that might break things in case of + * shutdown (SHUT_WR) but sockets are so bad that there's no + * reliable way to distinguish between shutdown(SHUT_WR) and + * close. 
+ */ + if (task->cmd != CMD_CHECK_V2 && task->cfg->enable_shutdown_workaround) { + msg_info_task("workaround for shutdown enabled, please update " + "your client, this support might be removed in future"); + shutdown(w->fd, SHUT_RD); + ev_io_stop(task->event_loop, &task->guard_ev); + } + else { + msg_err_task("the peer has closed connection unexpectedly"); + rspamd_session_destroy(task->s); + } + } + else if (errno != EAGAIN) { + msg_err_task("the peer has closed connection unexpectedly: %s", + strerror(errno)); + rspamd_session_destroy(task->s); + } + else { + return; + } + } +} diff --git a/src/libserver/task.h b/src/libserver/task.h new file mode 100644 index 0000000..5404a11 --- /dev/null +++ b/src/libserver/task.h @@ -0,0 +1,392 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef TASK_H_ +#define TASK_H_ + +#include "config.h" +#include "libserver/http/http_connection.h" +#include "async_session.h" +#include "util.h" +#include "mem_pool.h" +#include "dns.h" +#include "re_cache.h" +#include "khash.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum rspamd_command { + CMD_SKIP = 0, + CMD_PING, + CMD_CHECK_SPAMC, /* Legacy spamassassin format */ + CMD_CHECK_RSPAMC, /* Legacy rspamc format (like SA one) */ + CMD_CHECK, /* Legacy check - metric json reply */ + CMD_CHECK_V2, /* Modern check - symbols in json reply */ +}; + +enum rspamd_task_stage { + RSPAMD_TASK_STAGE_CONNECT = (1u << 0u), + RSPAMD_TASK_STAGE_CONNFILTERS = (1u << 1u), + RSPAMD_TASK_STAGE_READ_MESSAGE = (1u << 2u), + RSPAMD_TASK_STAGE_PROCESS_MESSAGE = (1u << 3u), + RSPAMD_TASK_STAGE_PRE_FILTERS = (1u << 4u), + RSPAMD_TASK_STAGE_FILTERS = (1u << 5u), + RSPAMD_TASK_STAGE_CLASSIFIERS_PRE = (1u << 6u), + RSPAMD_TASK_STAGE_CLASSIFIERS = (1u << 7u), + RSPAMD_TASK_STAGE_CLASSIFIERS_POST = (1u << 8u), + RSPAMD_TASK_STAGE_COMPOSITES = (1u << 9u), + RSPAMD_TASK_STAGE_POST_FILTERS = (1u << 10u), + RSPAMD_TASK_STAGE_LEARN_PRE = (1u << 11u), + RSPAMD_TASK_STAGE_LEARN = (1u << 12u), + RSPAMD_TASK_STAGE_LEARN_POST = (1u << 13u), + RSPAMD_TASK_STAGE_COMPOSITES_POST = (1u << 14u), + RSPAMD_TASK_STAGE_IDEMPOTENT = (1u << 15u), + RSPAMD_TASK_STAGE_DONE = (1u << 16u), + RSPAMD_TASK_STAGE_REPLIED = (1u << 17u) +}; + +#define RSPAMD_TASK_PROCESS_ALL (RSPAMD_TASK_STAGE_CONNECT | \ + RSPAMD_TASK_STAGE_CONNFILTERS | \ + RSPAMD_TASK_STAGE_READ_MESSAGE | \ + RSPAMD_TASK_STAGE_PRE_FILTERS | \ + RSPAMD_TASK_STAGE_PROCESS_MESSAGE | \ + RSPAMD_TASK_STAGE_FILTERS | \ + RSPAMD_TASK_STAGE_CLASSIFIERS_PRE | \ + RSPAMD_TASK_STAGE_CLASSIFIERS | \ + RSPAMD_TASK_STAGE_CLASSIFIERS_POST | \ + RSPAMD_TASK_STAGE_COMPOSITES | \ + RSPAMD_TASK_STAGE_POST_FILTERS | \ + RSPAMD_TASK_STAGE_LEARN_PRE | \ + RSPAMD_TASK_STAGE_LEARN | \ + RSPAMD_TASK_STAGE_LEARN_POST | \ + RSPAMD_TASK_STAGE_COMPOSITES_POST | \ + 
RSPAMD_TASK_STAGE_IDEMPOTENT | \ + RSPAMD_TASK_STAGE_DONE) +#define RSPAMD_TASK_PROCESS_LEARN (RSPAMD_TASK_STAGE_CONNECT | \ + RSPAMD_TASK_STAGE_READ_MESSAGE | \ + RSPAMD_TASK_STAGE_PROCESS_MESSAGE | \ + RSPAMD_TASK_STAGE_CLASSIFIERS_PRE | \ + RSPAMD_TASK_STAGE_CLASSIFIERS | \ + RSPAMD_TASK_STAGE_CLASSIFIERS_POST | \ + RSPAMD_TASK_STAGE_LEARN_PRE | \ + RSPAMD_TASK_STAGE_LEARN | \ + RSPAMD_TASK_STAGE_LEARN_POST | \ + RSPAMD_TASK_STAGE_DONE) + +#define RSPAMD_TASK_FLAG_MIME (1u << 0u) +#define RSPAMD_TASK_FLAG_SKIP_PROCESS (1u << 1u) +#define RSPAMD_TASK_FLAG_SKIP (1u << 2u) +#define RSPAMD_TASK_FLAG_PASS_ALL (1u << 3u) +#define RSPAMD_TASK_FLAG_NO_LOG (1u << 4u) +#define RSPAMD_TASK_FLAG_NO_IP (1u << 5u) +#define RSPAMD_TASK_FLAG_PROCESSING (1u << 6u) +#define RSPAMD_TASK_FLAG_GTUBE (1u << 7u) +#define RSPAMD_TASK_FLAG_FILE (1u << 8u) +#define RSPAMD_TASK_FLAG_NO_STAT (1u << 9u) +#define RSPAMD_TASK_FLAG_UNLEARN (1u << 10u) +#define RSPAMD_TASK_FLAG_ALREADY_LEARNED (1u << 11u) +#define RSPAMD_TASK_FLAG_LEARN_SPAM (1u << 12u) +#define RSPAMD_TASK_FLAG_LEARN_HAM (1u << 13u) +#define RSPAMD_TASK_FLAG_LEARN_AUTO (1u << 14u) +#define RSPAMD_TASK_FLAG_BROKEN_HEADERS (1u << 15u) +#define RSPAMD_TASK_FLAG_HAS_SPAM_TOKENS (1u << 16u) +#define RSPAMD_TASK_FLAG_HAS_HAM_TOKENS (1u << 17u) +#define RSPAMD_TASK_FLAG_EMPTY (1u << 18u) +#define RSPAMD_TASK_FLAG_PROFILE (1u << 19u) +#define RSPAMD_TASK_FLAG_GREYLISTED (1u << 20u) +#define RSPAMD_TASK_FLAG_OWN_POOL (1u << 21u) +#define RSPAMD_TASK_FLAG_SSL (1u << 22u) +#define RSPAMD_TASK_FLAG_BAD_UNICODE (1u << 23u) +#define RSPAMD_TASK_FLAG_MESSAGE_REWRITE (1u << 24u) +#define RSPAMD_TASK_FLAG_MAX_SHIFT (24u) + + +/* Request has a JSON control block */ +#define RSPAMD_TASK_PROTOCOL_FLAG_HAS_CONTROL (1u << 0u) +/* Request has been done by a local client */ +#define RSPAMD_TASK_PROTOCOL_FLAG_LOCAL_CLIENT (1u << 1u) +/* Request has been sent via milter */ +#define RSPAMD_TASK_PROTOCOL_FLAG_MILTER (1u << 2u) +/* Compress protocol reply 
*/ +#define RSPAMD_TASK_PROTOCOL_FLAG_COMPRESSED (1u << 3u) +/* Include all URLs */ +#define RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS (1u << 4u) +/* Client allows body block (including headers in no FLAG_MILTER) */ +#define RSPAMD_TASK_PROTOCOL_FLAG_BODY_BLOCK (1u << 5u) +/* Emit groups information */ +#define RSPAMD_TASK_PROTOCOL_FLAG_GROUPS (1u << 6u) +#define RSPAMD_TASK_PROTOCOL_FLAG_MAX_SHIFT (6u) + +#define RSPAMD_TASK_IS_SKIPPED(task) (G_UNLIKELY((task)->flags & RSPAMD_TASK_FLAG_SKIP)) +#define RSPAMD_TASK_IS_SPAMC(task) (G_UNLIKELY((task)->cmd == CMD_CHECK_SPAMC)) +#define RSPAMD_TASK_IS_PROCESSED(task) (G_UNLIKELY((task)->processed_stages & RSPAMD_TASK_STAGE_DONE)) +#define RSPAMD_TASK_IS_CLASSIFIED(task) (((task)->processed_stages & RSPAMD_TASK_STAGE_CLASSIFIERS)) +#define RSPAMD_TASK_IS_EMPTY(task) (G_UNLIKELY((task)->flags & RSPAMD_TASK_FLAG_EMPTY)) +#define RSPAMD_TASK_IS_PROFILING(task) (G_UNLIKELY((task)->flags & RSPAMD_TASK_FLAG_PROFILE)) +#define RSPAMD_TASK_IS_MIME(task) (G_LIKELY((task)->flags & RSPAMD_TASK_FLAG_MIME)) + +struct rspamd_email_address; +struct rspamd_lang_detector; +enum rspamd_newlines_type; +struct rspamd_message; + +struct rspamd_task_data_storage { + const gchar *begin; + gsize len; + gchar *fpath; +}; + +struct rspamd_request_header_chain { + rspamd_ftok_t *hdr; + struct rspamd_request_header_chain *next; +}; + +__KHASH_TYPE(rspamd_req_headers_hash, rspamd_ftok_t *, struct rspamd_request_header_chain *); + +struct rspamd_lua_cached_entry { + gint ref; + guint id; +}; + +KHASH_INIT(rspamd_task_lua_cache, char *, struct rspamd_lua_cached_entry, 1, kh_str_hash_func, kh_str_hash_equal); + +/** + * Worker task structure + */ +struct rspamd_task { + struct rspamd_worker *worker; /**< pointer to worker object */ + enum rspamd_command cmd; /**< command */ + gint sock; /**< socket descriptor */ + guint32 dns_requests; /**< number of DNS requests per this task */ + guint32 flags; /**< Bit flags */ + guint32 protocol_flags; + guint32 
processed_stages; /**< bits of stages that are processed */ + gchar *helo; /**< helo header value */ + gchar *queue_id; /**< queue id if specified */ + rspamd_inet_addr_t *from_addr; /**< from addr for a task */ + rspamd_inet_addr_t *client_addr; /**< address of connected socket */ + gchar *deliver_to; /**< address to deliver */ + gchar *auth_user; /**< SMTP authenticated user */ + const gchar *hostname; /**< hostname reported by MTA */ + khash_t(rspamd_req_headers_hash) * request_headers; /**< HTTP headers in a request */ + struct rspamd_task_data_storage msg; /**< message buffer */ + struct rspamd_http_connection *http_conn; /**< HTTP server connection */ + struct rspamd_async_session *s; /**< async session object */ + struct rspamd_scan_result *result; /**< Metric result */ + khash_t(rspamd_task_lua_cache) lua_cache; /**< cache of lua objects */ + GPtrArray *tokens; /**< statistics tokens */ + GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers + (e.g. Subject) */ + + GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */ + struct rspamd_email_address *from_envelope; + struct rspamd_email_address *from_envelope_orig; + + ucl_object_t *messages; /**< list of messages that would be reported */ + struct rspamd_re_runtime *re_rt; /**< regexp runtime */ + GPtrArray *stat_runtimes; /**< backend runtime */ + struct rspamd_config *cfg; /**< pointer to config object */ + GError *err; + rspamd_mempool_t *task_pool; /**< memory pool for task */ + double time_real_finish; + ev_tstamp task_timestamp; + + gboolean (*fin_callback)(struct rspamd_task *task, void *arg); + /**< callback for filters finalizing */ + void *fin_arg; /**< argument for fin callback */ + + struct rspamd_dns_resolver *resolver; /**< DNS resolver */ + struct ev_loop *event_loop; /**< Event base */ + struct ev_timer timeout_ev; /**< Global task timeout */ + struct ev_io guard_ev; /**< Event for input sanity guard */ + + gpointer symcache_runtime; /**< Opaque checkpoint data */ 
+ ucl_object_t *settings; /**< Settings applied to task */ + struct rspamd_config_settings_elt *settings_elt; /**< preprocessed settings id elt */ + + const gchar *classifier; /**< Classifier to learn (if needed) */ + struct rspamd_lang_detector *lang_det; /**< Languages detector */ + struct rspamd_message *message; +}; + +/** + * Construct new task for worker + */ +struct rspamd_task *rspamd_task_new(struct rspamd_worker *worker, + struct rspamd_config *cfg, + rspamd_mempool_t *pool, + struct rspamd_lang_detector *lang_det, + struct ev_loop *event_loop, + gboolean debug_mem); + +/** + * Destroy task object and remove its IO dispatcher if it exists + */ +void rspamd_task_free(struct rspamd_task *task); + +/** + * Called if all filters are processed + * @return TRUE if session should be terminated + */ +gboolean rspamd_task_fin(void *arg); + +/** + * Load HTTP message with body in `msg` to an rspamd_task + * @param task + * @param msg + * @param start + * @param len + * @return + */ +gboolean rspamd_task_load_message(struct rspamd_task *task, + struct rspamd_http_message *msg, + const gchar *start, gsize len); + +/** + * Process task + * @param task task to process + * @return task has been successfully parsed and processed + */ +gboolean rspamd_task_process(struct rspamd_task *task, guint stages); + +/** + * Return address of sender or NULL + * @param task + * @return + */ +struct rspamd_email_address *rspamd_task_get_sender(struct rspamd_task *task); + +/** + * Return addresses in the following precedence: + * - deliver to + * - the first smtp recipient + * - the first mime recipient + * @param task + * @return + */ +const gchar *rspamd_task_get_principal_recipient(struct rspamd_task *task); + +/** + * Add a recipient for a task + * @param task task object + * @param rcpt string representation of recipient address + * @return TRUE if an address has been parsed and added + */ +gboolean rspamd_task_add_recipient(struct rspamd_task *task, const gchar *rcpt); + +/** + 
* Learn specified statfile with message in a task + * @param task worker's task object + * @param classifier classifier to learn (or NULL to learn all) + * @param err pointer to GError + * @return true if learn succeed + */ +gboolean rspamd_learn_task_spam(struct rspamd_task *task, + gboolean is_spam, + const gchar *classifier, + GError **err); + +/** + * Returns required score for a message (usually reject score) + * @param task + * @param m + * @return + */ +struct rspamd_scan_result; + +gdouble rspamd_task_get_required_score(struct rspamd_task *task, + struct rspamd_scan_result *m); + +/** + * Returns the first header as value for a header + * @param task + * @param name + * @return + */ +rspamd_ftok_t *rspamd_task_get_request_header(struct rspamd_task *task, + const gchar *name); + +/** + * Returns all headers with the specific name + * @param task + * @param name + * @return + */ +struct rspamd_request_header_chain *rspamd_task_get_request_header_multiple( + struct rspamd_task *task, + const gchar *name); + +/** + * Adds a new request header to task (name and value should be mapped to fstring) + * @param task + * @param name + * @param value + */ +void rspamd_task_add_request_header(struct rspamd_task *task, + rspamd_ftok_t *name, rspamd_ftok_t *value); + +/** + * Write log line about the specified task if needed + */ +void rspamd_task_write_log(struct rspamd_task *task); + +/** + * Set profiling value for a specific key + * @param task + * @param key + * @param value + */ +void rspamd_task_profile_set(struct rspamd_task *task, const gchar *key, + gdouble value); + +/** + * Get value for a specific profiling key + * @param task + * @param key + * @return + */ +gdouble *rspamd_task_profile_get(struct rspamd_task *task, const gchar *key); + +/** + * Sets finishing time for a task if not yet set + * @param task + * @return + */ +gboolean rspamd_task_set_finish_time(struct rspamd_task *task); + +/** + * Returns task processing stage name + * @param stg + * @return 
+ */ +const gchar *rspamd_task_stage_name(enum rspamd_task_stage stg); + +/* + * Called on forced timeout + */ +void rspamd_task_timeout(EV_P_ ev_timer *w, int revents); + +/* + * Called on unexpected IO error (e.g. ECONNRESET) + */ +void rspamd_worker_guard_handler(EV_P_ ev_io *w, int revents); + +#ifdef __cplusplus +} +#endif + +#endif /* TASK_H_ */ diff --git a/src/libserver/url.c b/src/libserver/url.c new file mode 100644 index 0000000..0842a1e --- /dev/null +++ b/src/libserver/url.c @@ -0,0 +1,4365 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config.h" +#include "url.h" +#include "util.h" +#include "rspamd.h" +#include "message.h" +#include "multipattern.h" +#include "contrib/uthash/utlist.h" +#include "contrib/http-parser/http_parser.h" +#include <unicode/utf8.h> +#include <unicode/uchar.h> +#include <unicode/usprep.h> +#include <unicode/ucnv.h> + +typedef struct url_match_s { + const gchar *m_begin; + gsize m_len; + const gchar *pattern; + const gchar *prefix; + const gchar *newline_pos; + const gchar *prev_newline_pos; + gboolean add_prefix; + gchar st; +} url_match_t; + +#define URL_MATCHER_FLAG_NOHTML (1u << 0u) +#define URL_MATCHER_FLAG_TLD_MATCH (1u << 1u) +#define URL_MATCHER_FLAG_STAR_MATCH (1u << 2u) +#define URL_MATCHER_FLAG_REGEXP (1u << 3u) + +struct url_callback_data; + +/* Known URL schemes; .len is the length of .name, the final entry has a NULL name */ +static const struct { + enum rspamd_url_protocol proto; + const gchar *name; + gsize len; +} rspamd_url_protocols[] = { + {.proto = PROTOCOL_FILE, + .name = "file", + .len = 4}, + {.proto = PROTOCOL_FTP, + .name = "ftp", + .len = 3}, + {.proto = PROTOCOL_HTTP, + .name = "http", + .len = 4}, + {.proto = PROTOCOL_HTTPS, + .name = "https", + .len = 5}, + {.proto = PROTOCOL_MAILTO, + .name = "mailto", + .len = 6}, + {.proto = PROTOCOL_TELEPHONE, + .name = "tel", + .len = 3}, + {.proto = PROTOCOL_TELEPHONE, + .name = "callto", + .len = 6}, /* length of "callto"; it was 3, which does not match the name */ + {.proto = PROTOCOL_UNKNOWN, + .name = NULL, + .len = 0}}; +struct url_matcher { + const gchar *pattern; + const gchar *prefix; + + gboolean (*start)(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + + gboolean (*end)(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + + gint flags; +}; + +static gboolean url_file_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_file_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_web_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean
url_web_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tld_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tld_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_email_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_email_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tel_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +static gboolean url_tel_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match); + +struct url_matcher static_matchers[] = { + /* Common prefixes */ + {"file://", "", url_file_start, url_file_end, + 0}, + {"file:\\\\", "", url_file_start, url_file_end, + 0}, + {"ftp://", "", url_web_start, url_web_end, + 0}, + {"ftp:\\\\", "", url_web_start, url_web_end, + 0}, + {"sftp://", "", url_web_start, url_web_end, + 0}, + {"http:", "", url_web_start, url_web_end, + 0}, + {"https:", "", url_web_start, url_web_end, + 0}, + {"news://", "", url_web_start, url_web_end, + 0}, + {"nntp://", "", url_web_start, url_web_end, + 0}, + {"telnet://", "", url_web_start, url_web_end, + 0}, + {"tel:", "", url_tel_start, url_tel_end, + 0}, + {"webcal://", "", url_web_start, url_web_end, + 0}, + {"mailto:", "", url_email_start, url_email_end, + 0}, + {"callto:", "", url_tel_start, url_tel_end, + 0}, + {"h323:", "", url_web_start, url_web_end, + 0}, + {"sip:", "", url_web_start, url_web_end, + 0}, + {"www\\.[0-9a-z]", "http://", url_web_start, url_web_end, + URL_MATCHER_FLAG_REGEXP}, + {"ftp.", "ftp://", url_web_start, url_web_end, + 0}, + /* Likely emails */ + { + "@", "mailto://", url_email_start, url_email_end, + 0}}; + +struct rspamd_url_flag_name { + const gchar *name; + gint flag; + gint hash; +} url_flag_names[] = { + {"phished", RSPAMD_URL_FLAG_PHISHED, -1}, + 
{"numeric", RSPAMD_URL_FLAG_NUMERIC, -1}, + {"obscured", RSPAMD_URL_FLAG_OBSCURED, -1}, + {"redirected", RSPAMD_URL_FLAG_REDIRECTED, -1}, + {"html_displayed", RSPAMD_URL_FLAG_HTML_DISPLAYED, -1}, + {"text", RSPAMD_URL_FLAG_FROM_TEXT, -1}, + {"subject", RSPAMD_URL_FLAG_SUBJECT, -1}, + {"host_encoded", RSPAMD_URL_FLAG_HOSTENCODED, -1}, + {"schema_encoded", RSPAMD_URL_FLAG_SCHEMAENCODED, -1}, + {"path_encoded", RSPAMD_URL_FLAG_PATHENCODED, -1}, + {"query_encoded", RSPAMD_URL_FLAG_QUERYENCODED, -1}, + {"missing_slashes", RSPAMD_URL_FLAG_MISSINGSLASHES, -1}, + {"idn", RSPAMD_URL_FLAG_IDN, -1}, + {"has_port", RSPAMD_URL_FLAG_HAS_PORT, -1}, + {"has_user", RSPAMD_URL_FLAG_HAS_USER, -1}, + {"schemaless", RSPAMD_URL_FLAG_SCHEMALESS, -1}, + {"unnormalised", RSPAMD_URL_FLAG_UNNORMALISED, -1}, + {"zw_spaces", RSPAMD_URL_FLAG_ZW_SPACES, -1}, + {"url_displayed", RSPAMD_URL_FLAG_DISPLAY_URL, -1}, + {"image", RSPAMD_URL_FLAG_IMAGE, -1}, + {"query", RSPAMD_URL_FLAG_QUERY, -1}, + {"content", RSPAMD_URL_FLAG_CONTENT, -1}, + {"no_tld", RSPAMD_URL_FLAG_NO_TLD, -1}, + {"truncated", RSPAMD_URL_FLAG_TRUNCATED, -1}, + {"redirect_target", RSPAMD_URL_FLAG_REDIRECT_TARGET, -1}, + {"invisible", RSPAMD_URL_FLAG_INVISIBLE, -1}, + {"special", RSPAMD_URL_FLAG_SPECIAL, -1}, +}; + + +static inline khint_t rspamd_url_hash(struct rspamd_url *u); + +static inline khint_t rspamd_url_host_hash(struct rspamd_url *u); +static inline bool rspamd_urls_cmp(struct rspamd_url *a, struct rspamd_url *b); +static inline bool rspamd_urls_host_cmp(struct rspamd_url *a, struct rspamd_url *b); + +/* Hash table implementation */ +__KHASH_IMPL(rspamd_url_hash, kh_inline, struct rspamd_url *, char, false, + rspamd_url_hash, rspamd_urls_cmp); +__KHASH_IMPL(rspamd_url_host_hash, kh_inline, struct rspamd_url *, char, false, + rspamd_url_host_hash, rspamd_urls_host_cmp); + +struct url_callback_data { + const gchar *begin; + gchar *url_str; + rspamd_mempool_t *pool; + gint len; + enum rspamd_url_find_type how; + gboolean 
prefix_added; + guint newline_idx; + GArray *matchers; + GPtrArray *newlines; + const gchar *start; + const gchar *fin; + const gchar *end; + const gchar *last_at; + url_insert_function func; + void *funcd; +}; + +struct url_match_scanner { + GArray *matchers_full; + GArray *matchers_strict; + struct rspamd_multipattern *search_trie_full; + struct rspamd_multipattern *search_trie_strict; + bool has_tld_file; +}; + +struct url_match_scanner *url_scanner = NULL; + +enum { + IS_LWSP = (1 << 0), + IS_DOMAIN = (1 << 1), + IS_URLSAFE = (1 << 2), + IS_MAILSAFE = (1 << 3), + IS_DOMAIN_END = (1 << 4) +}; + +static const unsigned int url_scanner_table[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, IS_LWSP, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, IS_LWSP /* */, + IS_MAILSAFE /* ! */, IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* " */, + IS_MAILSAFE /* # */, IS_MAILSAFE /* $ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* % */, 0 /* & */, IS_MAILSAFE /* ' */, + 0 /* ( */, 0 /* ) */, IS_MAILSAFE /* * */, + IS_MAILSAFE /* + */, IS_MAILSAFE /* , */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* - */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* . */, IS_DOMAIN_END | IS_MAILSAFE /* / */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 0 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 1 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 2 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 3 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 4 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 5 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 6 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 7 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 8 */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* 9 */, IS_DOMAIN_END /* : */, + 0 /* ; */, IS_URLSAFE | IS_DOMAIN_END /* < */, 0 /* = */, + IS_URLSAFE | IS_DOMAIN_END /* > */, IS_DOMAIN_END /* ? 
*/, 0 /* @ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* A */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* B */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* C */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* D */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* E */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* F */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* G */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* H */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* I */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* J */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* K */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* L */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* M */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* N */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* O */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* P */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Q */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* R */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* S */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* T */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* U */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* V */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* W */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* X */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Y */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* Z */, 0 /* [ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* \ */, 0 /* ] */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* ^ */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* _ */, + IS_URLSAFE | IS_DOMAIN_END /* ` */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* a */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* b */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* c */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* d */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* e */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* f */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* g */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* h */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* i */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* j */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* k */, + 
IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* l */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* m */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* n */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* o */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* p */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* q */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* r */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* s */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* t */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* u */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* v */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* w */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* x */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* y */, + IS_URLSAFE | IS_DOMAIN | IS_MAILSAFE /* z */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* { */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* | */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* } */, + IS_URLSAFE | IS_DOMAIN_END | IS_MAILSAFE /* ~ */, 0, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + 
IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + 
IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, IS_URLSAFE | IS_DOMAIN, + IS_URLSAFE | IS_DOMAIN}; + +#define is_lwsp(x) ((url_scanner_table[(guchar) (x)] & IS_LWSP) != 0) +#define is_mailsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_MAILSAFE)) != 0) +#define is_domain(x) ((url_scanner_table[(guchar) (x)] & IS_DOMAIN) != 0) +#define is_urlsafe(x) ((url_scanner_table[(guchar) (x)] & (IS_URLSAFE)) != 0) + +const gchar * +rspamd_url_strerror(int err) +{ + switch (err) { + case URI_ERRNO_OK: + return "Parsing went well"; + case URI_ERRNO_EMPTY: + return "The URI string was empty"; + case URI_ERRNO_INVALID_PROTOCOL: + return "No protocol was found"; + case URI_ERRNO_BAD_FORMAT: + return "Bad URL format"; + case URI_ERRNO_BAD_ENCODING: + return "Invalid symbols encoded"; + case URI_ERRNO_INVALID_PORT: + return "Port number is bad"; + case URI_ERRNO_TLD_MISSING: + return "TLD part is not detected"; + case URI_ERRNO_HOST_MISSING: + return "Host part is missing"; + case URI_ERRNO_TOO_LONG: + return "URL is too long"; + } + + return NULL; +} + +static gboolean +rspamd_url_parse_tld_file(const gchar *fname, + struct url_match_scanner *scanner) +{ + FILE *f; + struct url_matcher m; + gchar *linebuf = NULL, *p; + gsize buflen = 0; + gssize r; + gint flags; + + f = fopen(fname, "r"); + + if (f == NULL) { + msg_err("cannot open TLD file %s: %s", fname, strerror(errno)); + return FALSE; + } + + m.end = url_tld_end; + m.start = url_tld_start; + m.prefix = "http://"; + + while ((r = getline(&linebuf, &buflen, f)) > 0) { + if (linebuf[0] == '/' || g_ascii_isspace(linebuf[0])) { + /* Skip comment or empty line */ + continue; + } + + g_strchomp(linebuf); + + /* TODO: add support for ! patterns */ + if (linebuf[0] == '!') { + msg_debug("skip '!' 
patterns from parsing for now: %s", linebuf); + continue; + } + + flags = URL_MATCHER_FLAG_NOHTML | URL_MATCHER_FLAG_TLD_MATCH; + + if (linebuf[0] == '*') { + flags |= URL_MATCHER_FLAG_STAR_MATCH; + p = strchr(linebuf, '.'); + + if (p == NULL) { + msg_err("got bad star line, skip it: %s", linebuf); + continue; + } + p++; + } + else { + p = linebuf; + } + + m.flags = flags; + rspamd_multipattern_add_pattern(url_scanner->search_trie_full, p, + RSPAMD_MULTIPATTERN_TLD | RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + m.pattern = rspamd_multipattern_get_pattern(url_scanner->search_trie_full, + rspamd_multipattern_get_npatterns(url_scanner->search_trie_full) - 1); + + g_array_append_val(url_scanner->matchers_full, m); + } + + free(linebuf); + fclose(f); + + return TRUE; +} + +static void +rspamd_url_add_static_matchers(struct url_match_scanner *sc) +{ + gint n = G_N_ELEMENTS(static_matchers), i; + + for (i = 0; i < n; i++) { + if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) { + rspamd_multipattern_add_pattern(url_scanner->search_trie_strict, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); + } + else { + rspamd_multipattern_add_pattern(url_scanner->search_trie_strict, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + } + } + + g_array_append_vals(sc->matchers_strict, static_matchers, n); + + if (sc->matchers_full) { + for (i = 0; i < n; i++) { + if (static_matchers[i].flags & URL_MATCHER_FLAG_REGEXP) { + rspamd_multipattern_add_pattern(url_scanner->search_trie_full, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); + } + else { + rspamd_multipattern_add_pattern(url_scanner->search_trie_full, + static_matchers[i].pattern, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + } + } + g_array_append_vals(sc->matchers_full, static_matchers, n); + } +} + +void rspamd_url_deinit(void) +{ + 
if (url_scanner != NULL) { + if (url_scanner->search_trie_full) { + rspamd_multipattern_destroy(url_scanner->search_trie_full); + g_array_free(url_scanner->matchers_full, TRUE); + } + + rspamd_multipattern_destroy(url_scanner->search_trie_strict); + g_array_free(url_scanner->matchers_strict, TRUE); + g_free(url_scanner); + + url_scanner = NULL; + } +} + +void rspamd_url_init(const gchar *tld_file) +{ + GError *err = NULL; + gboolean ret = TRUE; + + if (url_scanner != NULL) { + rspamd_url_deinit(); + } + + url_scanner = g_malloc(sizeof(struct url_match_scanner)); + + url_scanner->matchers_strict = g_array_sized_new(FALSE, TRUE, + sizeof(struct url_matcher), G_N_ELEMENTS(static_matchers)); + url_scanner->search_trie_strict = rspamd_multipattern_create_sized( + G_N_ELEMENTS(static_matchers), + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + + if (tld_file) { + /* Reserve larger multipattern */ + url_scanner->matchers_full = g_array_sized_new(FALSE, TRUE, + sizeof(struct url_matcher), 13000); + url_scanner->search_trie_full = rspamd_multipattern_create_sized(13000, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); + url_scanner->has_tld_file = true; + } + else { + url_scanner->matchers_full = NULL; + url_scanner->search_trie_full = NULL; + url_scanner->has_tld_file = false; + } + + rspamd_url_add_static_matchers(url_scanner); + + if (tld_file != NULL) { + ret = rspamd_url_parse_tld_file(tld_file, url_scanner); + } + + if (url_scanner->matchers_full && url_scanner->matchers_full->len > 1000) { + msg_info("start compiling of %d TLD suffixes; it might take a long time", + url_scanner->matchers_full->len); + } + + if (!rspamd_multipattern_compile(url_scanner->search_trie_strict, &err)) { + msg_err("cannot compile url matcher static patterns, fatal error: %e", err); + abort(); + } + + if (url_scanner->search_trie_full) { + if (!rspamd_multipattern_compile(url_scanner->search_trie_full, &err)) { + msg_err("cannot compile tld patterns, url matching will be " + 
"incomplete: %e", + err); + g_error_free(err); + ret = FALSE; + } + } + + if (tld_file != NULL) { + if (ret) { + msg_info("initialized %ud url match suffixes from '%s'", + url_scanner->matchers_full->len - url_scanner->matchers_strict->len, + tld_file); + } + else { + msg_err("failed to initialize url tld suffixes from '%s', " + "use %ud internal match suffixes", + tld_file, + url_scanner->matchers_strict->len); + } + } + + /* Generate hashes for flags */ + for (gint i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { + url_flag_names[i].hash = + rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + url_flag_names[i].name, + strlen(url_flag_names[i].name), 0); + } + /* Ensure that we have no hashes collisions O(N^2) but this array is small */ + for (gint i = 0; i < G_N_ELEMENTS(url_flag_names) - 1; i++) { + for (gint j = i + 1; j < G_N_ELEMENTS(url_flag_names); j++) { + if (url_flag_names[i].hash == url_flag_names[j].hash) { + msg_err("collision: both %s and %s map to %d", + url_flag_names[i].name, url_flag_names[j].name, + url_flag_names[i].hash); + abort(); + } + } + } +} + +#define SET_U(u, field) \ + do { \ + if ((u) != NULL) { \ + (u)->field_set |= 1 << (field); \ + (u)->field_data[(field)].len = p - c; \ + (u)->field_data[(field)].off = c - str; \ + } \ + } while (0) + +static bool +is_url_start(gchar c) +{ + if (c == '(' || + c == '{' || + c == '[' || + c == '<' || + c == '\'') { + return TRUE; + } + + return FALSE; +} + +static bool +is_url_end(gchar c) +{ + if (c == ')' || + c == '}' || + c == ']' || + c == '>' || + c == '\'') { + return TRUE; + } + + return FALSE; +} + +static bool +is_domain_start(int p) +{ + if (g_ascii_isalnum(p) || + p == '[' || + p == '%' || + p == '_' || + (p & 0x80)) { + return TRUE; + } + + return FALSE; +} + +static const guint max_domain_length = 253; +static const guint max_dns_label = 63; +static const guint max_email_user = 64; + +static gint +rspamd_mailto_parse(struct http_parser_url *u, + const gchar 
*str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, guint *flags) +{ + const gchar *p = str, *c = str, *last = str + len; + gchar t; + gint ret = 1; + enum { + parse_mailto, + parse_slash, + parse_slash_slash, + parse_semicolon, + parse_prefix_question, + parse_destination, + parse_equal, + parse_user, + parse_at, + parse_domain, + parse_suffix_question, + parse_query + } st = parse_mailto; + + if (u != NULL) { + memset(u, 0, sizeof(*u)); + } + + while (p < last) { + t = *p; + + if (p - str > max_email_user + max_domain_length + 1) { + goto out; + } + + switch (st) { + case parse_mailto: + if (t == ':') { + st = parse_semicolon; + SET_U(u, UF_SCHEMA); + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + *flags |= RSPAMD_URL_FLAG_MISSINGSLASHES; + st = parse_slash_slash; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + if (t == '?') { + st = parse_prefix_question; + p++; + } + else if (t != '/' && t != '\\') { + c = p; + st = parse_user; + } + else { + /* Skip multiple slashes */ + p++; + } + break; + case parse_prefix_question: + if (t == 't') { + /* XXX: accept only to= */ + st = parse_destination; + } + else { + goto out; + } + break; + case parse_destination: + if (t == '=') { + st = parse_equal; + } + p++; + break; + case parse_equal: + c = p; + st = parse_user; + break; + case parse_user: + if (t == '@') { + if (p - c == 0) { + goto out; + } + SET_U(u, UF_USERINFO); + st = parse_at; + } + else if (!is_mailsafe(t)) { + goto out; + } + else if (p - c > max_email_user) { + goto out; + } + p++; + break; + case parse_at: + c = p; + st = parse_domain; + break; + case parse_domain: + if (t == '?') { + SET_U(u, UF_HOST); + st = parse_suffix_question; + } + else if (!is_domain(t) && t != '.' 
&& t != '_') { + goto out; + } + else if (p - c > max_domain_length) { + goto out; + } + p++; + break; + case parse_suffix_question: + c = p; + st = parse_query; + break; + case parse_query: + if (t == '#') { + if (p - c != 0) { + SET_U(u, UF_QUERY); + } + c = p + 1; + ret = 0; + + goto out; + } + else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + ret = 0; + goto out; + } + else if (is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + ret = 0; + } + goto out; + } + else { + goto out; + } + } + p++; + break; + } + } + + if (st == parse_domain) { + if (p - c != 0) { + SET_U(u, UF_HOST); + ret = 0; + } + } + else if (st == parse_query) { + if (p - c > 0) { + SET_U(u, UF_QUERY); + } + + ret = 0; + } + +out: + if (end != NULL) { + *end = p; + } + + if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { + return 0; + } + + return ret; +} + +static gint +rspamd_telephone_parse(struct http_parser_url *u, + const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, + guint *flags) +{ + enum { + parse_protocol, + parse_semicolon, + parse_slash, + parse_slash_slash, + parse_spaces, + parse_plus, + parse_phone_start, + parse_phone, + } st = parse_protocol; + + const gchar *p = str, *c = str, *last = str + len; + gchar t; + gint ret = 1, i; + UChar32 uc; + + if (u != NULL) { + memset(u, 0, sizeof(*u)); + } + + while (p < last) { + t = *p; + + if (p - str > max_email_user) { + goto out; + } + + switch (st) { + case parse_protocol: + if (t == ':') { + st = parse_semicolon; + SET_U(u, UF_SCHEMA); + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + st = parse_slash_slash; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + if (g_ascii_isspace(t)) { + st = parse_spaces; + p++; + } + else if (t == '+') { + c = p; + st = 
parse_plus; + } + else if (t == '/') { + /* Skip multiple slashes */ + p++; + } + else { + st = parse_phone_start; + c = p; + } + break; + case parse_spaces: + if (t == '+') { + c = p; + st = parse_plus; + } + else if (!g_ascii_isspace(t)) { + st = parse_phone_start; + c = p; + } + else { + p++; + } + break; + case parse_plus: + c = p; + p++; + st = parse_phone_start; + break; + case parse_phone_start: + if (*p == '%' || *p == '(' || g_ascii_isdigit(*p)) { + st = parse_phone; + p++; + } + else { + goto out; + } + break; + case parse_phone: + i = p - str; + U8_NEXT(str, i, len, uc); + p = str + i; + + if (u_isdigit(uc) || uc == '(' || uc == ')' || uc == '[' || uc == ']' || u_isspace(uc) || uc == '%') { + /* p is already incremented by U8_NEXT! */ + } + else if (uc <= 0 || is_url_end(uc)) { + ret = 0; + goto set; + } + break; + } + } + +set: + if (st == parse_phone) { + if (p - c != 0) { + SET_U(u, UF_HOST); + ret = 0; + } + } + +out: + if (end != NULL) { + *end = p; + } + + if ((parse_flags & RSPAMD_URL_PARSE_CHECK)) { + return 0; + } + + return ret; +} + +static gint +rspamd_web_parse(struct http_parser_url *u, const gchar *str, gsize len, + gchar const **end, + enum rspamd_url_parse_flags parse_flags, + guint *flags) +{ + const gchar *p = str, *c = str, *last = str + len, *slash = NULL, + *password_start = NULL, *user_start = NULL; + gchar t = 0; + UChar32 uc; + glong pt; + gint ret = 1; + gboolean user_seen = FALSE; + enum { + parse_protocol, + parse_slash, + parse_slash_slash, + parse_semicolon, + parse_user, + parse_at, + parse_multiple_at, + parse_password_start, + parse_password, + parse_domain_start, + parse_domain, + parse_ipv6, + parse_port_password, + parse_port, + parse_suffix_slash, + parse_path, + parse_query, + parse_part + } st = parse_protocol; + + if (u != NULL) { + memset(u, 0, sizeof(*u)); + } + + while (p < last) { + t = *p; + + switch (st) { + case parse_protocol: + if (t == ':') { + st = parse_semicolon; + SET_U(u, UF_SCHEMA); + } + else if 
(!g_ascii_isalnum(t) && t != '+' && t != '-') { + if ((parse_flags & RSPAMD_URL_PARSE_CHECK) && p > c) { + /* We might have some domain, but no protocol */ + st = parse_domain_start; + p = c; + slash = c; + break; + } + else { + goto out; + } + } + p++; + break; + case parse_semicolon: + if (t == '/' || t == '\\') { + st = parse_slash; + p++; + } + else { + st = parse_slash_slash; + *(flags) |= RSPAMD_URL_FLAG_MISSINGSLASHES; + } + break; + case parse_slash: + if (t == '/' || t == '\\') { + st = parse_slash_slash; + } + else { + goto out; + } + p++; + break; + case parse_slash_slash: + + if (t != '/' && t != '\\') { + c = p; + slash = p; + st = parse_domain_start; + + /* + * Unfortunately, due to brain damage of the RFC 3986 authors, + * we have to distinguish two possibilities here: + * authority = [ userinfo "@" ] host [ ":" port ] + * So if we have @ somewhere before hostname then we must process + * with the username state. Otherwise, we have to process via + * the hostname state. Unfortunately, there is no way to distinguish + * them aside of running NFA or two DFA or performing lookahead. + * Lookahead approach looks easier to implement. 
+ */ + + const char *tp = p; + while (tp < last) { + if (*tp == '@') { + user_seen = TRUE; + st = parse_user; + break; + } + else if (*tp == '/' || *tp == '#' || *tp == '?') { + st = parse_domain_start; + break; + } + + tp++; + } + + if (st == parse_domain_start && *p == '[') { + st = parse_ipv6; + p++; + c = p; + } + } + else { + /* Skip multiple slashes */ + p++; + } + break; + case parse_ipv6: + if (t == ']') { + if (p - c == 0) { + goto out; + } + SET_U(u, UF_HOST); + p++; + + if (*p == ':') { + st = parse_port; + c = p + 1; + } + else if (*p == '/' || *p == '\\') { + st = parse_path; + c = p + 1; + } + else if (*p == '?') { + st = parse_query; + c = p + 1; + } + else if (*p == '#') { + st = parse_part; + c = p + 1; + } + else if (p != last) { + goto out; + } + } + else if (!g_ascii_isxdigit(t) && t != ':' && t != '.') { + goto out; + } + p++; + break; + case parse_user: + if (t == ':') { + if (p - c == 0) { + goto out; + } + user_start = c; + st = parse_password_start; + } + else if (t == '@') { + /* No password */ + if (p - c == 0) { + /* We have multiple at in fact */ + st = parse_multiple_at; + user_seen = TRUE; + *flags |= RSPAMD_URL_FLAG_OBSCURED; + + continue; + } + + SET_U(u, UF_USERINFO); + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else if (!g_ascii_isgraph(t)) { + goto out; + } + else if (p - c > max_email_user) { + goto out; + } + + p++; + break; + case parse_multiple_at: + if (t != '@') { + if (p - c == 0) { + goto out; + } + + /* For now, we ignore all that stuff as it is bogus */ + /* Off by one */ + p--; + SET_U(u, UF_USERINFO); + p++; + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else { + p++; + } + break; + case parse_password_start: + if (t == '@') { + /* Empty password */ + SET_U(u, UF_USERINFO); + if (u != NULL && u->field_data[UF_USERINFO].len > 0) { + /* Eat semicolon */ + u->field_data[UF_USERINFO].len--; + } + *flags |= RSPAMD_URL_FLAG_HAS_USER; + st = parse_at; + } + else { + c = p; + password_start = p; 
+ st = parse_password; + } + p++; + break; + case parse_password: + if (t == '@') { + /* XXX: password is not stored */ + if (u != NULL) { + if (u->field_data[UF_USERINFO].len == 0 && password_start && user_start && password_start > user_start + 1) { + *flags |= RSPAMD_URL_FLAG_HAS_USER; + u->field_set |= 1u << (UF_USERINFO); + u->field_data[UF_USERINFO].len = + password_start - user_start - 1; + u->field_data[UF_USERINFO].off = + user_start - str; + } + } + st = parse_at; + } + else if (!g_ascii_isgraph(t)) { + goto out; + } + else if (p - c > max_domain_length) { + goto out; + } + p++; + break; + case parse_at: + c = p; + + if (t == '@') { + *flags |= RSPAMD_URL_FLAG_OBSCURED; + p++; + } + else if (t == '[') { + st = parse_ipv6; + p++; + c = p; + } + else { + st = parse_domain_start; + } + break; + case parse_domain_start: + if (is_domain_start(t)) { + st = parse_domain; + } + else { + goto out; + } + break; + case parse_domain: + if (p - c > max_domain_length) { + /* Too large domain */ + goto out; + } + if (t == '/' || t == '\\' || t == ':' || t == '?' || t == '#') { + if (p - c == 0) { + goto out; + } + if (t == '/' || t == '\\') { + SET_U(u, UF_HOST); + st = parse_suffix_slash; + } + else if (t == '?') { + SET_U(u, UF_HOST); + st = parse_query; + c = p + 1; + } + else if (t == '#') { + SET_U(u, UF_HOST); + st = parse_part; + c = p + 1; + } + else if (t == ':' && !user_seen) { + /* + * Here we can have both port and password, hence we need + * to apply some heuristic here + */ + st = parse_port_password; + } + else { + /* + * We can go only for parsing port here + */ + SET_U(u, UF_HOST); + st = parse_port; + c = p + 1; + } + p++; + } + else { + if (is_url_end(t) || is_url_start(t)) { + goto set; + } + else if (*p == '@' && !user_seen) { + /* We need to fallback and test user */ + p = slash; + user_seen = TRUE; + st = parse_user; + } + else if (*p != '.' 
&& *p != '-' && *p != '_' && *p != '%') { + if (*p & 0x80) { + guint i = 0; + + U8_NEXT(((const guchar *) p), i, last - p, uc); + + if (uc < 0) { + /* Bad utf8 */ + goto out; + } + + if (!u_isalnum(uc)) { + /* Bad symbol */ + if (IS_ZERO_WIDTH_SPACE(uc)) { + (*flags) |= RSPAMD_URL_FLAG_ZW_SPACES; + } + else { + if (!u_isgraph(uc)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; + } + } + } + } + else { + (*flags) |= RSPAMD_URL_FLAG_IDN; + } + + p = p + i; + } + else if (is_urlsafe(*p)) { + p++; + } + else { + if (parse_flags & RSPAMD_URL_PARSE_HREF) { + /* We have to use all shit we are given here */ + p++; + (*flags) |= RSPAMD_URL_FLAG_OBSCURED; + } + else { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + goto out; + } + else { + goto set; + } + } + } + } + else { + p++; + } + } + break; + case parse_port_password: + if (g_ascii_isdigit(t)) { + const gchar *tmp = p; + + while (tmp < last) { + if (!g_ascii_isdigit(*tmp)) { + if (*tmp == '/' || *tmp == '#' || *tmp == '?' 
|| + is_url_end(*tmp) || g_ascii_isspace(*tmp)) { + /* Port + something */ + st = parse_port; + c = slash; + p--; + SET_U(u, UF_HOST); + p++; + c = p; + break; + } + else { + /* Not a port, bad character at the end */ + break; + } + } + tmp++; + } + + if (tmp == last) { + /* Host + port only */ + st = parse_port; + c = slash; + p--; + SET_U(u, UF_HOST); + p++; + c = p; + } + + if (st != parse_port) { + /* Fallback to user:password */ + p = slash; + c = slash; + user_seen = TRUE; + st = parse_user; + } + } + else { + /* Rewind back */ + p = slash; + c = slash; + user_seen = TRUE; + st = parse_user; + } + break; + case parse_port: + if (t == '/' || t == '\\') { + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + st = parse_suffix_slash; + } + else if (t == '?') { + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + + c = p + 1; + st = parse_query; + } + else if (t == '#') { + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + *flags |= RSPAMD_URL_FLAG_HAS_PORT; + } + + c = p + 1; + st = parse_part; + } + else if (is_url_end(t)) { + goto set; + } + else if (!g_ascii_isdigit(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK) || + !g_ascii_isspace(t)) { + goto out; + } + else { + goto set; + } + } + p++; + break; + case parse_suffix_slash: + if (t != '/' && t != '\\') { + c = p; + st = parse_path; + } + else { + /* Skip extra slashes */ + p++; + } + break; + case parse_path: + if (t == '?') { + if (p - c != 0) { + SET_U(u, UF_PATH); + } + c = p + 1; + st = parse_query; + } + else if (t == '#') { + /* No query, just fragment */ + if (p - c != 0) { + SET_U(u, UF_PATH); + } + c = p + 1; + st = parse_part; + } + else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + goto set; + } + else if 
(is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + goto set; + } + goto out; + } + else { + goto set; + } + } + p++; + break; + case parse_query: + if (t == '#') { + if (p - c != 0) { + SET_U(u, UF_QUERY); + } + c = p + 1; + st = parse_part; + } + else if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + goto set; + } + else if (is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + goto set; + } + goto out; + } + else { + goto set; + } + } + p++; + break; + case parse_part: + if (!(parse_flags & RSPAMD_URL_PARSE_HREF) && is_url_end(t)) { + goto set; + } + else if (is_lwsp(t)) { + if (!(parse_flags & RSPAMD_URL_PARSE_CHECK)) { + if (g_ascii_isspace(t)) { + goto set; + } + goto out; + } + else { + goto set; + } + } + p++; + break; + } + } + +set: + /* Parse remaining */ + switch (st) { + case parse_domain: + if (p - c == 0 || !is_domain(*(p - 1)) || !is_domain(*c)) { + goto out; + } + SET_U(u, UF_HOST); + ret = 0; + + break; + case parse_port: + pt = strtoul(c, NULL, 10); + if (pt == 0 || pt > 65535) { + goto out; + } + if (u != NULL) { + u->port = pt; + } + + ret = 0; + break; + case parse_suffix_slash: + /* Url ends with '/' */ + ret = 0; + break; + case parse_path: + if (p - c > 0) { + SET_U(u, UF_PATH); + } + ret = 0; + break; + case parse_query: + if (p - c > 0) { + SET_U(u, UF_QUERY); + } + ret = 0; + break; + case parse_part: + if (p - c > 0) { + SET_U(u, UF_FRAGMENT); + } + ret = 0; + break; + case parse_ipv6: + if (t != ']') { + ret = 1; + } + else { + /* e.g. 
http://[::] */ + ret = 0; + } + break; + default: + /* Error state */ + ret = 1; + break; + } +out: + if (end != NULL) { + *end = p; + } + + return ret; +} + +#undef SET_U + +static gint +rspamd_tld_trie_callback(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct url_matcher *matcher; + const gchar *start, *pos, *p; + struct rspamd_url *url = context; + gint ndots; + + matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher, + strnum); + ndots = 1; + + if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) { + /* Skip one more tld component */ + ndots++; + } + + pos = text + match_start; + p = pos - 1; + start = rspamd_url_host_unsafe(url); + + if (*pos != '.' || match_pos != (gint) url->hostlen) { + /* Something weird has been found */ + if (match_pos == (gint) url->hostlen - 1) { + pos = rspamd_url_host_unsafe(url) + match_pos; + if (*pos == '.') { + /* This is dot at the end of domain */ + url->hostlen--; + } + else { + return 0; + } + } + else { + return 0; + } + } + + /* Now we need to find top level domain */ + pos = start; + while (p >= start && ndots > 0) { + if (*p == '.') { + ndots--; + pos = p + 1; + } + else { + pos = p; + } + + p--; + } + + if ((ndots == 0 || p == start - 1) && + url->tldlen < rspamd_url_host_unsafe(url) + url->hostlen - pos) { + url->tldshift = (pos - url->string); + url->tldlen = rspamd_url_host_unsafe(url) + url->hostlen - pos; + } + + return 0; +} + +static void +rspamd_url_regen_from_inet_addr(struct rspamd_url *uri, const void *addr, int af, + rspamd_mempool_t *pool) +{ + gchar *strbuf, *p; + const gchar *start_offset; + gsize slen = uri->urllen - uri->hostlen; + goffset r = 0; + + if (af == AF_INET) { + slen += INET_ADDRSTRLEN; + } + else { + slen += INET6_ADDRSTRLEN; + } + + if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT) { + slen += sizeof("65535") - 1; + } + + /* Allocate new string to build it from IP */ + strbuf = 
rspamd_mempool_alloc(pool, slen + 1); + r += rspamd_snprintf(strbuf + r, slen - r, "%*s", + (gint) (uri->hostshift), + uri->string); + + uri->hostshift = r; + uri->tldshift = r; + start_offset = strbuf + r; + inet_ntop(af, addr, strbuf + r, slen - r + 1); + uri->hostlen = strlen(start_offset); + r += uri->hostlen; + uri->tldlen = uri->hostlen; + uri->flags |= RSPAMD_URL_FLAG_NUMERIC; + + /* Reconstruct URL */ + if (uri->flags & RSPAMD_URL_FLAG_HAS_PORT && uri->ext) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, ":%ud", + (unsigned int) uri->ext->port); + } + if (uri->datalen > 0) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, "/%*s", + (gint) uri->datalen, + rspamd_url_data_unsafe(uri)); + uri->datashift = start_offset - strbuf; + } + else { + /* Add trailing slash if needed */ + if (uri->hostlen + uri->hostshift < uri->urllen && + *(rspamd_url_host_unsafe(uri) + uri->hostlen) == '/') { + r += rspamd_snprintf(strbuf + r, slen - r, "/"); + } + } + + if (uri->querylen > 0) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, "?%*s", + (gint) uri->querylen, + rspamd_url_query_unsafe(uri)); + uri->queryshift = start_offset - strbuf; + } + if (uri->fragmentlen > 0) { + p = strbuf + r; + start_offset = p + 1; + r += rspamd_snprintf(strbuf + r, slen - r, "#%*s", + (gint) uri->fragmentlen, + rspamd_url_fragment_unsafe(uri)); + uri->fragmentshift = start_offset - strbuf; + } + + uri->string = strbuf; + uri->urllen = r; +} + +static gboolean +rspamd_url_maybe_regenerate_from_ip(struct rspamd_url *uri, rspamd_mempool_t *pool) +{ + const gchar *p, *end, *c; + gchar *errstr; + struct in_addr in4; + struct in6_addr in6; + gboolean ret = FALSE, check_num = TRUE; + guint32 n, dots, t = 0, i = 0, shift, nshift; + + p = rspamd_url_host_unsafe(uri); + end = p + uri->hostlen; + + if (*p == '[' && *(end - 1) == ']') { + p++; + end--; + } + + while (*(end - 1) == '.' 
&& end > p) { + end--; + } + + if (end - p == 0 || end - p > INET6_ADDRSTRLEN) { + return FALSE; + } + + if (rspamd_str_has_8bit(p, end - p)) { + return FALSE; + } + + if (rspamd_parse_inet_address_ip4(p, end - p, &in4)) { + rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool); + ret = TRUE; + } + else if (rspamd_parse_inet_address_ip6(p, end - p, &in6)) { + rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool); + ret = TRUE; + } + else { + /* Heuristics for broken urls */ + gchar buf[INET6_ADDRSTRLEN + 1]; + /* Try also numeric notation */ + c = p; + n = 0; + dots = 0; + shift = 0; + + while (p <= end && check_num) { + if (shift < 32 && + ((*p == '.' && dots < 3) || (p == end && dots <= 3))) { + if (p - c + 1 >= (gint) sizeof(buf)) { + msg_debug_pool("invalid numeric url %*.s...: too long", + INET6_ADDRSTRLEN, c); + return FALSE; + } + + rspamd_strlcpy(buf, c, p - c + 1); + c = p + 1; + + if (p < end && *p == '.') { + dots++; + } + + glong long_n = strtol(buf, &errstr, 0); + + if ((errstr == NULL || *errstr == '\0') && long_n >= 0) { + + t = long_n; /* Truncate as windows does */ + /* + * Even if we have zero, we need to shift by 1 octet + */ + nshift = (t == 0 ? 
shift + 8 : shift); + + /* + * Here we count number of octets encoded in this element + */ + for (i = 0; i < 4; i++) { + if ((t >> (8 * i)) > 0) { + nshift += 8; + } + else { + break; + } + } + /* + * Here we need to find the proper shift of the previous + * components, so we check possible cases: + * 1) 1 octet - just use it applying shift + * 2) 2 octets - convert to big endian 16 bit number + * 3) 3 octets - convert to big endian 24 bit number + * 4) 4 octets - convert to big endian 32 bit number + */ + switch (i) { + case 4: + t = GUINT32_TO_BE(t); + break; + case 3: + t = (GUINT32_TO_BE(t & 0xFFFFFFU)) >> 8; + break; + case 2: + t = GUINT16_TO_BE(t & 0xFFFFU); + break; + default: + t = t & 0xFF; + break; + } + + if (p != end) { + n |= t << shift; + + shift = nshift; + } + } + else { + check_num = FALSE; + } + } + + p++; + } + + /* The last component should be last according to url normalization: + * 192.168.1 -> 192.168.0.1 + * 192 -> 0.0.0.192 + * 192.168 -> 192.0.0.168 + */ + shift = 8 * (4 - i); + + if (shift < 32) { + n |= t << shift; + } + + if (check_num) { + if (dots <= 4) { + memcpy(&in4, &n, sizeof(in4)); + rspamd_url_regen_from_inet_addr(uri, &in4, AF_INET, pool); + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + ret = TRUE; + } + else if (end - c > (gint) sizeof(buf) - 1) { + rspamd_strlcpy(buf, c, end - c + 1); + + if (inet_pton(AF_INET6, buf, &in6) == 1) { + rspamd_url_regen_from_inet_addr(uri, &in6, AF_INET6, pool); + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + ret = TRUE; + } + } + } + } + + return ret; +} + +static void +rspamd_url_shift(struct rspamd_url *uri, gsize nlen, + enum http_parser_url_fields field) +{ + guint old_shift, shift = 0; + gint remain; + + /* Shift remaining data */ + switch (field) { + case UF_SCHEMA: + if (nlen >= uri->protocollen) { + return; + } + else { + shift = uri->protocollen - nlen; + } + + old_shift = uri->protocollen; + uri->protocollen -= shift; + remain = uri->urllen - uri->protocollen; + g_assert(remain >= 0); + 
memmove(uri->string + uri->protocollen, uri->string + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_SCHEMAENCODED; + break; + case UF_HOST: + if (nlen >= uri->hostlen) { + return; + } + else { + shift = uri->hostlen - nlen; + } + + old_shift = uri->hostlen; + uri->hostlen -= shift; + remain = (uri->urllen - (uri->hostshift)) - old_shift; + g_assert(remain >= 0); + memmove(rspamd_url_host_unsafe(uri) + uri->hostlen, + rspamd_url_host_unsafe(uri) + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_HOSTENCODED; + break; + case UF_PATH: + if (nlen >= uri->datalen) { + return; + } + else { + shift = uri->datalen - nlen; + } + + old_shift = uri->datalen; + uri->datalen -= shift; + remain = (uri->urllen - (uri->datashift)) - old_shift; + g_assert(remain >= 0); + memmove(rspamd_url_data_unsafe(uri) + uri->datalen, + rspamd_url_data_unsafe(uri) + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_PATHENCODED; + break; + case UF_QUERY: + if (nlen >= uri->querylen) { + return; + } + else { + shift = uri->querylen - nlen; + } + + old_shift = uri->querylen; + uri->querylen -= shift; + remain = (uri->urllen - (uri->queryshift)) - old_shift; + g_assert(remain >= 0); + memmove(rspamd_url_query_unsafe(uri) + uri->querylen, + rspamd_url_query_unsafe(uri) + old_shift, + remain); + uri->urllen -= shift; + uri->flags |= RSPAMD_URL_FLAG_QUERYENCODED; + break; + case UF_FRAGMENT: + if (nlen >= uri->fragmentlen) { + return; + } + else { + shift = uri->fragmentlen - nlen; + } + + uri->fragmentlen -= shift; + uri->urllen -= shift; + break; + default: + break; + } + + /* Now adjust lengths and offsets */ + switch (field) { + case UF_SCHEMA: + if (uri->userlen > 0) { + uri->usershift -= shift; + } + if (uri->hostlen > 0) { + uri->hostshift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case UF_HOST: + if (uri->datalen > 0) { + uri->datashift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case 
UF_PATH: + if (uri->querylen > 0) { + uri->queryshift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case UF_QUERY: + if (uri->fragmentlen > 0) { + uri->fragmentshift -= shift; + } + /* Go forward */ + /* FALLTHRU */ + case UF_FRAGMENT: + default: + break; + } +} + +static void +rspamd_telephone_normalise_inplace(struct rspamd_url *uri) +{ + gchar *t, *h, *end; + gint i = 0, w, orig_len; + UChar32 uc; + + t = rspamd_url_host_unsafe(uri); + h = t; + end = t + uri->hostlen; + orig_len = uri->hostlen; + + if (*h == '+') { + h++; + t++; + } + + while (h < end) { + i = 0; + U8_NEXT(h, i, end - h, uc); + + if (u_isdigit(uc)) { + w = 0; + U8_APPEND_UNSAFE(t, w, uc); + t += w; + } + + h += i; + } + + uri->hostlen = t - rspamd_url_host_unsafe(uri); + uri->urllen -= (orig_len - uri->hostlen); +} + +static inline bool +is_idna_label_dot(UChar ch) +{ + switch (ch) { + case 0x3002: + case 0xFF0E: + case 0xFF61: + return true; + default: + return false; + } +} + +/* + * All credits for this investigation should go to + * Dr. Hajime Shimada and Mr. Shirakura as they have revealed this case in their + * research. + */ + +/* + * This function replaces unsafe IDNA dots in host labels. Unfortunately, + * IDNA extends dot definition from '.' to multiple other characters that + * should be treated equally. + * This function replaces such dots and returns `true` if these dots are found. + * In this case, it should be treated as obfuscation attempt. 
+ */ +static bool +rspamd_url_remove_dots(struct rspamd_url *uri) +{ + const gchar *hstart = rspamd_url_host_unsafe(uri); + gchar *t; + UChar32 uc; + gint i = 0, hlen; + bool ret = false; + + if (uri->hostlen == 0) { + return false; + } + + hlen = uri->hostlen; + t = rspamd_url_host_unsafe(uri); + + while (i < hlen) { + gint prev_i = i; + U8_NEXT(hstart, i, hlen, uc); + + if (is_idna_label_dot(uc)) { + *t++ = '.'; + ret = true; + } + else { + if (ret) { + /* We have to shift the remaining stuff */ + while (prev_i < i) { + *t++ = *(hstart + prev_i); + prev_i++; + } + } + else { + t += (i - prev_i); + } + } + } + + if (ret) { + rspamd_url_shift(uri, t - hstart, UF_HOST); + } + + return ret; +} + +enum uri_errno +rspamd_url_parse(struct rspamd_url *uri, + gchar *uristring, gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags parse_flags) +{ + struct http_parser_url u; + gchar *p; + const gchar *end; + guint complen, ret, flags = 0; + gsize unquoted_len = 0; + + memset(uri, 0, sizeof(*uri)); + memset(&u, 0, sizeof(u)); + uri->count = 1; + /* Undefine order */ + uri->order = -1; + uri->part_order = -1; + + if (*uristring == '\0') { + return URI_ERRNO_EMPTY; + } + + if (len >= G_MAXUINT16 / 2) { + flags |= RSPAMD_URL_FLAG_TRUNCATED; + len = G_MAXUINT16 / 2; + } + + p = uristring; + uri->protocol = PROTOCOL_UNKNOWN; + + if (len > sizeof("mailto:") - 1) { + /* For mailto: urls we also need to add slashes to make it a valid URL */ + if (g_ascii_strncasecmp(p, "mailto:", sizeof("mailto:") - 1) == 0) { + ret = rspamd_mailto_parse(&u, uristring, len, &end, parse_flags, + &flags); + } + else if (g_ascii_strncasecmp(p, "tel:", sizeof("tel:") - 1) == 0 || + g_ascii_strncasecmp(p, "callto:", sizeof("callto:") - 1) == 0) { + ret = rspamd_telephone_parse(&u, uristring, len, &end, parse_flags, + &flags); + uri->protocol = PROTOCOL_TELEPHONE; + } + else { + ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, + &flags); + } + } + else { + ret = 
rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags); + } + + if (ret != 0) { + return URI_ERRNO_BAD_FORMAT; + } + + if (end > uristring && (guint) (end - uristring) != len) { + len = end - uristring; + } + + uri->raw = p; + uri->rawlen = len; + + if (flags & RSPAMD_URL_FLAG_MISSINGSLASHES) { + len += 2; + uri->string = rspamd_mempool_alloc(pool, len + 1); + memcpy(uri->string, p, u.field_data[UF_SCHEMA].len); + memcpy(uri->string + u.field_data[UF_SCHEMA].len, "://", 3); + rspamd_strlcpy(uri->string + u.field_data[UF_SCHEMA].len + 3, + p + u.field_data[UF_SCHEMA].len + 1, + len - 2 - u.field_data[UF_SCHEMA].len); + /* Compensate slashes added */ + for (int i = UF_SCHEMA + 1; i < UF_MAX; i++) { + if (u.field_set & (1 << i)) { + u.field_data[i].off += 2; + } + } + } + else { + uri->string = rspamd_mempool_alloc(pool, len + 1); + rspamd_strlcpy(uri->string, p, len + 1); + } + + uri->urllen = len; + uri->flags = flags; + + for (guint i = 0; i < UF_MAX; i++) { + if (u.field_set & (1 << i)) { + guint shift = u.field_data[i].off; + complen = u.field_data[i].len; + + if (complen >= G_MAXUINT16) { + /* Too large component length */ + return URI_ERRNO_BAD_FORMAT; + } + + switch (i) { + case UF_SCHEMA: + uri->protocollen = u.field_data[i].len; + break; + case UF_HOST: + uri->hostshift = shift; + uri->hostlen = complen; + break; + case UF_PATH: + uri->datashift = shift; + uri->datalen = complen; + break; + case UF_QUERY: + uri->queryshift = shift; + uri->querylen = complen; + break; + case UF_FRAGMENT: + uri->fragmentshift = shift; + uri->fragmentlen = complen; + break; + case UF_USERINFO: + uri->usershift = shift; + uri->userlen = complen; + break; + default: + break; + } + } + } + + /* Port is 'special' in case of url_parser as it is not a part of UF_* macro logic */ + if (u.port != 0) { + if (!uri->ext) { + uri->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext); + } + uri->flags |= RSPAMD_URL_FLAG_HAS_PORT; + uri->ext->port = u.port; + } + + if 
(!uri->hostlen) { + return URI_ERRNO_HOST_MISSING; + } + + /* Now decode url symbols */ + unquoted_len = rspamd_url_decode(uri->string, + uri->string, + uri->protocollen); + rspamd_url_shift(uri, unquoted_len, UF_SCHEMA); + unquoted_len = rspamd_url_decode(rspamd_url_host_unsafe(uri), + rspamd_url_host_unsafe(uri), uri->hostlen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_host_unsafe(uri), + &unquoted_len, uri->flags); + + rspamd_url_shift(uri, unquoted_len, UF_HOST); + + if (rspamd_url_remove_dots(uri)) { + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + + if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | PROTOCOL_FILE)) { + /* Ensure that hostname starts with something sane (exclude numeric urls) */ + const gchar *host = rspamd_url_host_unsafe(uri); + + if (!(is_domain_start(host[0]) || host[0] == ':')) { + return URI_ERRNO_BAD_FORMAT; + } + } + + /* Apply nameprep algorithm */ + static UStringPrepProfile *nameprep = NULL; + UErrorCode uc_err = U_ZERO_ERROR; + + if (nameprep == NULL) { + /* Open and cache profile */ + nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, &uc_err); + + g_assert(U_SUCCESS(uc_err)); + } + + UChar *utf16_hostname, *norm_utf16; + gint32 utf16_len, norm_utf16_len, norm_utf8_len; + UParseError parse_error; + + utf16_hostname = rspamd_mempool_alloc(pool, uri->hostlen * sizeof(UChar)); + struct UConverter *utf8_conv = rspamd_get_utf8_converter(); + + utf16_len = ucnv_toUChars(utf8_conv, utf16_hostname, uri->hostlen, + rspamd_url_host_unsafe(uri), uri->hostlen, &uc_err); + + if (!U_SUCCESS(uc_err)) { + + return URI_ERRNO_BAD_FORMAT; + } + + norm_utf16 = rspamd_mempool_alloc(pool, utf16_len * sizeof(UChar)); + norm_utf16_len = usprep_prepare(nameprep, utf16_hostname, utf16_len, + norm_utf16, utf16_len, USPREP_DEFAULT, &parse_error, &uc_err); + + if (!U_SUCCESS(uc_err)) { + + return URI_ERRNO_BAD_FORMAT; + } + + /* Convert back to utf8, sigh... 
*/ + norm_utf8_len = ucnv_fromUChars(utf8_conv, + rspamd_url_host_unsafe(uri), uri->hostlen, + norm_utf16, norm_utf16_len, &uc_err); + + if (!U_SUCCESS(uc_err)) { + + return URI_ERRNO_BAD_FORMAT; + } + + /* Final shift of lengths */ + rspamd_url_shift(uri, norm_utf8_len, UF_HOST); + + /* Process data part */ + if (uri->datalen) { + unquoted_len = rspamd_url_decode(rspamd_url_data_unsafe(uri), + rspamd_url_data_unsafe(uri), uri->datalen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_data_unsafe(uri), + &unquoted_len, uri->flags); + + rspamd_url_shift(uri, unquoted_len, UF_PATH); + /* We now normalize path */ + rspamd_normalize_path_inplace(rspamd_url_data_unsafe(uri), + uri->datalen, &unquoted_len); + rspamd_url_shift(uri, unquoted_len, UF_PATH); + } + + if (uri->querylen) { + unquoted_len = rspamd_url_decode(rspamd_url_query_unsafe(uri), + rspamd_url_query_unsafe(uri), + uri->querylen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_query_unsafe(uri), + &unquoted_len, uri->flags); + rspamd_url_shift(uri, unquoted_len, UF_QUERY); + } + + if (uri->fragmentlen) { + unquoted_len = rspamd_url_decode(rspamd_url_fragment_unsafe(uri), + rspamd_url_fragment_unsafe(uri), + uri->fragmentlen); + + rspamd_url_normalise_propagate_flags(pool, rspamd_url_fragment_unsafe(uri), + &unquoted_len, uri->flags); + rspamd_url_shift(uri, unquoted_len, UF_FRAGMENT); + } + + rspamd_str_lc(uri->string, uri->protocollen); + unquoted_len = rspamd_str_lc_utf8(rspamd_url_host_unsafe(uri), uri->hostlen); + rspamd_url_shift(uri, unquoted_len, UF_HOST); + + if (uri->protocol == PROTOCOL_UNKNOWN) { + for (int i = 0; i < G_N_ELEMENTS(rspamd_url_protocols); i++) { + if (uri->protocollen == rspamd_url_protocols[i].len) { + if (memcmp(uri->string, + rspamd_url_protocols[i].name, uri->protocollen) == 0) { + uri->protocol = rspamd_url_protocols[i].proto; + break; + } + } + } + } + + if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_MAILTO | PROTOCOL_FTP | 
PROTOCOL_FILE)) { + /* Find TLD part */ + if (url_scanner->search_trie_full) { + rspamd_multipattern_lookup(url_scanner->search_trie_full, + rspamd_url_host_unsafe(uri), uri->hostlen, + rspamd_tld_trie_callback, uri, NULL); + } + + if (uri->tldlen == 0) { + /* + * If we have not detected eSLD, but there are no dots in the hostname, + * then we should treat the whole hostname as eSLD - a rule of thumb + * + * We also check that a hostname ends with a permitted character, and all characters are forming + * DNS label. We also need to check for a numeric IP within this check. + */ + const char *dot_pos = memchr(rspamd_url_host_unsafe(uri), '.', uri->hostlen); + bool is_whole_hostname_tld = false; + + if (uri->hostlen > 0 && (dot_pos == NULL || dot_pos == rspamd_url_host_unsafe(uri) + uri->hostlen - 1)) { + bool all_chars_domain = true; + + for (int i = 0; i < uri->hostlen; i++) { + if (!is_domain(rspamd_url_host_unsafe(uri)[i])) { + all_chars_domain = false; + break; + } + } + + char last_c = rspamd_url_host_unsafe(uri)[uri->hostlen - 1]; + + if (all_chars_domain) { + /* Also check the last character to be either a dot or alphanumeric character */ + if (last_c != '.' && !g_ascii_isalnum(last_c)) { + all_chars_domain = false; + } + } + + if (all_chars_domain) { + /* Additionally check for a numeric IP as we can have some number here... */ + rspamd_url_maybe_regenerate_from_ip(uri, pool); + + if (last_c == '.' 
&& uri->hostlen > 1) { + /* Skip the last dot */ + uri->tldlen = uri->hostlen - 1; + } + else { + uri->tldlen = uri->hostlen; + } + + uri->tldshift = uri->hostshift; + is_whole_hostname_tld = true; + } + } + + if (!is_whole_hostname_tld) { + if (uri->protocol != PROTOCOL_MAILTO) { + if (url_scanner->has_tld_file && !(parse_flags & RSPAMD_URL_PARSE_HREF)) { + /* Ignore URL's without TLD if it is not a numeric URL */ + if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) { + return URI_ERRNO_TLD_MISSING; + } + } + else { + if (!rspamd_url_maybe_regenerate_from_ip(uri, pool)) { + /* Assume tld equal to host */ + uri->tldshift = uri->hostshift; + uri->tldlen = uri->hostlen; + } + else if (uri->flags & RSPAMD_URL_FLAG_SCHEMALESS) { + /* Ignore urls with both no schema and no tld */ + return URI_ERRNO_TLD_MISSING; + } + + uri->flags |= RSPAMD_URL_FLAG_NO_TLD; + } + } + else { + /* Ignore IP like domains for mailto, as it is really never supported */ + return URI_ERRNO_TLD_MISSING; + } + } + } + + /* Replace stupid '\' with '/' after schema */ + if (uri->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP) && + uri->protocollen > 0 && uri->urllen > uri->protocollen + 2) { + + gchar *pos = &uri->string[uri->protocollen], + *host_start = rspamd_url_host_unsafe(uri); + + while (pos < host_start) { + if (*pos == '\\') { + *pos = '/'; + uri->flags |= RSPAMD_URL_FLAG_OBSCURED; + } + pos++; + } + } + } + else if (uri->protocol & PROTOCOL_TELEPHONE) { + /* We need to normalise phone number: remove all spaces and braces */ + rspamd_telephone_normalise_inplace(uri); + + if (rspamd_url_host_unsafe(uri)[0] == '+') { + uri->tldshift = uri->hostshift + 1; + uri->tldlen = uri->hostlen - 1; + } + else { + uri->tldshift = uri->hostshift; + uri->tldlen = uri->hostlen; + } + } + + if (uri->protocol == PROTOCOL_UNKNOWN) { + if (!(parse_flags & RSPAMD_URL_PARSE_HREF)) { + return URI_ERRNO_INVALID_PROTOCOL; + } + else { + /* Hack, hack, hack */ + uri->protocol = PROTOCOL_UNKNOWN; + } + } 
+ + return URI_ERRNO_OK; +} + +struct tld_trie_cbdata { + const gchar *begin; + gsize len; + rspamd_ftok_t *out; +}; + +static gint +rspamd_tld_trie_find_callback(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct url_matcher *matcher; + const gchar *start, *pos, *p; + struct tld_trie_cbdata *cbdata = context; + gint ndots = 1; + + matcher = &g_array_index(url_scanner->matchers_full, struct url_matcher, + strnum); + + if (matcher->flags & URL_MATCHER_FLAG_STAR_MATCH) { + /* Skip one more tld component */ + ndots = 2; + } + + pos = text + match_start; + p = pos - 1; + start = text; + + if (*pos != '.' || match_pos != (gint) cbdata->len) { + /* Something weird has been found */ + if (match_pos != (gint) cbdata->len - 1) { + /* Search more */ + return 0; + } + } + + /* Now we need to find top level domain */ + pos = start; + + while (p >= start && ndots > 0) { + if (*p == '.') { + ndots--; + pos = p + 1; + } + else { + pos = p; + } + + p--; + } + + if (ndots == 0 || p == start - 1) { + if (cbdata->begin + cbdata->len - pos > cbdata->out->len) { + cbdata->out->begin = pos; + cbdata->out->len = cbdata->begin + cbdata->len - pos; + } + } + + return 0; +} + +gboolean +rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out) +{ + struct tld_trie_cbdata cbdata; + + g_assert(in != NULL); + g_assert(out != NULL); + g_assert(url_scanner != NULL); + + cbdata.begin = in; + cbdata.len = inlen; + cbdata.out = out; + out->len = 0; + + if (url_scanner->search_trie_full) { + rspamd_multipattern_lookup(url_scanner->search_trie_full, in, inlen, + rspamd_tld_trie_find_callback, &cbdata, NULL); + } + + if (out->len > 0) { + return TRUE; + } + + return FALSE; +} + +static const gchar url_braces[] = { + '(', ')', + '{', '}', + '[', ']', + '<', '>', + '|', '|', + '\'', '\''}; + + +static gboolean +url_file_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t 
*match) +{ + match->m_begin = pos; + + if (pos > cb->begin) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + + return TRUE; +} + +static gboolean +url_file_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *p; + gchar stop; + guint i; + + p = pos + strlen(match->pattern); + stop = *p; + if (*p == '/') { + p++; + } + + for (i = 0; i < G_N_ELEMENTS(url_braces) / 2; i += 2) { + if (*p == url_braces[i]) { + stop = url_braces[i + 1]; + break; + } + } + + while (p < cb->end && *p != stop && is_urlsafe(*p)) { + p++; + } + + if (p == cb->begin) { + return FALSE; + } + match->m_len = p - match->m_begin; + + return TRUE; +} + +static gboolean +url_tld_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *p = pos; + guint processed = 0; + static const guint max_shift = 253 + sizeof("https://"); + + /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */ + while (p >= cb->begin) { + if (!is_domain(*p) || g_ascii_isspace(*p) || is_url_start(*p) || + p == match->prev_newline_pos) { + if (!is_url_start(*p) && !g_ascii_isspace(*p) && + p != match->prev_newline_pos) { + return FALSE; + } + + if (p != match->prev_newline_pos) { + match->st = *p; + + p++; + } + else { + match->st = '\n'; + } + + if (!g_ascii_isalnum(*p)) { + /* Urls cannot start with strange symbols */ + return FALSE; + } + + match->m_begin = p; + return TRUE; + } + else if (p == cb->begin && p != pos) { + match->st = '\0'; + match->m_begin = p; + + return TRUE; + } + else if (*p == '.') { + if (p == cb->begin) { + /* Urls cannot start with a dot */ + return FALSE; + } + if (!g_ascii_isalnum(p[1])) { + /* Wrong we have an invalid character after dot */ + return FALSE; + } + } + else if (*p == '/') { + /* Urls cannot contain '/' in their body */ + return FALSE; + } + + p--; + processed++; + + if (processed > max_shift) { + /* Too long */ + return FALSE; + } + } + + return 
FALSE; +} + +static gboolean +url_tld_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *p; + gboolean ret = FALSE; + + p = pos + match->m_len; + + if (p == cb->end) { + match->m_len = p - match->m_begin; + return TRUE; + } + else if (*p == '/' || *p == ':' || is_url_end(*p) || is_lwsp(*p) || + (match->st != '<' && p == match->newline_pos)) { + /* Parse arguments, ports by normal way by url default function */ + p = match->m_begin; + /* Check common prefix */ + if (g_ascii_strncasecmp(p, "http://", sizeof("http://") - 1) == 0) { + ret = url_web_end(cb, + match->m_begin + sizeof("http://") - 1, + match); + } + else { + ret = url_web_end(cb, match->m_begin, match); + } + } + else if (*p == '.') { + p++; + if (p < cb->end) { + if (g_ascii_isspace(*p) || *p == '/' || + *p == '?' || *p == ':') { + ret = url_web_end(cb, match->m_begin, match); + } + } + } + + if (ret) { + /* Check sanity of match found */ + if (match->m_begin + match->m_len <= pos) { + return FALSE; + } + } + + return ret; +} + +static gboolean +url_web_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + /* Check what we have found */ + if (pos > cb->begin) { + if (g_ascii_strncasecmp(pos, "www", 3) == 0) { + + if (!(is_url_start(*(pos - 1)) || + g_ascii_isspace(*(pos - 1)) || + pos - 1 == match->prev_newline_pos || + (*(pos - 1) & 0x80))) { /* Chinese trick */ + return FALSE; + } + } + else { + guchar prev = *(pos - 1); + + if (g_ascii_isalnum(prev)) { + /* Part of another url */ + return FALSE; + } + } + } + + if (*pos == '.') { + /* Urls cannot start with . 
*/ + return FALSE; + } + + if (pos > cb->begin) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + + match->m_begin = pos; + + return TRUE; +} + +static gboolean +url_web_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *last = NULL; + gint len = cb->end - pos; + guint flags = 0; + + if (match->newline_pos && match->st != '<') { + /* We should also limit our match end to the newline */ + len = MIN(len, match->newline_pos - pos); + } + + if (rspamd_web_parse(NULL, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } + + if (last < cb->end && (*last == '>' && last != match->newline_pos)) { + /* We need to ensure that url also starts with '>' */ + if (match->st != '<') { + if (last + 1 < cb->end) { + if (g_ascii_isspace(last[1])) { + return FALSE; + } + } + else { + return FALSE; + } + } + } + + match->m_len = (last - pos); + cb->fin = last + 1; + + return TRUE; +} + + +static gboolean +url_email_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + if (!match->prefix || match->prefix[0] == '\0') { + /* We have mailto:// at the beginning */ + match->m_begin = pos; + + if (pos >= cb->begin + 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + } + else { + /* Just '@' */ + + /* Check if this match is a part of the previous mailto: email */ + if (cb->last_at != NULL && cb->last_at == pos) { + cb->last_at = NULL; + return FALSE; + } + else if (pos == cb->begin) { + /* Just @ at the start of input */ + return FALSE; + } + + match->st = '\0'; + } + + return TRUE; +} + +static gboolean +url_email_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *last = NULL; + struct http_parser_url u; + gint len = cb->end - pos; + guint flags = 0; + + if (match->newline_pos && match->st != '<') { + /* We should also limit our match end to the newline */ + len = MIN(len, match->newline_pos - pos); + } + + if 
(!match->prefix || match->prefix[0] == '\0') { + /* We have mailto:// at the beginning */ + if (rspamd_mailto_parse(&u, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } + + if (!(u.field_set & (1 << UF_USERINFO))) { + return FALSE; + } + + cb->last_at = match->m_begin + u.field_data[UF_USERINFO].off + + u.field_data[UF_USERINFO].len; + + g_assert(*cb->last_at == '@'); + match->m_len = (last - pos); + + return TRUE; + } + else { + const gchar *c, *p; + /* + * Here we have just '@', so we need to find both start and end of the + * pattern + */ + g_assert(*pos == '@'); + + if (pos >= cb->end - 2 || pos < cb->begin + 1) { + /* Boundary violation */ + return FALSE; + } + + /* Check the next character after `@` */ + if (!g_ascii_isalnum(pos[1]) || !g_ascii_isalnum(*(pos - 1))) { + return FALSE; + } + + + c = pos - 1; + while (c > cb->begin) { + if (!is_mailsafe(*c)) { + break; + } + if (c == match->prev_newline_pos) { + break; + } + + c--; + } + /* Rewind to the first alphanumeric character */ + while (c < pos && !g_ascii_isalnum(*c)) { + c++; + } + + /* Find the end of email */ + p = pos + 1; + while (p < cb->end && is_domain(*p)) { + if (p == match->newline_pos) { + break; + } + + p++; + } + + /* Rewind it again to avoid bad emails to be detected */ + while (p > pos && p < cb->end && !g_ascii_isalnum(*p)) { + p--; + } + + if (p < cb->end && g_ascii_isalnum(*p) && + (match->newline_pos == NULL || p < match->newline_pos)) { + p++; + } + + if (p > c) { + match->m_begin = c; + match->m_len = p - c; + return TRUE; + } + } + + return FALSE; +} + +static gboolean +url_tel_start(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + match->m_begin = pos; + + if (pos >= cb->begin + 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + + return TRUE; +} + +static gboolean +url_tel_end(struct url_callback_data *cb, + const gchar *pos, + url_match_t *match) +{ + const gchar *last = NULL; + struct http_parser_url 
u; + gint len = cb->end - pos; + guint flags = 0; + + if (match->newline_pos && match->st != '<') { + /* We should also limit our match end to the newline */ + len = MIN(len, match->newline_pos - pos); + } + + if (rspamd_telephone_parse(&u, pos, len, &last, + RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + return FALSE; + } + + if (!(u.field_set & (1 << UF_HOST))) { + return FALSE; + } + + match->m_len = (last - pos); + + return TRUE; +} + + +static gboolean +rspamd_url_trie_is_match(struct url_matcher *matcher, const gchar *pos, + const gchar *end, const gchar *newline_pos) +{ + if (matcher->flags & URL_MATCHER_FLAG_TLD_MATCH) { + /* Immediately check pos for valid chars */ + if (pos < end) { + if (pos != newline_pos && !g_ascii_isspace(*pos) && *pos != '/' && *pos != '?' && + *pos != ':' && !is_url_end(*pos)) { + if (*pos == '.') { + /* We allow . at the end of the domain however */ + pos++; + if (pos < end) { + if (!g_ascii_isspace(*pos) && *pos != '/' && + *pos != '?' && *pos != ':' && !is_url_end(*pos)) { + return FALSE; + } + } + } + else { + return FALSE; + } + } + } + } + + return TRUE; +} + +static gint +rspamd_url_trie_callback(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + struct url_matcher *matcher; + url_match_t m; + const gchar *pos, *newline_pos = NULL; + struct url_callback_data *cb = context; + + pos = text + match_pos; + + if (cb->fin > pos) { + /* Already seen */ + return 0; + } + + matcher = &g_array_index(cb->matchers, struct url_matcher, + strnum); + + if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) { + /* Do not try to match non-html like urls in html texts */ + return 0; + } + + memset(&m, 0, sizeof(m)); + m.m_begin = text + match_start; + m.m_len = match_pos - match_start; + + if (cb->newlines && cb->newlines->len > 0) { + newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); + + while (pos > newline_pos 
&& cb->newline_idx < cb->newlines->len) { + cb->newline_idx++; + newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx); + } + + if (pos > newline_pos) { + newline_pos = NULL; + } + + if (cb->newline_idx > 0) { + m.prev_newline_pos = g_ptr_array_index(cb->newlines, + cb->newline_idx - 1); + } + } + + if (!rspamd_url_trie_is_match(matcher, pos, cb->end, newline_pos)) { + return 0; + } + + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + m.add_prefix = FALSE; + m.newline_pos = newline_pos; + pos = cb->begin + match_start; + + if (matcher->start(cb, pos, &m) && + matcher->end(cb, pos, &m)) { + if (m.add_prefix || matcher->prefix[0] != '\0') { + cb->len = m.m_len + strlen(matcher->prefix); + cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1); + cb->len = rspamd_snprintf(cb->url_str, + cb->len + 1, + "%s%*s", + m.prefix, + (gint) m.m_len, + m.m_begin); + cb->prefix_added = TRUE; + } + else { + cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1); + rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1); + } + + cb->start = m.m_begin; + + if (pos > cb->fin) { + cb->fin = pos; + } + + return 1; + } + else { + cb->url_str = NULL; + } + + /* Continue search */ + return 0; +} + +gboolean +rspamd_url_find(rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added) +{ + struct url_callback_data cb; + gint ret; + + memset(&cb, 0, sizeof(cb)); + cb.begin = begin; + cb.end = begin + len; + cb.how = how; + cb.pool = pool; + + if (how == RSPAMD_URL_FIND_ALL) { + if (url_scanner->search_trie_full) { + cb.matchers = url_scanner->matchers_full; + ret = rspamd_multipattern_lookup(url_scanner->search_trie_full, + begin, len, + rspamd_url_trie_callback, &cb, NULL); + } + else { + cb.matchers = url_scanner->matchers_strict; + ret = rspamd_multipattern_lookup(url_scanner->search_trie_strict, + begin, len, + rspamd_url_trie_callback, &cb, NULL); + } + } + else { + 
/*
 * Shared multipattern callback behind rspamd_url_find_multiple() and
 * rspamd_url_find_single(): validates a trie hit, extracts the url text,
 * parses it and hands the parsed url to cb->func.
 *
 * @param mp multipattern that produced the hit (not used in the body)
 * @param strnum index of the matched pattern inside cb->matchers
 * @param match_start offset of the first matched byte in `text`
 * @param match_pos offset just past the matched pattern in `text`
 * @param text scanned buffer
 * @param len length of `text`
 * @param context struct url_callback_data for this search
 * @param multiple TRUE to keep searching after a successful extraction
 * @return 0 to continue the search; -1 when cb->func returned FALSE;
 *         otherwise `!multiple` (non-zero for the single-url mode)
 */
static gint
rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp,
										guint strnum,
										gint match_start,
										gint match_pos,
										const gchar *text,
										gsize len,
										void *context,
										gboolean multiple)
{
	struct rspamd_url *url;
	struct url_matcher *matcher;
	url_match_t m;
	const gchar *pos, *newline_pos = NULL;
	struct url_callback_data *cb = context;
	gint rc;
	rspamd_mempool_t *pool;

	pos = text + match_pos;

	if (cb->fin > pos) {
		/* Already seen */
		return 0;
	}

	matcher = &g_array_index(cb->matchers, struct url_matcher,
							 strnum);
	pool = cb->pool;

	if ((matcher->flags & URL_MATCHER_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
		/* Do not try to match non-html like urls in html texts, continue matching */
		return 0;
	}

	memset(&m, 0, sizeof(m));


	/* Find the next newline after our pos */
	if (cb->newlines && cb->newlines->len > 0) {
		newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx);

		/* newline_idx is kept in cb, so the scan resumes where the previous hit stopped */
		while (pos > newline_pos && cb->newline_idx < cb->newlines->len - 1) {
			cb->newline_idx++;
			newline_pos = g_ptr_array_index(cb->newlines, cb->newline_idx);
		}

		if (pos > newline_pos) {
			/* Even the last known newline is before the match end */
			newline_pos = NULL;
		}
		if (cb->newline_idx > 0) {
			m.prev_newline_pos = g_ptr_array_index(cb->newlines,
												   cb->newline_idx - 1);
		}
	}

	if (!rspamd_url_trie_is_match(matcher, pos, text + len, newline_pos)) {
		/* Mismatch, continue */
		return 0;
	}

	/* From here on `pos` is the start of the hit, not its end */
	pos = cb->begin + match_start;
	m.pattern = matcher->pattern;
	m.prefix = matcher->prefix;
	m.add_prefix = FALSE;
	m.m_begin = text + match_start;
	m.m_len = match_pos - match_start;
	m.newline_pos = newline_pos;

	/* The matcher's start/end callbacks may adjust m.m_begin, m.m_len and m.add_prefix */
	if (matcher->start(cb, pos, &m) &&
		matcher->end(cb, pos, &m)) {
		if (m.add_prefix || matcher->prefix[0] != '\0') {
			/* Build "<prefix><matched text>" in pool memory */
			cb->len = m.m_len + strlen(matcher->prefix);
			cb->url_str = rspamd_mempool_alloc(cb->pool, cb->len + 1);
			cb->len = rspamd_snprintf(cb->url_str,
									  cb->len + 1,
									  "%s%*s",
									  m.prefix,
									  (gint) m.m_len,
									  m.m_begin);
			cb->prefix_added = TRUE;
		}
		else {
			cb->url_str = rspamd_mempool_alloc(cb->pool, m.m_len + 1);
			cb->len = rspamd_strlcpy(cb->url_str, m.m_begin, m.m_len + 1);
		}

		cb->start = m.m_begin;

		if (pos > cb->fin) {
			cb->fin = pos;
		}

		url = rspamd_mempool_alloc0(pool, sizeof(struct rspamd_url));
		/* Strip surrounding whitespace in place before parsing */
		g_strstrip(cb->url_str);
		rc = rspamd_url_parse(url, cb->url_str,
							  strlen(cb->url_str), pool,
							  RSPAMD_URL_PARSE_TEXT);

		if (rc == URI_ERRNO_OK && url->hostlen > 0) {
			if (cb->prefix_added) {
				url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
				/* Reset the flag so that it does not leak into the next url */
				cb->prefix_added = FALSE;
			}

			if (cb->func) {
				/* Offsets passed to the callback are relative to `text` */
				if (!cb->func(url, cb->start - text, (m.m_begin + m.m_len) - text,
							  cb->funcd)) {
					/* We need to stop here in any case! */
					return -1;
				}
			}
		}
		else if (rc != URI_ERRNO_OK) {
			msg_debug_pool_check("extract of url '%s' failed: %s",
								 cb->url_str,
								 rspamd_url_strerror(rc));
		}
	}
	else {
		cb->url_str = NULL;
		/* Continue search if no pattern has been found */
		return 0;
	}

	/* Continue search if required (return 0 means continue) */
	return !multiple;
}
url, false)) { + if (cbd->part && cbd->part->mime_part->urls) { + g_ptr_array_add(cbd->part->mime_part->urls, url); + } + + url->part_order = cbd->cur_part_order++; + + if (cbd->cur_url_order) { + url->order = (*cbd->cur_url_order)++; + } + } + + return TRUE; +} + +static gboolean +rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_url_mimepart_cbdata *cbd = + (struct rspamd_url_mimepart_cbdata *) ud; + struct rspamd_process_exception *ex; + struct rspamd_task *task; + + task = cbd->task; + ex = rspamd_mempool_alloc0(task->task_pool, sizeof(struct rspamd_process_exception)); + + ex->pos = start_offset; + ex->len = end_offset - start_offset; + ex->type = RSPAMD_EXCEPTION_URL; + ex->ptr = url; + + cbd->url_len += ex->len; + + if (cbd->part->utf_stripped_content && + cbd->url_len > cbd->part->utf_stripped_content->len * 10) { + /* Absurd case, stop here now */ + msg_err_task("part has too many URLs, we cannot process more: %z url len; " + "%d stripped content length", + cbd->url_len, cbd->part->utf_stripped_content->len); + + return FALSE; + } + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + /* Also check max urls */ + if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) { + if (kh_size(MESSAGE_FIELD(task, urls)) > cbd->task->cfg->max_urls) { + msg_err_task("part has too many URLs, we cannot process more: " + "%d urls extracted ", + (guint) kh_size(MESSAGE_FIELD(task, urls))); + + return FALSE; + } + } + + url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; + + if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) && + cbd->part->mime_part->urls) { + url->part_order = cbd->cur_part_order++; + + if (cbd->cur_url_order) { + url->order = (*cbd->cur_url_order)++; + } + g_ptr_array_add(cbd->part->mime_part->urls, url); + } + + cbd->part->exceptions = g_list_prepend( + cbd->part->exceptions, + ex); + + /* We also search the query for additional 
url inside */ + if (url->querylen > 0) { + rspamd_url_find_multiple(task->task_pool, + rspamd_url_query_unsafe(url), url->querylen, + RSPAMD_URL_FIND_ALL, NULL, + rspamd_url_query_callback, cbd); + } + + return TRUE; +} + +void rspamd_url_text_extract(rspamd_mempool_t *pool, + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + uint16_t *cur_url_order, + enum rspamd_url_find_type how) +{ + struct rspamd_url_mimepart_cbdata mcbd; + + if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) { + msg_warn_task("got empty text part"); + return; + } + + mcbd.task = task; + mcbd.part = part; + mcbd.url_len = 0; + mcbd.cur_url_order = cur_url_order; + mcbd.cur_part_order = 0; + + rspamd_url_find_multiple(task->task_pool, part->utf_stripped_content->data, + part->utf_stripped_content->len, how, part->newlines, + rspamd_url_text_part_callback, &mcbd); +} + +void rspamd_url_find_multiple(rspamd_mempool_t *pool, + const gchar *in, + gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud) +{ + struct url_callback_data cb; + + g_assert(in != NULL); + + if (inlen == 0) { + inlen = strlen(in); + } + + memset(&cb, 0, sizeof(cb)); + cb.begin = in; + cb.end = in + inlen; + cb.how = how; + cb.pool = pool; + + cb.funcd = ud; + cb.func = func; + cb.newlines = nlines; + + if (how == RSPAMD_URL_FIND_ALL) { + if (url_scanner->search_trie_full) { + cb.matchers = url_scanner->matchers_full; + rspamd_multipattern_lookup(url_scanner->search_trie_full, + in, inlen, + rspamd_url_trie_generic_callback_multiple, &cb, NULL); + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_multiple, &cb, NULL); + } + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_multiple, &cb, NULL); + } 
+} + +void rspamd_url_find_single(rspamd_mempool_t *pool, + const gchar *in, + gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud) +{ + struct url_callback_data cb; + + g_assert(in != NULL); + + if (inlen == 0) { + inlen = strlen(in); + } + + /* + * We might have a situation when we need to parse URLs on config file + * parsing, but there is no valid url_scanner loaded. Hence, we just load + * some defaults and it should be fine... + */ + if (url_scanner == NULL) { + rspamd_url_init(NULL); + } + + memset(&cb, 0, sizeof(cb)); + cb.begin = in; + cb.end = in + inlen; + cb.how = how; + cb.pool = pool; + + cb.funcd = ud; + cb.func = func; + + if (how == RSPAMD_URL_FIND_ALL) { + if (url_scanner->search_trie_full) { + cb.matchers = url_scanner->matchers_full; + rspamd_multipattern_lookup(url_scanner->search_trie_full, + in, inlen, + rspamd_url_trie_generic_callback_single, &cb, NULL); + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_single, &cb, NULL); + } + } + else { + cb.matchers = url_scanner->matchers_strict; + rspamd_multipattern_lookup(url_scanner->search_trie_strict, + in, inlen, + rspamd_url_trie_generic_callback_single, &cb, NULL); + } +} + + +gboolean +rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset, + gsize end_offset, gpointer ud) +{ + struct rspamd_task *task = ud; + gchar *url_str = NULL; + struct rspamd_url *query_url; + gint rc; + gboolean prefix_added; + + /* It is just a displayed URL, we should not check it for certain things */ + url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED | RSPAMD_URL_FLAG_SUBJECT; + + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen == 0) { + return FALSE; + } + } + + rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false); + + /* We also search the query for additional url inside */ + if (url->querylen > 0) { + if 
(rspamd_url_find(task->task_pool, rspamd_url_query_unsafe(url), url->querylen, + &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { + + query_url = rspamd_mempool_alloc0(task->task_pool, + sizeof(struct rspamd_url)); + rc = rspamd_url_parse(query_url, + url_str, + strlen(url_str), + task->task_pool, + RSPAMD_URL_PARSE_TEXT); + + if (rc == URI_ERRNO_OK && + url->hostlen > 0) { + msg_debug_task("found url %s in query of url" + " %*s", + url_str, url->querylen, rspamd_url_query_unsafe(url)); + + if (prefix_added) { + query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; + } + + if (query_url->protocol == PROTOCOL_MAILTO) { + if (query_url->userlen == 0) { + return TRUE; + } + } + + rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), + query_url, false); + } + } + } + + return TRUE; +} + +static inline khint_t +rspamd_url_hash(struct rspamd_url *url) +{ + if (url->urllen > 0) { + return (khint_t) rspamd_cryptobox_fast_hash(url->string, url->urllen, + rspamd_hash_seed()); + } + + return 0; +} + +static inline khint_t +rspamd_url_host_hash(struct rspamd_url *url) +{ + if (url->hostlen > 0) { + return (khint_t) rspamd_cryptobox_fast_hash(rspamd_url_host_unsafe(url), + url->hostlen, + rspamd_hash_seed()); + } + + return 0; +} + +/* Compare two emails for building emails tree */ +static inline bool +rspamd_emails_cmp(struct rspamd_url *u1, struct rspamd_url *u2) +{ + gint r; + + if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { + return FALSE; + } + else { + if ((r = rspamd_lc_cmp(rspamd_url_host_unsafe(u1), + rspamd_url_host_unsafe(u2), u1->hostlen)) == 0) { + if (u1->userlen != u2->userlen || u1->userlen == 0) { + return FALSE; + } + else { + return (rspamd_lc_cmp(rspamd_url_user_unsafe(u1), + rspamd_url_user_unsafe(u2), + u1->userlen) == 0); + } + } + else { + return r == 0; + } + } + + return FALSE; +} + +static inline bool +rspamd_urls_cmp(struct rspamd_url *u1, struct rspamd_url *u2) +{ + int r = 0; + + if (u1->protocol != u2->protocol || u1->urllen != 
u2->urllen) { + return false; + } + else { + if (u1->protocol & PROTOCOL_MAILTO) { + return rspamd_emails_cmp(u1, u2); + } + + r = memcmp(u1->string, u2->string, u1->urllen); + } + + return r == 0; +} + +static inline bool +rspamd_urls_host_cmp(struct rspamd_url *u1, struct rspamd_url *u2) +{ + int r = 0; + + if (u1->hostlen != u2->hostlen) { + return false; + } + else { + r = memcmp(rspamd_url_host_unsafe(u1), rspamd_url_host_unsafe(u2), + u1->hostlen); + } + + return r == 0; +} + +gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size) +{ + gchar *d, ch, c, decoded; + const gchar *s; + enum { + sw_usual = 0, + sw_quoted, + sw_quoted_second + } state; + + d = dst; + s = src; + + state = 0; + decoded = 0; + + while (size--) { + + ch = *s++; + + switch (state) { + case sw_usual: + + if (ch == '%') { + state = sw_quoted; + break; + } + else if (ch == '+') { + *d++ = ' '; + } + else { + *d++ = ch; + } + break; + + case sw_quoted: + + if (ch >= '0' && ch <= '9') { + decoded = (ch - '0'); + state = sw_quoted_second; + break; + } + + c = (ch | 0x20); + if (c >= 'a' && c <= 'f') { + decoded = (c - 'a' + 10); + state = sw_quoted_second; + break; + } + + /* the invalid quoted character */ + + state = sw_usual; + + *d++ = ch; + + break; + + case sw_quoted_second: + + state = sw_usual; + + if (ch >= '0' && ch <= '9') { + ch = ((decoded << 4) + ch - '0'); + *d++ = ch; + + break; + } + + c = (u_char) (ch | 0x20); + if (c >= 'a' && c <= 'f') { + ch = ((decoded << 4) + c - 'a' + 10); + + *d++ = ch; + break; + } + + /* the invalid quoted character */ + break; + } + } + + return (d - dst); +} + +enum rspamd_url_char_class { + RSPAMD_URL_UNRESERVED = (1 << 0), + RSPAMD_URL_SUBDELIM = (1 << 1), + RSPAMD_URL_PATHSAFE = (1 << 2), + RSPAMD_URL_QUERYSAFE = (1 << 3), + RSPAMD_URL_FRAGMENTSAFE = (1 << 4), + RSPAMD_URL_HOSTSAFE = (1 << 5), + RSPAMD_URL_USERSAFE = (1 << 6), +}; + +#define RSPAMD_URL_FLAGS_HOSTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_SUBDELIM) 
+#define RSPAMD_URL_FLAGS_USERSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_USERSAFE | RSPAMD_URL_SUBDELIM) +#define RSPAMD_URL_FLAGS_PATHSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_PATHSAFE | RSPAMD_URL_SUBDELIM) +#define RSPAMD_URL_FLAGS_QUERYSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_SUBDELIM) +#define RSPAMD_URL_FLAGS_FRAGMENTSAFE (RSPAMD_URL_UNRESERVED | RSPAMD_URL_FRAGMENTSAFE | RSPAMD_URL_SUBDELIM) + +static const unsigned char rspamd_url_encoding_classes[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0 /* */, RSPAMD_URL_SUBDELIM /* ! */, 0 /* " */, 0 /* # */, + RSPAMD_URL_SUBDELIM /* $ */, 0 /* % */, RSPAMD_URL_SUBDELIM /* & */, + RSPAMD_URL_SUBDELIM /* ' */, RSPAMD_URL_SUBDELIM /* ( */, + RSPAMD_URL_SUBDELIM /* ) */, RSPAMD_URL_SUBDELIM /* * */, + RSPAMD_URL_SUBDELIM /* + */, RSPAMD_URL_SUBDELIM /* , */, + RSPAMD_URL_UNRESERVED /* - */, RSPAMD_URL_UNRESERVED /* . */, + RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* / */, + RSPAMD_URL_UNRESERVED /* 0 */, RSPAMD_URL_UNRESERVED /* 1 */, + RSPAMD_URL_UNRESERVED /* 2 */, RSPAMD_URL_UNRESERVED /* 3 */, + RSPAMD_URL_UNRESERVED /* 4 */, RSPAMD_URL_UNRESERVED /* 5 */, + RSPAMD_URL_UNRESERVED /* 6 */, RSPAMD_URL_UNRESERVED /* 7 */, + RSPAMD_URL_UNRESERVED /* 8 */, RSPAMD_URL_UNRESERVED /* 9 */, + RSPAMD_URL_USERSAFE | RSPAMD_URL_HOSTSAFE | RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* : */, + RSPAMD_URL_SUBDELIM /* ; */, 0 /* < */, RSPAMD_URL_SUBDELIM /* = */, 0 /* > */, + RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* ? 
*/, + RSPAMD_URL_PATHSAFE | RSPAMD_URL_QUERYSAFE | RSPAMD_URL_FRAGMENTSAFE /* @ */, + RSPAMD_URL_UNRESERVED /* A */, RSPAMD_URL_UNRESERVED /* B */, + RSPAMD_URL_UNRESERVED /* C */, RSPAMD_URL_UNRESERVED /* D */, + RSPAMD_URL_UNRESERVED /* E */, RSPAMD_URL_UNRESERVED /* F */, + RSPAMD_URL_UNRESERVED /* G */, RSPAMD_URL_UNRESERVED /* H */, + RSPAMD_URL_UNRESERVED /* I */, RSPAMD_URL_UNRESERVED /* J */, + RSPAMD_URL_UNRESERVED /* K */, RSPAMD_URL_UNRESERVED /* L */, + RSPAMD_URL_UNRESERVED /* M */, RSPAMD_URL_UNRESERVED /* N */, + RSPAMD_URL_UNRESERVED /* O */, RSPAMD_URL_UNRESERVED /* P */, + RSPAMD_URL_UNRESERVED /* Q */, RSPAMD_URL_UNRESERVED /* R */, + RSPAMD_URL_UNRESERVED /* S */, RSPAMD_URL_UNRESERVED /* T */, + RSPAMD_URL_UNRESERVED /* U */, RSPAMD_URL_UNRESERVED /* V */, + RSPAMD_URL_UNRESERVED /* W */, RSPAMD_URL_UNRESERVED /* X */, + RSPAMD_URL_UNRESERVED /* Y */, RSPAMD_URL_UNRESERVED /* Z */, + RSPAMD_URL_HOSTSAFE /* [ */, 0 /* \ */, RSPAMD_URL_HOSTSAFE /* ] */, 0 /* ^ */, + RSPAMD_URL_UNRESERVED /* _ */, 0 /* ` */, RSPAMD_URL_UNRESERVED /* a */, + RSPAMD_URL_UNRESERVED /* b */, RSPAMD_URL_UNRESERVED /* c */, + RSPAMD_URL_UNRESERVED /* d */, RSPAMD_URL_UNRESERVED /* e */, + RSPAMD_URL_UNRESERVED /* f */, RSPAMD_URL_UNRESERVED /* g */, + RSPAMD_URL_UNRESERVED /* h */, RSPAMD_URL_UNRESERVED /* i */, + RSPAMD_URL_UNRESERVED /* j */, RSPAMD_URL_UNRESERVED /* k */, + RSPAMD_URL_UNRESERVED /* l */, RSPAMD_URL_UNRESERVED /* m */, + RSPAMD_URL_UNRESERVED /* n */, RSPAMD_URL_UNRESERVED /* o */, + RSPAMD_URL_UNRESERVED /* p */, RSPAMD_URL_UNRESERVED /* q */, + RSPAMD_URL_UNRESERVED /* r */, RSPAMD_URL_UNRESERVED /* s */, + RSPAMD_URL_UNRESERVED /* t */, RSPAMD_URL_UNRESERVED /* u */, + RSPAMD_URL_UNRESERVED /* v */, RSPAMD_URL_UNRESERVED /* w */, + RSPAMD_URL_UNRESERVED /* x */, RSPAMD_URL_UNRESERVED /* y */, + RSPAMD_URL_UNRESERVED /* z */, 0 /* { */, 0 /* | */, 0 /* } */, + RSPAMD_URL_UNRESERVED /* ~ */, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
/*
 * Re-encode an url so that every byte that is not allowed in its component
 * (host, user, path, query, fragment — as classified by
 * rspamd_url_encoding_classes) is written as %XX.
 *
 * @param url parsed url
 * @param pdlen output: length of the returned string (must not be NULL)
 * @param pool pool used for the new string when encoding is needed
 * @return url->string itself when no byte needs encoding, otherwise a new
 *         pool-allocated buffer of *pdlen bytes
 *
 * NOTE(review): no terminator is written after the last component here, so
 * callers should rely on *pdlen — confirm against callers.
 * NOTE(review): only protocol, user, host, data, query and fragment are
 * emitted; a port is not written by this function.
 */
const gchar *
rspamd_url_encode(struct rspamd_url *url, gsize *pdlen,
				  rspamd_mempool_t *pool)
{
	guchar *dest, *d, *dend;
	static const gchar hexdigests[16] = "0123456789ABCDEF";
	/* `i` and `dlen` are used implicitly by CHECK_URL_COMPONENT / ENCODE_URL_COMPONENT */
	guint i;
	gsize dlen = 0;

	g_assert(pdlen != NULL && url != NULL && pool != NULL);

	/* First pass: count 2 extra bytes for each byte that has to become %XX */
	CHECK_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen,
						RSPAMD_URL_FLAGS_HOSTSAFE);
	CHECK_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen,
						RSPAMD_URL_FLAGS_USERSAFE);
	CHECK_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen,
						RSPAMD_URL_FLAGS_PATHSAFE);
	CHECK_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen,
						RSPAMD_URL_FLAGS_QUERYSAFE);
	CHECK_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen,
						RSPAMD_URL_FLAGS_FRAGMENTSAFE);

	if (dlen == 0) {
		/* Nothing to encode: hand back the original string, no allocation */
		*pdlen = url->urllen;

		return url->string;
	}

	/* Need to encode */
	dlen += url->urllen + sizeof("telephone://"); /* Protocol hack */
	dest = rspamd_mempool_alloc(pool, dlen + 1);
	d = dest;
	dend = d + dlen;

	if (url->protocollen > 0) {
		if (!(url->protocol & PROTOCOL_UNKNOWN)) {
			/* Known protocols are written by their canonical name */
			const gchar *known_proto = rspamd_url_protocol_name(url->protocol);
			d += rspamd_snprintf((gchar *) d, dend - d,
								 "%s://",
								 known_proto);
		}
		else {
			/* Unknown protocol: copy the first protocollen bytes of the url */
			d += rspamd_snprintf((gchar *) d, dend - d,
								 "%*s://",
								 (gint) url->protocollen, url->string);
		}
	}
	else {
		d += rspamd_snprintf((gchar *) d, dend - d, "http://");
	}

	/* Second pass: emit each component, percent-encoding unsafe bytes */
	if (url->userlen > 0) {
		ENCODE_URL_COMPONENT(rspamd_url_user_unsafe(url), url->userlen,
							 RSPAMD_URL_FLAGS_USERSAFE);
		*d++ = '@';
	}

	ENCODE_URL_COMPONENT(rspamd_url_host_unsafe(url), url->hostlen,
						 RSPAMD_URL_FLAGS_HOSTSAFE);

	if (url->datalen > 0) {
		*d++ = '/';
		ENCODE_URL_COMPONENT(rspamd_url_data_unsafe(url), url->datalen,
							 RSPAMD_URL_FLAGS_PATHSAFE);
	}

	if (url->querylen > 0) {
		*d++ = '?';
		ENCODE_URL_COMPONENT(rspamd_url_query_unsafe(url), url->querylen,
							 RSPAMD_URL_FLAGS_QUERYSAFE);
	}

	if (url->fragmentlen > 0) {
		*d++ = '#';
		ENCODE_URL_COMPONENT(rspamd_url_fragment_unsafe(url), url->fragmentlen,
							 RSPAMD_URL_FLAGS_FRAGMENTSAFE);
	}

	*pdlen = (d - dest);

	return (const gchar *) dest;
}
0) { + ret = PROTOCOL_FTP; + } + else if (strcmp(str, "file") == 0) { + ret = PROTOCOL_FILE; + } + else if (strcmp(str, "telephone") == 0) { + ret = PROTOCOL_TELEPHONE; + } + + return ret; +} + + +bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u, + bool enforce_replace) +{ + khiter_t k; + gint r; + + k = kh_get(rspamd_url_hash, set, u); + + if (k != kh_end(set)) { + /* Existing url */ + struct rspamd_url *ex = kh_key(set, k); +#define SUSPICIOUS_URL_FLAGS (RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_OBSCURED | RSPAMD_URL_FLAG_ZW_SPACES) + if (enforce_replace) { + kh_key(set, k) = u; + u->count++; + } + else { + if (u->flags & SUSPICIOUS_URL_FLAGS) { + if (!(ex->flags & SUSPICIOUS_URL_FLAGS)) { + /* Propagate new url to an old one */ + kh_key(set, k) = u; + u->count++; + } + else { + ex->count++; + } + } + else { + ex->count++; + } + } + + return false; + } + else { + k = kh_put(rspamd_url_hash, set, u, &r); + } + + return true; +} + +struct rspamd_url * +rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u) +{ + khiter_t k; + gint r; + + if (set) { + k = kh_get(rspamd_url_hash, set, u); + + if (k != kh_end(set)) { + return kh_key(set, k); + } + else { + k = kh_put(rspamd_url_hash, set, u, &r); + + return kh_key(set, k); + } + } + + return NULL; +} + +bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set, + struct rspamd_url *u) +{ + gint r; + + if (set) { + kh_put(rspamd_url_host_hash, set, u, &r); + + if (r == 0) { + return false; + } + + return true; + } + + return false; +} + +bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u) +{ + khiter_t k; + + if (set) { + k = kh_get(rspamd_url_hash, set, u); + + if (k == kh_end(set)) { + return false; + } + + return true; + } + + return false; +} + +bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u) +{ + khiter_t k; + + if (set) { + k = kh_get(rspamd_url_host_hash, set, u); + + if 
(k == kh_end(set)) { + return false; + } + + return true; + } + + return false; +} + +bool rspamd_url_flag_from_string(const gchar *str, gint *flag) +{ + gint h = rspamd_cryptobox_fast_hash_specific(RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, + str, strlen(str), 0); + + for (int i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { + if (url_flag_names[i].hash == h) { + *flag |= url_flag_names[i].flag; + + return true; + } + } + + return false; +} + + +const gchar * +rspamd_url_flag_to_string(int flag) +{ + for (int i = 0; i < G_N_ELEMENTS(url_flag_names); i++) { + if (url_flag_names[i].flag & flag) { + return url_flag_names[i].name; + } + } + + return NULL; +} + +inline int +rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2) +{ + int min_len = MIN(u1->urllen, u2->urllen); + int r; + + if (u1->protocol != u2->protocol) { + return u1->protocol - u2->protocol; + } + + if (u1->protocol & PROTOCOL_MAILTO) { + /* Emails specialisation (hosts must be compared in a case insensitive matter */ + min_len = MIN(u1->hostlen, u2->hostlen); + + if ((r = rspamd_lc_cmp(rspamd_url_host_unsafe(u1), + rspamd_url_host_unsafe(u2), min_len)) == 0) { + if (u1->hostlen == u2->hostlen) { + if (u1->userlen != u2->userlen || u1->userlen == 0) { + r = (int) u1->userlen - (int) u2->userlen; + } + else { + r = memcmp(rspamd_url_user_unsafe(u1), + rspamd_url_user_unsafe(u2), + u1->userlen); + } + } + else { + r = u1->hostlen - u2->hostlen; + } + } + } + else { + if (u1->urllen != u2->urllen) { + /* Different length, compare common part and then compare length */ + r = memcmp(u1->string, u2->string, min_len); + + if (r == 0) { + r = u1->urllen - u2->urllen; + } + } + else { + /* Equal length */ + r = memcmp(u1->string, u2->string, u1->urllen); + } + } + + return r; +} + +int rspamd_url_cmp_qsort(const void *_u1, const void *_u2) +{ + const struct rspamd_url *u1 = *(struct rspamd_url **) _u1, + *u2 = *(struct rspamd_url **) _u2; + + return rspamd_url_cmp(u1, u2); +} diff --git 
/**
 * URL structure
 *
 * All `*shift` members are byte offsets into `string`, and the matching
 * `*len` members are the lengths of the corresponding components (see the
 * rspamd_url_*_unsafe() accessor macros below).
 */
struct rspamd_url {
	char *string; /* url text; all component offsets are relative to it */
	char *raw;    /* NOTE(review): presumably the unprocessed input — confirm */
	struct rspamd_url_ext *ext; /* rarely used fields, see struct rspamd_url_ext */

	uint32_t flags; /* bitmask of enum rspamd_url_flags */

	uint8_t protocol;    /* one of enum rspamd_url_protocol */
	uint8_t protocollen; /* length of the protocol at the start of `string` */

	uint16_t hostshift;
	uint16_t datashift;
	uint16_t queryshift;
	uint16_t fragmentshift;
	uint16_t tldshift;
	guint16 usershift;
	guint16 userlen;

	uint16_t hostlen;
	uint16_t datalen;
	uint16_t querylen;
	uint16_t fragmentlen;
	uint16_t tldlen;
	uint16_t count;  /* incremented when the same url is added to a set again */
	uint16_t urllen; /* length of `string` */
	uint16_t rawlen; /* NOTE(review): presumably the length of `raw` — confirm */

	/* Absolute order of the URL in a message */
	uint16_t order;
	/* Order of the URL in a specific part of message */
	uint16_t part_order;
};
rspamd_url_init(const gchar *tld_file); + +void rspamd_url_deinit(void); + +/* + * Parse urls inside text + * @param pool memory pool + * @param task task object + * @param part current text part + * @param is_html turn on html heuristic + */ +void rspamd_url_text_extract(rspamd_mempool_t *pool, + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + uint16_t *cur_order, + enum rspamd_url_find_type how); + +/* + * Parse a single url into an uri structure + * @param pool memory pool + * @param uristring text form of url + * @param uri url object, must be pre allocated + */ +enum uri_errno rspamd_url_parse(struct rspamd_url *uri, + gchar *uristring, + gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags flags); + +/* + * Try to extract url from a text + * @param pool memory pool + * @param begin begin of text + * @param len length of text + * @param start storage for start position of url found (or NULL) + * @param end storage for end position of url found (or NULL) + * @param url_str storage for url string(or NULL) + * @return TRUE if url is found in specified text + */ +gboolean rspamd_url_find(rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added); + +/* + * Return text representation of url parsing error + */ +const gchar *rspamd_url_strerror(int err); + + +/** + * Find TLD for a specified host string + * @param in input host + * @param inlen length of input + * @param out output rspamd_ftok_t with tld position + * @return TRUE if tld has been found + */ +gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out); + +typedef gboolean (*url_insert_function)(struct rspamd_url *url, + gsize start_offset, gsize end_offset, void *ud); + +/** + * Search for multiple urls in text and call `func` for each url found + * @param pool + * @param in + * @param inlen + * @param is_html + * @param func + * @param ud + */ +void 
rspamd_url_find_multiple(rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud); + +/** + * Search for a single url in text and call `func` for each url found + * @param pool + * @param in + * @param inlen + * @param is_html + * @param func + * @param ud + */ +void rspamd_url_find_single(rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud); + +/** + * Generic callback to insert URLs into rspamd_task + * @param url + * @param start_offset + * @param end_offset + * @param ud + */ +gboolean rspamd_url_task_subject_callback(struct rspamd_url *url, + gsize start_offset, + gsize end_offset, gpointer ud); + +/** + * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated + * @param dst + * @param src + * @param size + * @return + */ +gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size); + +/** + * Encode url if needed. In this case, memory is allocated from the specific pool. 
+ * Returns pointer to begin and encoded length in `dlen` + * @param url + * @param pool + * @return + */ +const gchar *rspamd_url_encode(struct rspamd_url *url, gsize *dlen, + rspamd_mempool_t *pool); + + +/** + * Returns if a character is domain character + * @param c + * @return + */ +gboolean rspamd_url_is_domain(int c); + +/** + * Returns symbolic name for protocol + * @param proto + * @return + */ +const gchar *rspamd_url_protocol_name(enum rspamd_url_protocol proto); + + +/** + * Converts string to a numeric protocol + * @param str + * @return + */ +enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str); + +/** + * Converts string to a url flag + * @param str + * @param flag + * @return + */ +bool rspamd_url_flag_from_string(const gchar *str, gint *flag); + +/** + * Converts url flag to a string + * @param flag + * @return + */ +const gchar *rspamd_url_flag_to_string(int flag); + +/* Defines sets of urls indexed by url as is */ +KHASH_DECLARE(rspamd_url_hash, struct rspamd_url *, char); +KHASH_DECLARE(rspamd_url_host_hash, struct rspamd_url *, char); + +/* Convenience functions for url sets */ +/** + * Add an url to set or increase the existing url count + * @param set + * @param u + * @return true if a new url has been added + */ +bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u, + bool enforce_replace); + +/** + * Same as rspamd_url_set_add_or_increase but returns the existing url if found + * @param set + * @param u + * @return + */ +struct rspamd_url *rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u); +/** + * Helper for url host set + * @param set + * @param u + * @return + */ +bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set, + struct rspamd_url *u); +/** + * Checks if a url is in set + * @param set + * @param u + * @return + */ +bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u); + +bool 
rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u); + +/** + * Compares two urls (similar to C comparison functions) lexicographically + * @param u1 + * @param u2 + * @return + */ +int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2); + +/** + * Same but used for qsort to sort `struct rspamd_url *[]` array + * @param u1 + * @param u2 + * @return + */ +int rspamd_url_cmp_qsort(const void *u1, const void *u2); + +/** + * Returns a port for some url + * @param u + * @return + */ +static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port(struct rspamd_url *u) +{ + if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) { + return u->ext->port; + } + else { + /* Assume standard port */ + if (u->protocol == PROTOCOL_HTTPS) { + return 443; + } + else { + return 80; + } + } +} + +/** + * Returns a port for some url if it is set + * @param u + * @return + */ +static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port_if_special(struct rspamd_url *u) +{ + if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) { + return u->ext->port; + } + + return 0; +} + +/** + * Normalize unicode input and set out url flags as appropriate + * @param pool + * @param input + * @param len_out (must be &var) + * @param url_flags_out (must be just a var with no dereference) + */ +#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \ + do { \ + enum rspamd_utf8_normalise_result norm_res; \ + norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \ + if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \ + url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \ + } \ + if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { \ + url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES; \ + } \ + if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) { \ + url_flags_out |= RSPAMD_URL_FLAG_OBSCURED; \ + } \ + } while (0) +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/libserver/worker_util.c b/src/libserver/worker_util.c new file 
mode 100644 index 0000000..74a3cf8 --- /dev/null +++ b/src/libserver/worker_util.c @@ -0,0 +1,2313 @@ +/* + * Copyright 2023 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "config.h" +#include "rspamd.h" +#include "lua/lua_common.h" +#include "worker_util.h" +#include "unix-std.h" +#include "utlist.h" +#include "ottery.h" +#include "rspamd_control.h" +#include "libserver/maps/map.h" +#include "libserver/maps/map_private.h" +#include "libserver/http/http_private.h" +#include "libserver/http/http_router.h" +#include "libutil/rrd.h" + +/* sys/resource.h */ +#ifdef HAVE_SYS_RESOURCE_H +#include <sys/resource.h> +#endif +/* pwd and grp */ +#ifdef HAVE_PWD_H +#include <pwd.h> +#endif +#ifdef HAVE_GRP_H +#include <grp.h> +#endif +#ifdef HAVE_LIBUTIL_H +#include <libutil.h> +#endif +#include "zlib.h" + +#ifdef HAVE_UCONTEXT_H +#include <ucontext.h> +#elif defined(HAVE_SYS_UCONTEXT_H) +#include <sys/ucontext.h> +#endif + +#ifdef HAVE_SYS_WAIT_H +#include <sys/wait.h> +#include <math.h> + +#endif + +#include "contrib/libev/ev.h" +#include "libstat/stat_api.h" + +struct rspamd_worker *rspamd_current_worker = NULL; + +/* Forward declaration */ +static void rspamd_worker_heartbeat_start(struct rspamd_worker *, + struct ev_loop *); + +static void rspamd_worker_ignore_signal(struct rspamd_worker_signal_handler *); +/** + * Return worker's control structure by its type + * @param type + * @return worker's control structure or NULL + */ 
+worker_t * +rspamd_get_worker_by_type(struct rspamd_config *cfg, GQuark type) +{ + worker_t **pwrk; + + pwrk = cfg->compiled_workers; + while (pwrk && *pwrk) { + if (rspamd_check_worker(cfg, *pwrk)) { + if (g_quark_from_string((*pwrk)->name) == type) { + return *pwrk; + } + } + + pwrk++; + } + + return NULL; +} + +static void +rspamd_worker_check_finished(EV_P_ ev_timer *w, int revents) +{ + int *pnchecks = (int *) w->data; + + if (*pnchecks > SOFT_SHUTDOWN_TIME * 10) { + msg_warn("terminating worker before finishing of terminate handlers"); + ev_break(EV_A_ EVBREAK_ONE); + } + else { + int refcount = ev_active_cnt(EV_A); + + if (refcount == 1) { + ev_break(EV_A_ EVBREAK_ONE); + } + else { + ev_timer_again(EV_A_ w); + } + } +} + +static gboolean +rspamd_worker_finalize(gpointer user_data) +{ + struct rspamd_task *task = user_data; + + if (!(task->flags & RSPAMD_TASK_FLAG_PROCESSING)) { + msg_info_task("finishing actions has been processed, terminating"); + /* ev_break (task->event_loop, EVBREAK_ALL); */ + task->worker->state = rspamd_worker_wanna_die; + rspamd_session_destroy(task->s); + + return TRUE; + } + + return FALSE; +} + +gboolean +rspamd_worker_call_finish_handlers(struct rspamd_worker *worker) +{ + struct rspamd_task *task; + struct rspamd_config *cfg = worker->srv->cfg; + struct rspamd_abstract_worker_ctx *ctx; + struct rspamd_config_cfg_lua_script *sc; + + if (cfg->on_term_scripts) { + ctx = (struct rspamd_abstract_worker_ctx *) worker->ctx; + /* Create a fake task object for async events */ + task = rspamd_task_new(worker, cfg, NULL, NULL, ctx->event_loop, FALSE); + task->resolver = ctx->resolver; + task->flags |= RSPAMD_TASK_FLAG_PROCESSING; + task->s = rspamd_session_create(task->task_pool, + rspamd_worker_finalize, + NULL, + (event_finalizer_t) rspamd_task_free, + task); + + DL_FOREACH(cfg->on_term_scripts, sc) + { + lua_call_finish_script(sc, task); + } + + task->flags &= ~RSPAMD_TASK_FLAG_PROCESSING; + + if (rspamd_session_pending(task->s)) { + 
return TRUE; + } + } + + return FALSE; +} + +static void +rspamd_worker_terminate_handlers(struct rspamd_worker *w) +{ + if (w->nconns == 0 && + (!(w->flags & RSPAMD_WORKER_SCANNER) || w->srv->cfg->on_term_scripts == NULL)) { + /* + * We are here either: + * - No active connections are represented + * - No term scripts are registered + * - Worker is not a scanner, so it can die safely + */ + w->state = rspamd_worker_wanna_die; + } + else { + if (w->nconns > 0) { + /* + * Wait until all connections are terminated + */ + w->state = rspamd_worker_wait_connections; + } + else { + /* + * Start finish scripts + */ + if (w->state != rspamd_worker_wait_final_scripts) { + w->state = rspamd_worker_wait_final_scripts; + + if ((w->flags & RSPAMD_WORKER_SCANNER) && + rspamd_worker_call_finish_handlers(w)) { + msg_info("performing async finishing actions"); + w->state = rspamd_worker_wait_final_scripts; + } + else { + /* + * We are done now + */ + msg_info("no async finishing actions, terminating"); + w->state = rspamd_worker_wanna_die; + } + } + } + } +} + +static void +rspamd_worker_on_delayed_shutdown(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_worker *worker = (struct rspamd_worker *) w->data; + + worker->state = rspamd_worker_wanna_die; + ev_timer_stop(EV_A_ w); + ev_break(loop, EVBREAK_ALL); +} + +static void +rspamd_worker_shutdown_check(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_worker *worker = (struct rspamd_worker *) w->data; + + if (worker->state != rspamd_worker_wanna_die) { + rspamd_worker_terminate_handlers(worker); + + if (worker->state == rspamd_worker_wanna_die) { + /* We are done, kill event loop */ + ev_timer_stop(EV_A_ w); + ev_break(EV_A_ EVBREAK_ALL); + } + else { + /* Try again later */ + ev_timer_again(EV_A_ w); + } + } + else { + ev_timer_stop(EV_A_ w); + ev_break(EV_A_ EVBREAK_ALL); + } +} + +/* + * Config reload is designed by sending sigusr2 to active workers and pending shutdown of them + */ +static gboolean 
+rspamd_worker_usr2_handler(struct rspamd_worker_signal_handler *sigh, void *arg) +{ + /* Do not accept new connections, preparing to end worker's process */ + if (sigh->worker->state == rspamd_worker_state_running) { + static ev_timer shutdown_ev, shutdown_check_ev; + ev_tstamp shutdown_ts; + + if (sigh->worker->flags & RSPAMD_WORKER_NO_TERMINATE_DELAY) { + shutdown_ts = 0.0; + } + else { + shutdown_ts = MAX(SOFT_SHUTDOWN_TIME, + sigh->worker->srv->cfg->task_timeout * 2.0); + } + + rspamd_worker_ignore_signal(sigh); + sigh->worker->state = rspamd_worker_state_terminating; + + rspamd_default_log_function(G_LOG_LEVEL_INFO, + sigh->worker->srv->server_pool->tag.tagname, + sigh->worker->srv->server_pool->tag.uid, + G_STRFUNC, + "worker's shutdown is pending in %.2f sec", + shutdown_ts); + + /* Soft shutdown timer */ + shutdown_ev.data = sigh->worker; + ev_timer_init(&shutdown_ev, rspamd_worker_on_delayed_shutdown, + shutdown_ts, 0.0); + ev_timer_start(sigh->event_loop, &shutdown_ev); + + if (!(sigh->worker->flags & RSPAMD_WORKER_NO_TERMINATE_DELAY)) { + /* This timer checks if we are ready to die and is called frequently */ + shutdown_check_ev.data = sigh->worker; + ev_timer_init(&shutdown_check_ev, rspamd_worker_shutdown_check, + 0.5, 0.5); + ev_timer_start(sigh->event_loop, &shutdown_check_ev); + } + + rspamd_worker_stop_accept(sigh->worker); + } + + /* No more signals */ + return FALSE; +} + +/* + * Reopen log is designed by sending sigusr1 to active workers and pending shutdown of them + */ +static gboolean +rspamd_worker_usr1_handler(struct rspamd_worker_signal_handler *sigh, void *arg) +{ + struct rspamd_main *rspamd_main = sigh->worker->srv; + + rspamd_log_reopen(sigh->worker->srv->logger, rspamd_main->cfg, -1, -1); + msg_info_main("logging reinitialised"); + + /* Get more signals */ + return TRUE; +} + +static gboolean +rspamd_worker_term_handler(struct rspamd_worker_signal_handler *sigh, void *arg) +{ + if (sigh->worker->state == rspamd_worker_state_running) 
{ + static ev_timer shutdown_ev, shutdown_check_ev; + ev_tstamp shutdown_ts; + + if (sigh->worker->flags & RSPAMD_WORKER_NO_TERMINATE_DELAY) { + shutdown_ts = 0.0; + } + else { + shutdown_ts = MAX(SOFT_SHUTDOWN_TIME, + sigh->worker->srv->cfg->task_timeout * 2.0); + } + + rspamd_worker_ignore_signal(sigh); + sigh->worker->state = rspamd_worker_state_terminating; + rspamd_default_log_function(G_LOG_LEVEL_INFO, + sigh->worker->srv->server_pool->tag.tagname, + sigh->worker->srv->server_pool->tag.uid, + G_STRFUNC, + "terminating after receiving signal %s", + g_strsignal(sigh->signo)); + + rspamd_worker_stop_accept(sigh->worker); + rspamd_worker_terminate_handlers(sigh->worker); + + /* Check if we are ready to die */ + if (sigh->worker->state != rspamd_worker_wanna_die) { + /* This timer is called when we have no choices but to die */ + shutdown_ev.data = sigh->worker; + ev_timer_init(&shutdown_ev, rspamd_worker_on_delayed_shutdown, + shutdown_ts, 0.0); + ev_timer_start(sigh->event_loop, &shutdown_ev); + + if (!(sigh->worker->flags & RSPAMD_WORKER_NO_TERMINATE_DELAY)) { + /* This timer checks if we are ready to die and is called frequently */ + shutdown_check_ev.data = sigh->worker; + ev_timer_init(&shutdown_check_ev, rspamd_worker_shutdown_check, + 0.5, 0.5); + ev_timer_start(sigh->event_loop, &shutdown_check_ev); + } + } + else { + /* Flag to die has been already set */ + ev_break(sigh->event_loop, EVBREAK_ALL); + } + } + + /* Stop reacting on signals */ + return FALSE; +} + +static void +rspamd_worker_signal_handle(EV_P_ ev_signal *w, int revents) +{ + struct rspamd_worker_signal_handler *sigh = + (struct rspamd_worker_signal_handler *) w->data; + struct rspamd_worker_signal_handler_elt *cb, *cbtmp; + + /* Call all signal handlers registered */ + DL_FOREACH_SAFE(sigh->cb, cb, cbtmp) + { + if (!cb->handler(sigh, cb->handler_data)) { + DL_DELETE(sigh->cb, cb); + g_free(cb); + } + } +} + +static void +rspamd_worker_ignore_signal(struct rspamd_worker_signal_handler *sigh) 
+{ + sigset_t set; + + ev_signal_stop(sigh->event_loop, &sigh->ev_sig); + sigemptyset(&set); + sigaddset(&set, sigh->signo); + sigprocmask(SIG_BLOCK, &set, NULL); +} + +static void +rspamd_worker_default_signal(int signo) +{ + struct sigaction sig; + + sigemptyset(&sig.sa_mask); + sigaddset(&sig.sa_mask, signo); + sig.sa_handler = SIG_DFL; + sig.sa_flags = 0; + sigaction(signo, &sig, NULL); +} + +static void +rspamd_sigh_free(void *p) +{ + struct rspamd_worker_signal_handler *sigh = p; + struct rspamd_worker_signal_handler_elt *cb, *tmp; + + DL_FOREACH_SAFE(sigh->cb, cb, tmp) + { + DL_DELETE(sigh->cb, cb); + g_free(cb); + } + + ev_signal_stop(sigh->event_loop, &sigh->ev_sig); + rspamd_worker_default_signal(sigh->signo); + g_free(sigh); +} + +void rspamd_worker_set_signal_handler(int signo, struct rspamd_worker *worker, + struct ev_loop *event_loop, + rspamd_worker_signal_cb_t handler, + void *handler_data) +{ + struct rspamd_worker_signal_handler *sigh; + struct rspamd_worker_signal_handler_elt *cb; + + sigh = g_hash_table_lookup(worker->signal_events, GINT_TO_POINTER(signo)); + + if (sigh == NULL) { + sigh = g_malloc0(sizeof(*sigh)); + sigh->signo = signo; + sigh->worker = worker; + sigh->event_loop = event_loop; + sigh->enabled = TRUE; + + sigh->ev_sig.data = sigh; + ev_signal_init(&sigh->ev_sig, rspamd_worker_signal_handle, signo); + ev_signal_start(event_loop, &sigh->ev_sig); + + g_hash_table_insert(worker->signal_events, + GINT_TO_POINTER(signo), + sigh); + } + + cb = g_malloc0(sizeof(*cb)); + cb->handler = handler; + cb->handler_data = handler_data; + DL_APPEND(sigh->cb, cb); +} + +void rspamd_worker_init_signals(struct rspamd_worker *worker, + struct ev_loop *event_loop) +{ + /* A set of terminating signals */ + rspamd_worker_set_signal_handler(SIGTERM, worker, event_loop, + rspamd_worker_term_handler, NULL); + rspamd_worker_set_signal_handler(SIGINT, worker, event_loop, + rspamd_worker_term_handler, NULL); + rspamd_worker_set_signal_handler(SIGHUP, worker, 
event_loop, + rspamd_worker_term_handler, NULL); + + /* Special purpose signals */ + rspamd_worker_set_signal_handler(SIGUSR1, worker, event_loop, + rspamd_worker_usr1_handler, NULL); + rspamd_worker_set_signal_handler(SIGUSR2, worker, event_loop, + rspamd_worker_usr2_handler, NULL); +} + + +struct ev_loop * +rspamd_prepare_worker(struct rspamd_worker *worker, const char *name, + rspamd_accept_handler hdl) +{ + struct ev_loop *event_loop; + GList *cur; + struct rspamd_worker_listen_socket *ls; + struct rspamd_worker_accept_event *accept_ev; + + worker->signal_events = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, rspamd_sigh_free); + + event_loop = ev_loop_new(rspamd_config_ev_backend_get(worker->srv->cfg)); + + worker->srv->event_loop = event_loop; + + rspamd_worker_init_signals(worker, event_loop); + rspamd_control_worker_add_default_cmd_handlers(worker, event_loop); + rspamd_worker_heartbeat_start(worker, event_loop); + rspamd_redis_pool_config(worker->srv->cfg->redis_pool, + worker->srv->cfg, event_loop); + + /* Accept all sockets */ + if (hdl) { + cur = worker->cf->listen_socks; + + while (cur) { + ls = cur->data; + + if (ls->fd != -1) { + accept_ev = g_malloc0(sizeof(*accept_ev)); + accept_ev->event_loop = event_loop; + accept_ev->accept_ev.data = worker; + ev_io_init(&accept_ev->accept_ev, hdl, ls->fd, EV_READ); + ev_io_start(event_loop, &accept_ev->accept_ev); + + DL_APPEND(worker->accept_events, accept_ev); + } + + cur = g_list_next(cur); + } + } + + return event_loop; +} + +void rspamd_worker_stop_accept(struct rspamd_worker *worker) +{ + struct rspamd_worker_accept_event *cur, *tmp; + + /* Remove all events */ + DL_FOREACH_SAFE(worker->accept_events, cur, tmp) + { + + if (ev_can_stop(&cur->accept_ev)) { + ev_io_stop(cur->event_loop, &cur->accept_ev); + } + + + if (ev_can_stop(&cur->throttling_ev)) { + ev_timer_stop(cur->event_loop, &cur->throttling_ev); + } + + g_free(cur); + } + + /* XXX: we need to do it much later */ +#if 0 + 
g_hash_table_iter_init (&it, worker->signal_events); + + while (g_hash_table_iter_next (&it, &k, &v)) { + sigh = (struct rspamd_worker_signal_handler *)v; + g_hash_table_iter_steal (&it); + + if (sigh->enabled) { + event_del (&sigh->ev); + } + + g_free (sigh); + } + + g_hash_table_unref (worker->signal_events); +#endif +} + +static rspamd_fstring_t * +rspamd_controller_maybe_compress(struct rspamd_http_connection_entry *entry, + rspamd_fstring_t *buf, struct rspamd_http_message *msg) +{ + if (entry->support_gzip) { + if (rspamd_fstring_gzip(&buf)) { + rspamd_http_message_add_header(msg, "Content-Encoding", "gzip"); + } + } + + return buf; +} + +void rspamd_controller_send_error(struct rspamd_http_connection_entry *entry, + gint code, const gchar *error_msg, ...) +{ + struct rspamd_http_message *msg; + va_list args; + rspamd_fstring_t *reply; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + + va_start(args, error_msg); + msg->status = rspamd_fstring_new(); + rspamd_vprintf_fstring(&msg->status, error_msg, args); + va_end(args); + + msg->date = time(NULL); + msg->code = code; + reply = rspamd_fstring_sized_new(msg->status->len + 16); + rspamd_printf_fstring(&reply, "{\"error\":\"%V\"}", msg->status); + rspamd_http_message_set_body_from_fstring_steal(msg, + rspamd_controller_maybe_compress(entry, reply, msg)); + rspamd_http_connection_reset(entry->conn); + rspamd_http_router_insert_headers(entry->rt, msg); + rspamd_http_connection_write_message(entry->conn, + msg, + NULL, + "application/json", + entry, + entry->rt->timeout); + entry->is_reply = TRUE; +} + +void rspamd_controller_send_openmetrics(struct rspamd_http_connection_entry *entry, + rspamd_fstring_t *str) +{ + struct rspamd_http_message *msg; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + msg->date = time(NULL); + msg->code = 200; + msg->status = rspamd_fstring_new_init("OK", 2); + + rspamd_http_message_set_body_from_fstring_steal(msg, + rspamd_controller_maybe_compress(entry, str, msg)); + 
rspamd_http_connection_reset(entry->conn); + rspamd_http_router_insert_headers(entry->rt, msg); + rspamd_http_connection_write_message(entry->conn, + msg, + NULL, + "application/openmetrics-text; version=1.0.0; charset=utf-8", + entry, + entry->rt->timeout); + entry->is_reply = TRUE; +} + +void rspamd_controller_send_string(struct rspamd_http_connection_entry *entry, + const gchar *str) +{ + struct rspamd_http_message *msg; + rspamd_fstring_t *reply; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + msg->date = time(NULL); + msg->code = 200; + msg->status = rspamd_fstring_new_init("OK", 2); + + if (str) { + reply = rspamd_fstring_new_init(str, strlen(str)); + } + else { + reply = rspamd_fstring_new_init("null", 4); + } + + rspamd_http_message_set_body_from_fstring_steal(msg, + rspamd_controller_maybe_compress(entry, reply, msg)); + rspamd_http_connection_reset(entry->conn); + rspamd_http_router_insert_headers(entry->rt, msg); + rspamd_http_connection_write_message(entry->conn, + msg, + NULL, + "application/json", + entry, + entry->rt->timeout); + entry->is_reply = TRUE; +} + +void rspamd_controller_send_ucl(struct rspamd_http_connection_entry *entry, + ucl_object_t *obj) +{ + struct rspamd_http_message *msg; + rspamd_fstring_t *reply; + + msg = rspamd_http_new_message(HTTP_RESPONSE); + msg->date = time(NULL); + msg->code = 200; + msg->status = rspamd_fstring_new_init("OK", 2); + reply = rspamd_fstring_sized_new(BUFSIZ); + rspamd_ucl_emit_fstring(obj, UCL_EMIT_JSON_COMPACT, &reply); + rspamd_http_message_set_body_from_fstring_steal(msg, + rspamd_controller_maybe_compress(entry, reply, msg)); + rspamd_http_connection_reset(entry->conn); + rspamd_http_router_insert_headers(entry->rt, msg); + rspamd_http_connection_write_message(entry->conn, + msg, + NULL, + "application/json", + entry, + entry->rt->timeout); + entry->is_reply = TRUE; +} + +static void +rspamd_worker_drop_priv(struct rspamd_main *rspamd_main) +{ + if (rspamd_main->is_privileged) { + if 
(setgid(rspamd_main->workers_gid) == -1) { + msg_err_main("cannot setgid to %d (%s), aborting", + (gint) rspamd_main->workers_gid, + strerror(errno)); + exit(-errno); + } + + if (rspamd_main->cfg->rspamd_user && + initgroups(rspamd_main->cfg->rspamd_user, + rspamd_main->workers_gid) == -1) { + msg_err_main("initgroups failed (%s), aborting", strerror(errno)); + exit(-errno); + } + + if (setuid(rspamd_main->workers_uid) == -1) { + msg_err_main("cannot setuid to %d (%s), aborting", + (gint) rspamd_main->workers_uid, + strerror(errno)); + exit(-errno); + } + } +} + +static void +rspamd_worker_set_limits(struct rspamd_main *rspamd_main, + struct rspamd_worker_conf *cf) +{ + struct rlimit rlmt; + + if (cf->rlimit_nofile != 0) { + rlmt.rlim_cur = (rlim_t) cf->rlimit_nofile; + rlmt.rlim_max = (rlim_t) cf->rlimit_nofile; + + if (setrlimit(RLIMIT_NOFILE, &rlmt) == -1) { + msg_warn_main("cannot set files rlimit: %L, %s", + cf->rlimit_nofile, + strerror(errno)); + } + + memset(&rlmt, 0, sizeof(rlmt)); + + if (getrlimit(RLIMIT_NOFILE, &rlmt) == -1) { + msg_warn_main("cannot get max files rlimit: %HL, %s", + cf->rlimit_maxcore, + strerror(errno)); + } + else { + msg_info_main("set max file descriptors limit: %HL cur and %HL max", + (guint64) rlmt.rlim_cur, + (guint64) rlmt.rlim_max); + } + } + else { + /* Just report */ + if (getrlimit(RLIMIT_NOFILE, &rlmt) == -1) { + msg_warn_main("cannot get max files rlimit: %HL, %s", + cf->rlimit_maxcore, + strerror(errno)); + } + else { + msg_info_main("use system max file descriptors limit: %HL cur and %HL max", + (guint64) rlmt.rlim_cur, + (guint64) rlmt.rlim_max); + } + } + + if (rspamd_main->cores_throttling) { + msg_info_main("disable core files for the new worker as limits are reached"); + rlmt.rlim_cur = 0; + rlmt.rlim_max = 0; + + if (setrlimit(RLIMIT_CORE, &rlmt) == -1) { + msg_warn_main("cannot disable core dumps: error when setting limits: %s", + strerror(errno)); + } + } + else { + if (cf->rlimit_maxcore != 0) { + rlmt.rlim_cur 
= (rlim_t) cf->rlimit_maxcore; + rlmt.rlim_max = (rlim_t) cf->rlimit_maxcore; + + if (setrlimit(RLIMIT_CORE, &rlmt) == -1) { + msg_warn_main("cannot set max core size limit: %HL, %s", + cf->rlimit_maxcore, + strerror(errno)); + } + + /* Ensure that we did it */ + memset(&rlmt, 0, sizeof(rlmt)); + + if (getrlimit(RLIMIT_CORE, &rlmt) == -1) { + msg_warn_main("cannot get max core size rlimit: %HL, %s", + cf->rlimit_maxcore, + strerror(errno)); + } + else { + if (rlmt.rlim_cur != cf->rlimit_maxcore || + rlmt.rlim_max != cf->rlimit_maxcore) { + msg_warn_main("setting of core file limits was unsuccessful: " + "%HL was wanted, " + "but we have %HL cur and %HL max", + cf->rlimit_maxcore, + (guint64) rlmt.rlim_cur, + (guint64) rlmt.rlim_max); + } + else { + msg_info_main("set max core size limit: %HL cur and %HL max", + (guint64) rlmt.rlim_cur, + (guint64) rlmt.rlim_max); + } + } + } + else { + /* Just report */ + if (getrlimit(RLIMIT_CORE, &rlmt) == -1) { + msg_warn_main("cannot get max core size limit: %HL, %s", + cf->rlimit_maxcore, + strerror(errno)); + } + else { + msg_info_main("use system max core size limit: %HL cur and %HL max", + (guint64) rlmt.rlim_cur, + (guint64) rlmt.rlim_max); + } + } + } +} + +static void +rspamd_worker_on_term(EV_P_ ev_child *w, int revents) +{ + struct rspamd_worker *wrk = (struct rspamd_worker *) w->data; + + if (wrk->ppid == getpid()) { + if (wrk->term_handler) { + wrk->term_handler(EV_A_ w, wrk->srv, wrk); + } + else { + rspamd_check_termination_clause(wrk->srv, wrk, w->rstatus); + } + } + else { + /* Ignore SIGCHLD for not our children... 
*/ + } +} + +static void +rspamd_worker_heartbeat_cb(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_worker *wrk = (struct rspamd_worker *) w->data; + struct rspamd_srv_command cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.type = RSPAMD_SRV_HEARTBEAT; + rspamd_srv_send_command(wrk, EV_A, &cmd, -1, NULL, NULL); +} + +static void +rspamd_worker_heartbeat_start(struct rspamd_worker *wrk, struct ev_loop *event_loop) +{ + wrk->hb.heartbeat_ev.data = (void *) wrk; + ev_timer_init(&wrk->hb.heartbeat_ev, rspamd_worker_heartbeat_cb, + 0.0, wrk->srv->cfg->heartbeat_interval); + ev_timer_start(event_loop, &wrk->hb.heartbeat_ev); +} + +static void +rspamd_main_heartbeat_cb(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_worker *wrk = (struct rspamd_worker *) w->data; + gdouble time_from_last = ev_time(); + struct rspamd_main *rspamd_main; + static struct rspamd_control_command cmd; + struct tm tm; + gchar timebuf[64]; + gchar usec_buf[16]; + gint r; + + time_from_last -= wrk->hb.last_event; + rspamd_main = wrk->srv; + + if (wrk->hb.last_event > 0 && + time_from_last > 0 && + time_from_last >= rspamd_main->cfg->heartbeat_interval * 2) { + + rspamd_localtime(wrk->hb.last_event, &tm); + r = strftime(timebuf, sizeof(timebuf), "%F %H:%M:%S", &tm); + rspamd_snprintf(usec_buf, sizeof(usec_buf), "%.5f", + wrk->hb.last_event - (gdouble) (time_t) wrk->hb.last_event); + rspamd_snprintf(timebuf + r, sizeof(timebuf) - r, + "%s", usec_buf + 1); + + if (wrk->hb.nbeats > 0) { + /* First time lost event */ + cmd.type = RSPAMD_CONTROL_CHILD_CHANGE; + cmd.cmd.child_change.what = rspamd_child_offline; + cmd.cmd.child_change.pid = wrk->pid; + rspamd_control_broadcast_srv_cmd(rspamd_main, &cmd, wrk->pid); + msg_warn_main("lost heartbeat from worker type %s with pid %P, " + "last beat on: %s (%L beats received previously)", + g_quark_to_string(wrk->type), wrk->pid, + timebuf, + wrk->hb.nbeats); + wrk->hb.nbeats = -1; + /* TODO: send notify about worker problem */ + } + else { + 
wrk->hb.nbeats--; + msg_warn_main("lost %L heartbeat from worker type %s with pid %P, " + "last beat on: %s", + -(wrk->hb.nbeats), + g_quark_to_string(wrk->type), + wrk->pid, + timebuf); + + if (rspamd_main->cfg->heartbeats_loss_max > 0 && + -(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max) { + + + if (-(wrk->hb.nbeats) > rspamd_main->cfg->heartbeats_loss_max + 1) { + msg_err_main("force kill worker type %s with pid %P, " + "last beat on: %s; %L heartbeat lost", + g_quark_to_string(wrk->type), + wrk->pid, + timebuf, + -(wrk->hb.nbeats)); + kill(wrk->pid, SIGKILL); + } + else { + msg_err_main("terminate worker type %s with pid %P, " + "last beat on: %s; %L heartbeat lost", + g_quark_to_string(wrk->type), + wrk->pid, + timebuf, + -(wrk->hb.nbeats)); + kill(wrk->pid, SIGTERM); + } + } + } + } + else if (wrk->hb.nbeats < 0) { + rspamd_localtime(wrk->hb.last_event, &tm); + r = strftime(timebuf, sizeof(timebuf), "%F %H:%M:%S", &tm); + rspamd_snprintf(usec_buf, sizeof(usec_buf), "%.5f", + wrk->hb.last_event - (gdouble) (time_t) wrk->hb.last_event); + rspamd_snprintf(timebuf + r, sizeof(timebuf) - r, + "%s", usec_buf + 1); + + cmd.type = RSPAMD_CONTROL_CHILD_CHANGE; + cmd.cmd.child_change.what = rspamd_child_online; + cmd.cmd.child_change.pid = wrk->pid; + rspamd_control_broadcast_srv_cmd(rspamd_main, &cmd, wrk->pid); + msg_info_main("received heartbeat from worker type %s with pid %P, " + "last beat on: %s (%L beats lost previously)", + g_quark_to_string(wrk->type), wrk->pid, + timebuf, + -(wrk->hb.nbeats)); + wrk->hb.nbeats = 1; + /* TODO: send notify about worker restoration */ + } +} + +static void +rspamd_main_heartbeat_start(struct rspamd_worker *wrk, struct ev_loop *event_loop) +{ + wrk->hb.heartbeat_ev.data = (void *) wrk; + ev_timer_init(&wrk->hb.heartbeat_ev, rspamd_main_heartbeat_cb, + 0.0, wrk->srv->cfg->heartbeat_interval * 2); + ev_timer_start(event_loop, &wrk->hb.heartbeat_ev); +} + +static bool +rspamd_maybe_reuseport_socket(struct 
rspamd_worker_listen_socket *ls) +{ + if (ls->is_systemd) { + /* No need to reuseport */ + return true; + } + + if (ls->fd != -1 && rspamd_inet_address_get_af(ls->addr) == AF_UNIX) { + /* Just try listen */ + + if (listen(ls->fd, -1) == -1) { + return false; + } + + return true; + } + +#if defined(SO_REUSEPORT) && defined(SO_REUSEADDR) && defined(LINUX) + gint nfd = -1; + + if (ls->type == RSPAMD_WORKER_SOCKET_UDP) { + nfd = rspamd_inet_address_listen(ls->addr, + (ls->type == RSPAMD_WORKER_SOCKET_UDP ? SOCK_DGRAM : SOCK_STREAM), + RSPAMD_INET_ADDRESS_LISTEN_ASYNC | RSPAMD_INET_ADDRESS_LISTEN_REUSEPORT, + -1); + + if (nfd == -1) { + msg_warn("cannot create reuseport listen socket for %d: %s", + ls->fd, strerror(errno)); + nfd = ls->fd; + } + else { + if (ls->fd != -1) { + close(ls->fd); + } + ls->fd = nfd; + nfd = -1; + } + } + else { + /* + * Reuseport is broken with the current architecture, so it is easier not + * to use it at all + */ + nfd = ls->fd; + } +#endif + +#if 0 + /* This needed merely if we have reuseport for tcp, but for now it is disabled */ + /* This means that we have an fd with no listening enabled */ + if (nfd != -1) { + if (ls->type == RSPAMD_WORKER_SOCKET_TCP) { + if (listen (nfd, -1) == -1) { + return false; + } + } + } +#endif + + return true; +} + +/** + * Handles worker after fork returned zero + * @param wrk + * @param rspamd_main + * @param cf + * @param listen_sockets + */ +static void __attribute__((noreturn)) +rspamd_handle_child_fork(struct rspamd_worker *wrk, + struct rspamd_main *rspamd_main, + struct rspamd_worker_conf *cf, + GHashTable *listen_sockets) +{ + gint rc; + struct rlimit rlim; + + /* Update pid for logging */ + rspamd_log_on_fork(cf->type, rspamd_main->cfg, rspamd_main->logger); + wrk->pid = getpid(); + + /* Init PRNG after fork */ + rc = ottery_init(rspamd_main->cfg->libs_ctx->ottery_cfg); + if (rc != OTTERY_ERR_NONE) { + msg_err_main("cannot initialize PRNG: %d", rc); + abort(); + } + + rspamd_random_seed_fast(); 
+#ifdef HAVE_EVUTIL_RNG_INIT + evutil_secure_rng_init(); +#endif + + /* + * Libev stores all signals in a global table, so + * previous handlers must be explicitly detached and forgotten + * before starting a new loop + */ + ev_signal_stop(rspamd_main->event_loop, &rspamd_main->int_ev); + ev_signal_stop(rspamd_main->event_loop, &rspamd_main->term_ev); + ev_signal_stop(rspamd_main->event_loop, &rspamd_main->hup_ev); + ev_signal_stop(rspamd_main->event_loop, &rspamd_main->usr1_ev); + /* Remove the inherited event base */ + ev_loop_destroy(rspamd_main->event_loop); + rspamd_main->event_loop = NULL; + + /* Close unused sockets */ + GHashTableIter it; + gpointer k, v; + + + g_hash_table_iter_init(&it, listen_sockets); + + /* + * Close listen sockets of not our process (inherited from other forks) + */ + while (g_hash_table_iter_next(&it, &k, &v)) { + GList *elt = (GList *) v; + GList *our = cf->listen_socks; + + if (g_list_position(our, elt) == -1) { + GList *cur = elt; + + while (cur) { + struct rspamd_worker_listen_socket *ls = + (struct rspamd_worker_listen_socket *) cur->data; + + if (ls->fd != -1 && close(ls->fd) == -1) { + msg_err("cannot close fd %d (addr = %s): %s", + ls->fd, + rspamd_inet_address_to_string_pretty(ls->addr), + strerror(errno)); + } + + ls->fd = -1; + + cur = g_list_next(cur); + } + } + } + + /* Reuseport before dropping privs */ + GList *cur = cf->listen_socks; + + while (cur) { + struct rspamd_worker_listen_socket *ls = + (struct rspamd_worker_listen_socket *) cur->data; + + if (!rspamd_maybe_reuseport_socket(ls)) { + msg_err("cannot listen on socket %s: %s", + rspamd_inet_address_to_string_pretty(ls->addr), + strerror(errno)); + } + + cur = g_list_next(cur); + } + + /* Drop privileges */ + rspamd_worker_drop_priv(rspamd_main); + /* Set limits */ + rspamd_worker_set_limits(rspamd_main, cf); + /* Re-set stack limit */ + getrlimit(RLIMIT_STACK, &rlim); + rlim.rlim_cur = 100 * 1024 * 1024; + rlim.rlim_max = rlim.rlim_cur; + setrlimit(RLIMIT_STACK, 
&rlim); + + if (cf->bind_conf) { + rspamd_setproctitle("%s process (%s)", cf->worker->name, + cf->bind_conf->bind_line); + } + else { + rspamd_setproctitle("%s process", cf->worker->name); + } + + if (rspamd_main->pfh) { + rspamd_pidfile_close(rspamd_main->pfh); + } + + if (rspamd_main->cfg->log_silent_workers) { + rspamd_log_set_log_level(rspamd_main->logger, G_LOG_LEVEL_MESSAGE); + } + + wrk->start_time = rspamd_get_calendar_ticks(); + + if (cf->bind_conf) { + GString *listen_conf_stringified = g_string_new(NULL); + struct rspamd_worker_bind_conf *cur_conf; + + LL_FOREACH(cf->bind_conf, cur_conf) + { + if (cur_conf->next) { + rspamd_printf_gstring(listen_conf_stringified, "%s, ", + cur_conf->bind_line); + } + else { + rspamd_printf_gstring(listen_conf_stringified, "%s", + cur_conf->bind_line); + } + } + + msg_info_main("starting %s process %P (%d); listen on: %v", + cf->worker->name, + getpid(), wrk->index, listen_conf_stringified); + g_string_free(listen_conf_stringified, TRUE); + } + else { + msg_info_main("starting %s process %P (%d); no listen", + cf->worker->name, + getpid(), wrk->index); + } + /* Close parent part of socketpair */ + close(wrk->control_pipe[0]); + close(wrk->srv_pipe[0]); + /* + * Read comments in `rspamd_handle_main_fork` for details why these channel + * is blocking. + */ + rspamd_socket_nonblocking(wrk->control_pipe[1]); +#if 0 + rspamd_socket_nonblocking (wrk->srv_pipe[1]); +#endif + rspamd_main->cfg->cur_worker = wrk; + /* Execute worker (this function should not return normally!) 
*/ + cf->worker->worker_start_func(wrk); + /* To distinguish from normal termination */ + exit(EXIT_FAILURE); +} + +static void +rspamd_handle_main_fork(struct rspamd_worker *wrk, + struct rspamd_main *rspamd_main, + struct rspamd_worker_conf *cf, + struct ev_loop *ev_base) +{ + /* Close worker part of socketpair */ + close(wrk->control_pipe[1]); + close(wrk->srv_pipe[1]); + + /* + * There are no reasons why control pipes are blocking: the messages + * there are rare and are strictly bounded by command sizes, so if we block + * on some pipe, it is ok, as we still poll that for all operations. + * It is also impossible to block on writing in normal conditions. + * And if the conditions are not normal, e.g. a worker is unresponsive, then + * we can safely think that the non-blocking behaviour as it is implemented + * currently will not make things better, as it would lead to incomplete + * reads/writes that are not handled anyhow and are totally broken from the + * beginning. + */ +#if 0 + rspamd_socket_nonblocking (wrk->srv_pipe[0]); +#endif + rspamd_socket_nonblocking(wrk->control_pipe[0]); + + rspamd_srv_start_watching(rspamd_main, wrk, ev_base); + /* Child event */ + wrk->cld_ev.data = wrk; + ev_child_init(&wrk->cld_ev, rspamd_worker_on_term, wrk->pid, 0); + ev_child_start(rspamd_main->event_loop, &wrk->cld_ev); + /* Heartbeats */ + rspamd_main_heartbeat_start(wrk, rspamd_main->event_loop); + /* Insert worker into worker's table, pid is index */ + g_hash_table_insert(rspamd_main->workers, + GSIZE_TO_POINTER(wrk->pid), wrk); + +#if defined(SO_REUSEPORT) && defined(SO_REUSEADDR) && defined(LINUX) + /* + * Close listen sockets in the main process once a child is handling them, + * if we have reuseport + */ + GList *cur = cf->listen_socks; + + while (cur) { + struct rspamd_worker_listen_socket *ls = + (struct rspamd_worker_listen_socket *) cur->data; + + if (ls->fd != -1 && ls->type == RSPAMD_WORKER_SOCKET_UDP) { + close(ls->fd); + ls->fd = -1; + } + + cur = 
g_list_next(cur); + } +#endif +} + +#ifndef SOCK_SEQPACKET +#define SOCK_SEQPACKET SOCK_DGRAM +#endif +struct rspamd_worker * +rspamd_fork_worker(struct rspamd_main *rspamd_main, + struct rspamd_worker_conf *cf, + guint index, + struct ev_loop *ev_base, + rspamd_worker_term_cb term_handler, + GHashTable *listen_sockets) +{ + struct rspamd_worker *wrk; + + /* Starting worker process */ + wrk = (struct rspamd_worker *) g_malloc0(sizeof(struct rspamd_worker)); + + if (!rspamd_socketpair(wrk->control_pipe, SOCK_SEQPACKET)) { + msg_err("socketpair failure: %s", strerror(errno)); + rspamd_hard_terminate(rspamd_main); + } + + if (!rspamd_socketpair(wrk->srv_pipe, SOCK_SEQPACKET)) { + msg_err("socketpair failure: %s", strerror(errno)); + rspamd_hard_terminate(rspamd_main); + } + + if (cf->bind_conf) { + msg_info_main("prepare to fork process %s (%d); listen on: %s", + cf->worker->name, + index, cf->bind_conf->name); + } + else { + msg_info_main("prepare to fork process %s (%d), no bind socket", + cf->worker->name, + index); + } + + wrk->srv = rspamd_main; + wrk->type = cf->type; + wrk->cf = cf; + wrk->flags = cf->worker->flags; + REF_RETAIN(cf); + wrk->index = index; + wrk->ctx = cf->ctx; + wrk->ppid = getpid(); + wrk->pid = fork(); + wrk->cores_throttled = rspamd_main->cores_throttling; + wrk->term_handler = term_handler; + wrk->control_events_pending = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, rspamd_pending_control_free); + + switch (wrk->pid) { + case 0: + rspamd_current_worker = wrk; + rspamd_handle_child_fork(wrk, rspamd_main, cf, listen_sockets); + break; + case -1: + msg_err_main("cannot fork main process: %s", strerror(errno)); + + if (rspamd_main->pfh) { + rspamd_pidfile_remove(rspamd_main->pfh); + } + + rspamd_hard_terminate(rspamd_main); + break; + default: + rspamd_handle_main_fork(wrk, rspamd_main, cf, ev_base); + break; + } + + return wrk; +} + +void rspamd_worker_block_signals(void) +{ + sigset_t set; + + sigemptyset(&set); + 
sigaddset(&set, SIGTERM); + sigaddset(&set, SIGINT); + sigaddset(&set, SIGHUP); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGUSR2); + sigprocmask(SIG_BLOCK, &set, NULL); +} + +void rspamd_worker_unblock_signals(void) +{ + sigset_t set; + + sigemptyset(&set); + sigaddset(&set, SIGTERM); + sigaddset(&set, SIGINT); + sigaddset(&set, SIGHUP); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGUSR2); + sigprocmask(SIG_UNBLOCK, &set, NULL); +} + +void rspamd_hard_terminate(struct rspamd_main *rspamd_main) +{ + GHashTableIter it; + gpointer k, v; + struct rspamd_worker *w; + sigset_t set; + + /* Block all signals */ + sigemptyset(&set); + sigaddset(&set, SIGTERM); + sigaddset(&set, SIGINT); + sigaddset(&set, SIGHUP); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGUSR2); + sigaddset(&set, SIGCHLD); + sigprocmask(SIG_BLOCK, &set, NULL); + + /* We need to terminate all workers that might be already spawned */ + rspamd_worker_block_signals(); + g_hash_table_iter_init(&it, rspamd_main->workers); + + while (g_hash_table_iter_next(&it, &k, &v)) { + w = v; + msg_err_main("kill worker %P as Rspamd is terminating due to " + "an unrecoverable error", + w->pid); + kill(w->pid, SIGKILL); + } + + msg_err_main("shutting down Rspamd due to fatal error"); + + rspamd_log_close(rspamd_main->logger); + exit(EXIT_FAILURE); +} + +gboolean +rspamd_worker_is_scanner(struct rspamd_worker *w) +{ + + if (w) { + return !!(w->flags & RSPAMD_WORKER_SCANNER); + } + + return FALSE; +} + +gboolean +rspamd_worker_is_primary_controller(struct rspamd_worker *w) +{ + + if (w) { + return !!(w->flags & RSPAMD_WORKER_CONTROLLER) && w->index == 0; + } + + return FALSE; +} + +gboolean +rspamd_worker_check_controller_presence(struct rspamd_worker *w) +{ + if (w->index == 0) { + GQuark our_type = w->type; + gboolean controller_seen = FALSE; + GList *cur; + + enum { + low_priority_worker, + high_priority_worker + } our_priority; + + if (our_type == g_quark_from_static_string("rspamd_proxy")) { + our_priority = 
low_priority_worker; + } + else if (our_type == g_quark_from_static_string("normal")) { + our_priority = high_priority_worker; + } + else { + msg_err("function is called for a wrong worker type: %s", g_quark_to_string(our_type)); + return FALSE; + } + + cur = w->srv->cfg->workers; + + while (cur) { + struct rspamd_worker_conf *cf; + + cf = (struct rspamd_worker_conf *) cur->data; + + if (our_priority == low_priority_worker) { + if ((cf->type == g_quark_from_static_string("controller")) || + (cf->type == g_quark_from_static_string("normal"))) { + + if (cf->enabled && cf->count >= 0) { + controller_seen = TRUE; + break; + } + } + } + else { + if (cf->type == g_quark_from_static_string("controller")) { + if (cf->enabled && cf->count >= 0) { + controller_seen = TRUE; + break; + } + } + } + + cur = g_list_next(cur); + } + + if (!controller_seen) { + msg_info("no controller or normal workers defined, execute " + "controller periodics in this worker"); + w->flags |= RSPAMD_WORKER_CONTROLLER; + return TRUE; + } + } + + return FALSE; +} + +struct rspamd_worker_session_elt { + void *ptr; + guint *pref; + const gchar *tag; + time_t when; +}; + +struct rspamd_worker_session_cache { + struct ev_loop *ev_base; + GHashTable *cache; + struct rspamd_config *cfg; + struct ev_timer periodic; +}; + +static gint +rspamd_session_cache_sort_cmp(gconstpointer pa, gconstpointer pb) +{ + const struct rspamd_worker_session_elt + *e1 = *(const struct rspamd_worker_session_elt **) pa, + *e2 = *(const struct rspamd_worker_session_elt **) pb; + + return e2->when < e1->when; +} + +static void +rspamd_sessions_cache_periodic(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_worker_session_cache *c = + (struct rspamd_worker_session_cache *) w->data; + GHashTableIter it; + gchar timebuf[32]; + gpointer k, v; + struct rspamd_worker_session_elt *elt; + struct tm tms; + GPtrArray *res; + guint i; + + if (g_hash_table_size(c->cache) > c->cfg->max_sessions_cache) { + res = 
g_ptr_array_sized_new(g_hash_table_size(c->cache)); + g_hash_table_iter_init(&it, c->cache); + + while (g_hash_table_iter_next(&it, &k, &v)) { + g_ptr_array_add(res, v); + } + + msg_err("sessions cache is overflowed %d elements where %d is limit", + (gint) res->len, (gint) c->cfg->max_sessions_cache); + g_ptr_array_sort(res, rspamd_session_cache_sort_cmp); + + PTR_ARRAY_FOREACH(res, i, elt) + { + rspamd_localtime(elt->when, &tms); + strftime(timebuf, sizeof(timebuf), "%F %H:%M:%S", &tms); + + msg_warn("redundant session; ptr: %p, " + "tag: %s, refcount: %d, time: %s", + elt->ptr, elt->tag ? elt->tag : "unknown", + elt->pref ? *elt->pref : 0, + timebuf); + } + } + + ev_timer_again(EV_A_ w); +} + +void * +rspamd_worker_session_cache_new(struct rspamd_worker *w, + struct ev_loop *ev_base) +{ + struct rspamd_worker_session_cache *c; + static const gdouble periodic_interval = 60.0; + + c = g_malloc0(sizeof(*c)); + c->ev_base = ev_base; + c->cache = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, g_free); + c->cfg = w->srv->cfg; + c->periodic.data = c; + ev_timer_init(&c->periodic, rspamd_sessions_cache_periodic, periodic_interval, + periodic_interval); + ev_timer_start(ev_base, &c->periodic); + + return c; +} + + +void rspamd_worker_session_cache_add(void *cache, const gchar *tag, + guint *pref, void *ptr) +{ + struct rspamd_worker_session_cache *c = cache; + struct rspamd_worker_session_elt *elt; + + elt = g_malloc0(sizeof(*elt)); + elt->pref = pref; + elt->ptr = ptr; + elt->tag = tag; + elt->when = time(NULL); + + g_hash_table_insert(c->cache, elt->ptr, elt); +} + + +void rspamd_worker_session_cache_remove(void *cache, void *ptr) +{ + struct rspamd_worker_session_cache *c = cache; + + g_hash_table_remove(c->cache, ptr); +} + +static void +rspamd_worker_monitored_on_change(struct rspamd_monitored_ctx *ctx, + struct rspamd_monitored *m, gboolean alive, + void *ud) +{ + struct rspamd_worker *worker = ud; + struct rspamd_config *cfg = worker->srv->cfg; + 
struct ev_loop *ev_base; + guchar tag[RSPAMD_MONITORED_TAG_LEN]; + static struct rspamd_srv_command srv_cmd; + + rspamd_monitored_get_tag(m, tag); + ev_base = rspamd_monitored_ctx_get_ev_base(ctx); + memset(&srv_cmd, 0, sizeof(srv_cmd)); + srv_cmd.type = RSPAMD_SRV_MONITORED_CHANGE; + rspamd_strlcpy(srv_cmd.cmd.monitored_change.tag, tag, + sizeof(srv_cmd.cmd.monitored_change.tag)); + srv_cmd.cmd.monitored_change.alive = alive; + srv_cmd.cmd.monitored_change.sender = getpid(); + msg_info_config("broadcast monitored update for %s: %s", + srv_cmd.cmd.monitored_change.tag, alive ? "alive" : "dead"); + + rspamd_srv_send_command(worker, ev_base, &srv_cmd, -1, NULL, NULL); +} + +void rspamd_worker_init_monitored(struct rspamd_worker *worker, + struct ev_loop *ev_base, + struct rspamd_dns_resolver *resolver) +{ + rspamd_monitored_ctx_config(worker->srv->cfg->monitored_ctx, + worker->srv->cfg, ev_base, resolver->r, + rspamd_worker_monitored_on_change, worker); +} + +#ifdef HAVE_SA_SIGINFO + +static struct rspamd_main *saved_main = NULL; +static gboolean +rspamd_crash_propagate(gpointer key, gpointer value, gpointer unused) +{ + struct rspamd_worker *w = value; + + /* Kill children softly */ + kill(w->pid, SIGTERM); + + return TRUE; +} + +#ifdef BACKWARD_ENABLE +/* See backtrace.cxx */ +extern void rspamd_print_crash(void); +#endif + +static void +rspamd_crash_sig_handler(int sig, siginfo_t *info, void *ctx) +{ + struct sigaction sa; + ucontext_t *uap = ctx; + pid_t pid; + + pid = getpid(); + msg_err("caught fatal signal %d(%s), " + "pid: %P, trace: ", + sig, strsignal(sig), pid); + (void) uap; +#ifdef BACKWARD_ENABLE + rspamd_print_crash(); +#endif + msg_err("please see Rspamd FAQ to learn how to dump core files and how to " + "fill a bug report"); + + if (saved_main) { + if (pid == saved_main->pid) { + /* + * Main process has crashed, propagate crash further to trigger + * monitoring alerts and mass panic + */ + g_hash_table_foreach_remove(saved_main->workers, + 
rspamd_crash_propagate, NULL); + } + } + + /* + * Invoke signal with the default handler + */ + sigemptyset(&sa.sa_mask); + sa.sa_handler = SIG_DFL; + sa.sa_flags = 0; + sigaction(sig, &sa, NULL); + kill(pid, sig); +} +#endif + +RSPAMD_NO_SANITIZE void +rspamd_set_crash_handler(struct rspamd_main *rspamd_main) +{ +#ifdef HAVE_SA_SIGINFO + struct sigaction sa; + +#ifdef HAVE_SIGALTSTACK + void *stack_mem; + stack_t ss; + memset(&ss, 0, sizeof ss); + + ss.ss_size = MAX(SIGSTKSZ, 8192 * 4); + stack_mem = g_malloc0(ss.ss_size); + ss.ss_sp = stack_mem; + sigaltstack(&ss, NULL); +#endif + saved_main = rspamd_main; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = &rspamd_crash_sig_handler; + sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_ONSTACK; + sigaction(SIGSEGV, &sa, NULL); + sigaction(SIGBUS, &sa, NULL); + sigaction(SIGABRT, &sa, NULL); + sigaction(SIGFPE, &sa, NULL); + sigaction(SIGSYS, &sa, NULL); +#endif +} + +RSPAMD_NO_SANITIZE void rspamd_unset_crash_handler(struct rspamd_main *unused_) +{ +#ifdef HAVE_SIGALTSTACK + int ret; + stack_t ss; + ret = sigaltstack(NULL, &ss); + + if (ret != -1) { + if (ss.ss_size > 0 && ss.ss_sp) { + g_free(ss.ss_sp); + } + + ss.ss_size = 0; + ss.ss_sp = NULL; +#ifdef SS_DISABLE + ss.ss_flags |= SS_DISABLE; +#endif + sigaltstack(&ss, NULL); + } +#endif +} + +static void +rspamd_enable_accept_event(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_worker_accept_event *ac_ev = + (struct rspamd_worker_accept_event *) w->data; + + ev_timer_stop(EV_A_ w); + ev_io_start(EV_A_ & ac_ev->accept_ev); +} + +void rspamd_worker_throttle_accept_events(gint sock, void *data) +{ + struct rspamd_worker_accept_event *head, *cur; + const gdouble throttling = 0.5; + + head = (struct rspamd_worker_accept_event *) data; + + DL_FOREACH(head, cur) + { + + ev_io_stop(cur->event_loop, &cur->accept_ev); + cur->throttling_ev.data = cur; + ev_timer_init(&cur->throttling_ev, rspamd_enable_accept_event, + throttling, 0.0); + ev_timer_start(cur->event_loop, 
&cur->throttling_ev); + } +} + +gboolean +rspamd_check_termination_clause(struct rspamd_main *rspamd_main, + struct rspamd_worker *wrk, + int res) +{ + gboolean need_refork = TRUE; + + if (wrk->state != rspamd_worker_state_running || rspamd_main->wanna_die || + (wrk->flags & RSPAMD_WORKER_OLD_CONFIG)) { + /* Do not refork workers that are intended to be terminated */ + need_refork = FALSE; + } + + if (WIFEXITED(res) && WEXITSTATUS(res) == 0) { + /* Normal worker termination, do not fork one more */ + + if (wrk->flags & RSPAMD_WORKER_OLD_CONFIG) { + /* Never re-fork old workers */ + msg_info_main("%s process %P terminated normally", + g_quark_to_string(wrk->type), + wrk->pid); + need_refork = FALSE; + } + else { + if (wrk->hb.nbeats < 0 && rspamd_main->cfg->heartbeats_loss_max > 0 && + -(wrk->hb.nbeats) >= rspamd_main->cfg->heartbeats_loss_max) { + msg_info_main("%s process %P terminated normally, but lost %L " + "heartbeats, refork it", + g_quark_to_string(wrk->type), + wrk->pid, + -(wrk->hb.nbeats)); + need_refork = TRUE; + } + else { + msg_info_main("%s process %P terminated normally", + g_quark_to_string(wrk->type), + wrk->pid); + need_refork = FALSE; + } + } + } + else { + if (WIFSIGNALED(res)) { +#ifdef WCOREDUMP + if (WCOREDUMP(res)) { + msg_warn_main( + "%s process %P terminated abnormally by signal: %s" + " and created core file; please see Rspamd FAQ " + "to learn how to extract data from core file and " + "fill a bug report", + g_quark_to_string(wrk->type), + wrk->pid, + g_strsignal(WTERMSIG(res))); + } + else { +#ifdef HAVE_SYS_RESOURCE_H + struct rlimit rlmt; + (void) getrlimit(RLIMIT_CORE, &rlmt); + + msg_warn_main( + "%s process %P terminated abnormally with exit code %d by " + "signal: %s" + " but NOT created core file (throttled=%s); " + "core file limits: %L current, %L max", + g_quark_to_string(wrk->type), + wrk->pid, + WEXITSTATUS(res), + g_strsignal(WTERMSIG(res)), + wrk->cores_throttled ? 
"yes" : "no", + (gint64) rlmt.rlim_cur, + (gint64) rlmt.rlim_max); +#else + msg_warn_main( + "%s process %P terminated abnormally with exit code %d by signal: %s" + " but NOT created core file (throttled=%s); ", + g_quark_to_string(wrk->type), + wrk->pid, WEXITSTATUS(res), + g_strsignal(WTERMSIG(res)), + wrk->cores_throttled ? "yes" : "no"); +#endif + } +#else + msg_warn_main( + "%s process %P terminated abnormally with exit code %d by signal: %s", + g_quark_to_string(wrk->type), + wrk->pid, WEXITSTATUS(res), + g_strsignal(WTERMSIG(res))); +#endif + if (WTERMSIG(res) == SIGUSR2) { + /* + * It is actually race condition when not started process + * has been requested to be reloaded. + * + * We shouldn't refork on this + */ + need_refork = FALSE; + } + } + else { + msg_warn_main("%s process %P terminated abnormally " + "(but it was not killed by a signal) " + "with exit code %d", + g_quark_to_string(wrk->type), + wrk->pid, + WEXITSTATUS(res)); + } + } + + return need_refork; +} + +#ifdef WITH_HYPERSCAN +gboolean +rspamd_worker_hyperscan_ready(struct rspamd_main *rspamd_main, + struct rspamd_worker *worker, gint fd, + gint attached_fd, + struct rspamd_control_command *cmd, + gpointer ud) +{ + struct rspamd_control_reply rep; + struct rspamd_re_cache *cache = worker->srv->cfg->re_cache; + + memset(&rep, 0, sizeof(rep)); + rep.type = RSPAMD_CONTROL_HYPERSCAN_LOADED; + + if (rspamd_re_cache_is_hs_loaded(cache) != RSPAMD_HYPERSCAN_LOADED_FULL || + cmd->cmd.hs_loaded.forced) { + + msg_info("loading hyperscan expressions after receiving compilation " + "notice: %s", + (rspamd_re_cache_is_hs_loaded(cache) != RSPAMD_HYPERSCAN_LOADED_FULL) ? 
"new db" : "forced update"); + rep.reply.hs_loaded.status = rspamd_re_cache_load_hyperscan( + worker->srv->cfg->re_cache, cmd->cmd.hs_loaded.cache_dir, false); + } + + if (write(fd, &rep, sizeof(rep)) != sizeof(rep)) { + msg_err("cannot write reply to the control socket: %s", + strerror(errno)); + } + + return TRUE; +} +#endif /* With Hyperscan */ + +gboolean +rspamd_worker_check_context(gpointer ctx, guint64 magic) +{ + struct rspamd_abstract_worker_ctx *actx = (struct rspamd_abstract_worker_ctx *) ctx; + + return actx->magic == magic; +} + +static gboolean +rspamd_worker_log_pipe_handler(struct rspamd_main *rspamd_main, + struct rspamd_worker *worker, gint fd, + gint attached_fd, + struct rspamd_control_command *cmd, + gpointer ud) +{ + struct rspamd_config *cfg = ud; + struct rspamd_worker_log_pipe *lp; + struct rspamd_control_reply rep; + + memset(&rep, 0, sizeof(rep)); + rep.type = RSPAMD_CONTROL_LOG_PIPE; + + if (attached_fd != -1) { + lp = g_malloc0(sizeof(*lp)); + lp->fd = attached_fd; + lp->type = cmd->cmd.log_pipe.type; + + DL_APPEND(cfg->log_pipes, lp); + msg_info("added new log pipe"); + } + else { + rep.reply.log_pipe.status = ENOENT; + msg_err("cannot attach log pipe: invalid fd"); + } + + if (write(fd, &rep, sizeof(rep)) != sizeof(rep)) { + msg_err("cannot write reply to the control socket: %s", + strerror(errno)); + } + + return TRUE; +} + +static gboolean +rspamd_worker_monitored_handler(struct rspamd_main *rspamd_main, + struct rspamd_worker *worker, gint fd, + gint attached_fd, + struct rspamd_control_command *cmd, + gpointer ud) +{ + struct rspamd_control_reply rep; + struct rspamd_monitored *m; + struct rspamd_monitored_ctx *mctx = worker->srv->cfg->monitored_ctx; + struct rspamd_config *cfg = ud; + + memset(&rep, 0, sizeof(rep)); + rep.type = RSPAMD_CONTROL_MONITORED_CHANGE; + + if (cmd->cmd.monitored_change.sender != getpid()) { + m = rspamd_monitored_by_tag(mctx, cmd->cmd.monitored_change.tag); + + if (m != NULL) { + 
rspamd_monitored_set_alive(m, cmd->cmd.monitored_change.alive); + rep.reply.monitored_change.status = 1; + msg_info_config("updated monitored status for %s: %s", + cmd->cmd.monitored_change.tag, + cmd->cmd.monitored_change.alive ? "alive" : "dead"); + } + else { + msg_err("cannot find monitored by tag: %*s", 32, + cmd->cmd.monitored_change.tag); + rep.reply.monitored_change.status = 0; + } + } + + if (write(fd, &rep, sizeof(rep)) != sizeof(rep)) { + msg_err("cannot write reply to the control socket: %s", + strerror(errno)); + } + + return TRUE; +} + +void rspamd_worker_init_scanner(struct rspamd_worker *worker, + struct ev_loop *ev_base, + struct rspamd_dns_resolver *resolver, + struct rspamd_lang_detector **plang_det) +{ + rspamd_stat_init(worker->srv->cfg, ev_base); +#ifdef WITH_HYPERSCAN + rspamd_control_worker_add_cmd_handler(worker, + RSPAMD_CONTROL_HYPERSCAN_LOADED, + rspamd_worker_hyperscan_ready, + NULL); +#endif + rspamd_control_worker_add_cmd_handler(worker, + RSPAMD_CONTROL_LOG_PIPE, + rspamd_worker_log_pipe_handler, + worker->srv->cfg); + rspamd_control_worker_add_cmd_handler(worker, + RSPAMD_CONTROL_MONITORED_CHANGE, + rspamd_worker_monitored_handler, + worker->srv->cfg); + + *plang_det = worker->srv->cfg->lang_det; +} + +void rspamd_controller_store_saved_stats(struct rspamd_main *rspamd_main, + struct rspamd_config *cfg) +{ + struct rspamd_stat *stat; + ucl_object_t *top, *sub; + struct ucl_emitter_functions *efuncs; + gint i, fd; + FILE *fp; + gchar fpath[PATH_MAX]; + + if (cfg->stats_file == NULL) { + return; + } + + rspamd_snprintf(fpath, sizeof(fpath), "%s.XXXXXXXX", cfg->stats_file); + fd = g_mkstemp_full(fpath, O_WRONLY | O_TRUNC, 00644); + + if (fd == -1) { + msg_err_config("cannot open for writing controller stats from %s: %s", + fpath, strerror(errno)); + return; + } + + fp = fdopen(fd, "w"); + stat = rspamd_main->stat; + + top = ucl_object_typed_new(UCL_OBJECT); + ucl_object_insert_key(top, ucl_object_fromint(stat->messages_scanned), 
"scanned", 0, false); + ucl_object_insert_key(top, ucl_object_fromint(stat->messages_learned), "learned", 0, false); + + if (stat->messages_scanned > 0) { + sub = ucl_object_typed_new(UCL_OBJECT); + for (i = METRIC_ACTION_REJECT; i <= METRIC_ACTION_NOACTION; i++) { + ucl_object_insert_key(sub, + ucl_object_fromint(stat->actions_stat[i]), + rspamd_action_to_str(i), 0, false); + } + ucl_object_insert_key(top, sub, "actions", 0, false); + } + + ucl_object_insert_key(top, + ucl_object_fromint(stat->connections_count), + "connections", 0, false); + ucl_object_insert_key(top, + ucl_object_fromint(stat->control_connections_count), + "control_connections", 0, false); + + efuncs = ucl_object_emit_file_funcs(fp); + if (!ucl_object_emit_full(top, UCL_EMIT_JSON_COMPACT, + efuncs, NULL)) { + msg_err_config("cannot write stats to %s: %s", + fpath, strerror(errno)); + + unlink(fpath); + } + else { + if (rename(fpath, cfg->stats_file) == -1) { + msg_err_config("cannot rename stats from %s to %s: %s", + fpath, cfg->stats_file, strerror(errno)); + } + } + + ucl_object_unref(top); + fclose(fp); + ucl_object_emit_funcs_free(efuncs); +} + +static ev_timer rrd_timer; + +void rspamd_controller_on_terminate(struct rspamd_worker *worker, + struct rspamd_rrd_file *rrd) +{ + struct rspamd_abstract_worker_ctx *ctx; + + ctx = (struct rspamd_abstract_worker_ctx *) worker->ctx; + rspamd_controller_store_saved_stats(worker->srv, worker->srv->cfg); + + if (rrd) { + ev_timer_stop(ctx->event_loop, &rrd_timer); + msg_info("closing rrd file: %s", rrd->filename); + rspamd_rrd_close(rrd); + } +} + +static void +rspamd_controller_load_saved_stats(struct rspamd_main *rspamd_main, + struct rspamd_config *cfg) +{ + struct ucl_parser *parser; + ucl_object_t *obj; + const ucl_object_t *elt, *subelt; + struct rspamd_stat *stat, stat_copy; + gint i; + + if (cfg->stats_file == NULL) { + return; + } + + if (access(cfg->stats_file, R_OK) == -1) { + msg_err_config("cannot load controller stats from %s: %s", + 
cfg->stats_file, strerror(errno)); + return; + } + + parser = ucl_parser_new(0); + + if (!ucl_parser_add_file(parser, cfg->stats_file)) { + msg_err_config("cannot parse controller stats from %s: %s", + cfg->stats_file, ucl_parser_get_error(parser)); + ucl_parser_free(parser); + + return; + } + + obj = ucl_parser_get_object(parser); + ucl_parser_free(parser); + + stat = rspamd_main->stat; + memcpy(&stat_copy, stat, sizeof(stat_copy)); + + elt = ucl_object_lookup(obj, "scanned"); + + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + stat_copy.messages_scanned = ucl_object_toint(elt); + } + + elt = ucl_object_lookup(obj, "learned"); + + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + stat_copy.messages_learned = ucl_object_toint(elt); + } + + elt = ucl_object_lookup(obj, "actions"); + + if (elt != NULL) { + for (i = METRIC_ACTION_REJECT; i <= METRIC_ACTION_NOACTION; i++) { + subelt = ucl_object_lookup(elt, rspamd_action_to_str(i)); + + if (subelt && ucl_object_type(subelt) == UCL_INT) { + stat_copy.actions_stat[i] = ucl_object_toint(subelt); + } + } + } + + elt = ucl_object_lookup(obj, "connections_count"); + + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + stat_copy.connections_count = ucl_object_toint(elt); + } + + elt = ucl_object_lookup(obj, "control_connections_count"); + + if (elt != NULL && ucl_object_type(elt) == UCL_INT) { + stat_copy.control_connections_count = ucl_object_toint(elt); + } + + ucl_object_unref(obj); + memcpy(stat, &stat_copy, sizeof(stat_copy)); +} + +struct rspamd_controller_periodics_cbdata { + struct rspamd_worker *worker; + struct rspamd_rrd_file *rrd; + struct rspamd_stat *stat; + ev_timer save_stats_event; +}; + +static void +rspamd_controller_rrd_update(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_controller_periodics_cbdata *cbd = + (struct rspamd_controller_periodics_cbdata *) w->data; + struct rspamd_stat *stat; + GArray ar; + gdouble points[METRIC_ACTION_MAX]; + GError *err = NULL; + guint i; + + 
g_assert(cbd->rrd != NULL); + stat = cbd->stat; + + for (i = METRIC_ACTION_REJECT; i < METRIC_ACTION_MAX; i++) { + points[i] = stat->actions_stat[i]; + } + + ar.data = (gchar *) points; + ar.len = sizeof(points); + + if (!rspamd_rrd_add_record(cbd->rrd, &ar, rspamd_get_calendar_ticks(), + &err)) { + msg_err("cannot update rrd file: %e", err); + g_error_free(err); + } + + /* Plan new event */ + ev_timer_again(EV_A_ w); +} +/** + * Timer callback that stores the saved stats: it calls + * rspamd_controller_store_saved_stats() with the server and config of the + * worker kept in the callback data (w->data) and then calls ev_timer_again() + * on the same watcher. + * @param w timer watcher; w->data is read as struct rspamd_controller_periodics_cbdata + * @param revents not referenced by this callback + */ +static void +rspamd_controller_stats_save_periodic(EV_P_ ev_timer *w, int revents) +{ + struct rspamd_controller_periodics_cbdata *cbd = + (struct rspamd_controller_periodics_cbdata *) w->data; + + rspamd_controller_store_saved_stats(cbd->worker->srv, cbd->worker->srv->cfg); + ev_timer_again(EV_A_ w); +} +/** + * Sets up controller duties for a worker. + * Saved stats are loaded for every worker. The worker with index 0 additionally + * starts the stats saving timer (60 second interval), watches maps as + * RSPAMD_MAP_WATCH_PRIMARY_CONTROLLER, opens the RRD file named by cfg->rrd_file + * (when prrd is given) and starts the RRD update timer (interval 1.0), and calls + * rspamd_worker_init_monitored() unless cfg->disable_monitored is set. + * Any other worker only watches maps as RSPAMD_MAP_WATCH_SCANNER. + * @param worker worker being initialised; worker->ctx is used as + * struct rspamd_abstract_worker_ctx + * @param prrd if not NULL and the worker index is 0, receives the opened RRD + * file, or NULL when no rrd_file is configured or loading fails + */ +void rspamd_worker_init_controller(struct rspamd_worker *worker, + struct rspamd_rrd_file **prrd) +{ + struct rspamd_abstract_worker_ctx *ctx; + static const ev_tstamp rrd_update_time = 1.0; + + ctx = (struct rspamd_abstract_worker_ctx *) worker->ctx; + rspamd_controller_load_saved_stats(worker->srv, worker->srv->cfg); + + if (worker->index == 0) { + /* Enable periodics and other stuff; cbd is static because the timers started below keep referring to it after this function returns */ + static struct rspamd_controller_periodics_cbdata cbd; + const ev_tstamp save_stats_interval = 60; /* 1 minute */ + + memset(&cbd, 0, sizeof(cbd)); + cbd.save_stats_event.data = &cbd; + cbd.worker = worker; + cbd.stat = worker->srv->stat; + + ev_timer_init(&cbd.save_stats_event, + rspamd_controller_stats_save_periodic, + save_stats_interval, save_stats_interval); + ev_timer_start(ctx->event_loop, &cbd.save_stats_event); + + rspamd_map_watch(worker->srv->cfg, ctx->event_loop, + ctx->resolver, worker, + RSPAMD_MAP_WATCH_PRIMARY_CONTROLLER); + + if (prrd != NULL) { + /* worker->index == 0 is re-checked here although the enclosing branch already guarantees it */ if (ctx->cfg->rrd_file && worker->index == 0) { + GError *rrd_err = NULL; + + *prrd = rspamd_rrd_file_default(ctx->cfg->rrd_file, &rrd_err); + + if (*prrd) { + cbd.rrd = *prrd; + rrd_timer.data = &cbd; + ev_timer_init(&rrd_timer, rspamd_controller_rrd_update, + rrd_update_time, rrd_update_time); + 
ev_timer_start(ctx->event_loop, &rrd_timer); + } + else if (rrd_err) { + msg_err("cannot load rrd from %s: %e", ctx->cfg->rrd_file, + rrd_err); + g_error_free(rrd_err); + } + else { + msg_err("cannot load rrd from %s: unknown error", + ctx->cfg->rrd_file); + } + } + else { + *prrd = NULL; + } + } + + if (!ctx->cfg->disable_monitored) { + rspamd_worker_init_monitored(worker, + ctx->event_loop, ctx->resolver); + } + } + else { + rspamd_map_watch(worker->srv->cfg, ctx->event_loop, + ctx->resolver, worker, RSPAMD_MAP_WATCH_SCANNER); + } +} +/** + * Compares the task timeout with the maximum timeout reported by the symbols + * cache. A NaN timeout is replaced by cfg->task_timeout; if that is NaN as well + * it is returned without consulting the cache. When the cache maximum exceeds + * the timeout, an info message and a list of at most 12 symbols with their + * timeouts are logged. + * @param cfg config providing task_timeout and the symbols cache + * @param timeout timeout to check, NaN to use cfg->task_timeout + * @return the effective timeout; it is currently never adjusted (see the TODO + * at the end of the function) + */ +gdouble +rspamd_worker_check_and_adjust_timeout(struct rspamd_config *cfg, gdouble timeout) +{ + if (isnan(timeout)) { + /* Use implicit timeout from cfg->task_timeout */ + timeout = cfg->task_timeout; + } + + if (isnan(timeout)) { + /* cfg->task_timeout is NaN as well: return it without consulting the cache */ return timeout; + } + + struct rspamd_symcache_timeout_result *tres = rspamd_symcache_get_max_timeout(cfg->cache); + g_assert(tres != 0); + + if (tres->max_timeout > timeout) { + msg_info_config("configured task_timeout %.2f is less than maximum symbols cache timeout %.2f; " + "some symbols can be terminated before checks", + timeout, tres->max_timeout); + GString *buf = g_string_sized_new(512); + static const int max_displayed_items = 12; + + for (int i = 0; i < MIN(tres->nitems, max_displayed_items); i++) { + /* the first entry is printed without the leading separator */ if (i == 0) { + rspamd_printf_gstring(buf, "%s(%.2f)", + rspamd_symcache_item_name((struct rspamd_symcache_item *) tres->items[i].item), + tres->items[i].timeout); + } + else { + rspamd_printf_gstring(buf, "; %s(%.2f)", + rspamd_symcache_item_name((struct rspamd_symcache_item *) tres->items[i].item), + tres->items[i].timeout); + } + } + msg_info_config("list of top %d symbols by execution time: %v", + (int) MIN(tres->nitems, max_displayed_items), + buf); + + g_string_free(buf, TRUE); + } + + rspamd_symcache_timeout_result_free(tres); + + /* TODO: maybe adjust timeout */ + return timeout; +} diff --git a/src/libserver/worker_util.h b/src/libserver/worker_util.h new file mode 100644 index 
0000000..ef48188 --- /dev/null +++ b/src/libserver/worker_util.h @@ -0,0 +1,334 @@ +/*- + * Copyright 2016 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef WORKER_UTIL_H_ +#define WORKER_UTIL_H_ + +#include "config.h" +#include "util.h" +#include "libserver/http/http_connection.h" +#include "rspamd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef HAVE_SA_SIGINFO +typedef void (*rspamd_sig_handler_t)(gint); +#else + +typedef void (*rspamd_sig_handler_t)(gint, siginfo_t *, void *); + +#endif + +struct rspamd_worker; +struct rspamd_worker_signal_handler; + +extern struct rspamd_worker *rspamd_current_worker; + +/** + * Init basic signals for a worker + * @param worker + * @param event_loop + */ +void rspamd_worker_init_signals(struct rspamd_worker *worker, struct ev_loop *event_loop); + +typedef void (*rspamd_accept_handler)(struct ev_loop *loop, ev_io *w, int revents); + +/** + * Prepare worker's startup + * @param worker worker structure + * @param name name of the worker + * @param hdl handler of accept event for listen sockets (the prototype + * takes no separate signal handler argument) + * @return event base suitable for a worker + */ +struct ev_loop * +rspamd_prepare_worker(struct rspamd_worker *worker, const char *name, + rspamd_accept_handler hdl); + +/** + * Should be used to validate the context of a worker in an assert-like invocation + * @param ctx + * @param magic + * @return + */ +gboolean 
rspamd_worker_check_context(gpointer ctx, guint64 magic); + +/** + * Set special signal handler for a worker + */ +void rspamd_worker_set_signal_handler(int signo, + struct rspamd_worker *worker, + struct ev_loop *event_loop, + rspamd_worker_signal_cb_t handler, + void *handler_data); + +/** + * Stop accepting new connections for a worker + * @param worker + */ +void rspamd_worker_stop_accept(struct rspamd_worker *worker); + +typedef gint (*rspamd_controller_func_t)( + struct rspamd_http_connection_entry *conn_ent, + struct rspamd_http_message *msg, + struct module_ctx *ctx); + +struct rspamd_custom_controller_command { + const gchar *command; + struct module_ctx *ctx; + gboolean privileged; + gboolean require_message; + rspamd_controller_func_t handler; +}; + +struct rspamd_controller_worker_ctx; +struct rspamd_lang_detector; + +struct rspamd_controller_session { + struct rspamd_controller_worker_ctx *ctx; + struct rspamd_worker *wrk; + rspamd_mempool_t *pool; + struct rspamd_task *task; + gchar *classifier; + rspamd_inet_addr_t *from_addr; + struct rspamd_config *cfg; + struct rspamd_lang_detector *lang_det; + gboolean is_spam; + gboolean is_read_only; +}; + +/** + * Send error using HTTP and JSON output + * @param entry router entry + * @param code error code + * @param error_msg error message, followed by variadic arguments (presumably a printf-style format; confirm against the implementation) + */ +void rspamd_controller_send_error(struct rspamd_http_connection_entry *entry, + gint code, const gchar *error_msg, ...); + +/** + * Send openmetrics-formatted strings using HTTP + * @param entry router entry + * @param str rspamd fstring buffer, ownership is transferred + */ +void rspamd_controller_send_openmetrics(struct rspamd_http_connection_entry *entry, + rspamd_fstring_t *str); + +/** + * Send a custom string using HTTP + * @param entry router entry + * @param str string to send + */ +void rspamd_controller_send_string(struct rspamd_http_connection_entry *entry, + const gchar *str); + +/** + * Send UCL using HTTP and JSON serialization + * @param entry router 
entry + * @param obj object to send + */ +void rspamd_controller_send_ucl(struct rspamd_http_connection_entry *entry, + ucl_object_t *obj); + +/** + * Return worker's control structure by its type + * @param cfg rspamd config + * @param type + * @return worker's control structure or NULL + */ +worker_t *rspamd_get_worker_by_type(struct rspamd_config *cfg, GQuark type); + +/** + * Block signals before termination + */ +void rspamd_worker_block_signals(void); + +/** + * Unblock signals + */ +void rspamd_worker_unblock_signals(void); + +/** + * Kill rspamd main and all workers + * @param rspamd_main + */ +void rspamd_hard_terminate(struct rspamd_main *rspamd_main) G_GNUC_NORETURN; + +/** + * Returns TRUE if a specific worker is a scanner worker + * @param w + * @return + */ +gboolean rspamd_worker_is_scanner(struct rspamd_worker *w); + +/** + * Checks the task timeout against the maximum symbols cache timeout and logs an info message when the cache timeout is larger + * @param cfg config providing task_timeout and the symbols cache + * @param timeout timeout to check; NaN selects cfg->task_timeout + * @return the effective timeout (currently it is never adjusted any further) + */ +gdouble rspamd_worker_check_and_adjust_timeout(struct rspamd_config *cfg, + gdouble timeout); + +/** + * Returns TRUE if a specific worker is a primary controller + * @param w + * @return + */ +gboolean rspamd_worker_is_primary_controller(struct rspamd_worker *w); + +/** + * Returns TRUE if a specific worker should take a role of a controller + */ +gboolean rspamd_worker_check_controller_presence(struct rspamd_worker *w); + +/** + * Creates new session cache + * @param w + * @param ev_base + * @return + */ +void *rspamd_worker_session_cache_new(struct rspamd_worker *w, + struct ev_loop *ev_base); + +/** + * Adds a new session identified by pointer + * @param cache + * @param tag + * @param pref + * @param ptr + */ +void rspamd_worker_session_cache_add(void *cache, const gchar *tag, + guint *pref, void *ptr); + +/** + * Removes session from cache + * @param cache + * @param ptr + */ +void rspamd_worker_session_cache_remove(void *cache, void *ptr); + +/** + * Fork new worker with the specified configuration + */ +struct rspamd_worker *rspamd_fork_worker(struct rspamd_main *, + struct rspamd_worker_conf *, guint idx, + 
struct ev_loop *ev_base, + rspamd_worker_term_cb term_handler, + GHashTable *listen_sockets); + +/** + * Sets crash signals handlers if compiled with libunwind + */ +RSPAMD_NO_SANITIZE void rspamd_set_crash_handler(struct rspamd_main *); + +/** + * Restore memory for crash signals + */ +RSPAMD_NO_SANITIZE void rspamd_unset_crash_handler(struct rspamd_main *); + +/** + * Initialise the main monitoring worker + * @param worker + * @param ev_base + * @param resolver + */ +void rspamd_worker_init_monitored(struct rspamd_worker *worker, + struct ev_loop *ev_base, + struct rspamd_dns_resolver *resolver); + +/** + * Performs throttling for accept events + * @param sock + * @param data struct rspamd_worker_accept_event * list + */ +void rspamd_worker_throttle_accept_events(gint sock, void *data); + +/** + * Checks (and logs) the worker's termination status. Returns TRUE if a worker + * should be restarted. + * @param rspamd_main + * @param wrk + * @param status waitpid res + * @return TRUE if refork is desired + */ +gboolean rspamd_check_termination_clause(struct rspamd_main *rspamd_main, + struct rspamd_worker *wrk, int status); + +/** + * Call for final scripts for a worker + * @param worker + * @return + */ +gboolean rspamd_worker_call_finish_handlers(struct rspamd_worker *worker); + +struct rspamd_rrd_file; +/** + * Terminate controller worker + * @param worker + * @param rrd + */ +void rspamd_controller_on_terminate(struct rspamd_worker *worker, + struct rspamd_rrd_file *rrd); + +/** + * Inits controller worker + * @param worker + * @param prrd output pointer for the RRD file; may be NULL + */ +void rspamd_worker_init_controller(struct rspamd_worker *worker, + struct rspamd_rrd_file **prrd); + +/** + * Saves stats + * @param rspamd_main + * @param cfg + */ +void rspamd_controller_store_saved_stats(struct rspamd_main *rspamd_main, + struct rspamd_config *cfg); + +#ifdef WITH_HYPERSCAN +struct rspamd_control_command; + +gboolean rspamd_worker_hyperscan_ready(struct rspamd_main *rspamd_main, + struct 
rspamd_worker *worker, gint fd, + gint attached_fd, + struct rspamd_control_command *cmd, + gpointer ud); + +#endif +/* Logging helpers: each expands to rspamd_default_log_function() with a fixed log level and reads rspamd_main->server_pool, so a variable named rspamd_main must be in scope at the call site */ +#define msg_err_main(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + rspamd_main->server_pool->tag.tagname, rspamd_main->server_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_main(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + rspamd_main->server_pool->tag.tagname, rspamd_main->server_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_notice_main(...) rspamd_default_log_function(G_LOG_LEVEL_MESSAGE, \ + rspamd_main->server_pool->tag.tagname, rspamd_main->server_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_main(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + rspamd_main->server_pool->tag.tagname, rspamd_main->server_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) + +#ifdef __cplusplus +} +#endif + +#endif /* WORKER_UTIL_H_ */ |