diff options
Diffstat (limited to 'src/libserver/url.h')
-rw-r--r-- | src/libserver/url.h | 430 |
1 files changed, 430 insertions, 0 deletions
diff --git a/src/libserver/url.h b/src/libserver/url.h new file mode 100644 index 0000000..d1fb8c9 --- /dev/null +++ b/src/libserver/url.h @@ -0,0 +1,430 @@ +/* URL check functions */ +#ifndef URL_H +#define URL_H + +#include "config.h" +#include "mem_pool.h" +#include "khash.h" +#include "fstring.h" +#include "libutil/cxx/utf8_util.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rspamd_task; +struct rspamd_mime_text_part; + +enum rspamd_url_flags { + RSPAMD_URL_FLAG_PHISHED = 1u << 0u, + RSPAMD_URL_FLAG_NUMERIC = 1u << 1u, + RSPAMD_URL_FLAG_OBSCURED = 1u << 2u, + RSPAMD_URL_FLAG_REDIRECTED = 1u << 3u, + RSPAMD_URL_FLAG_HTML_DISPLAYED = 1u << 4u, + RSPAMD_URL_FLAG_FROM_TEXT = 1u << 5u, + RSPAMD_URL_FLAG_SUBJECT = 1u << 6u, + RSPAMD_URL_FLAG_HOSTENCODED = 1u << 7u, + RSPAMD_URL_FLAG_SCHEMAENCODED = 1u << 8u, + RSPAMD_URL_FLAG_PATHENCODED = 1u << 9u, + RSPAMD_URL_FLAG_QUERYENCODED = 1u << 10u, + RSPAMD_URL_FLAG_MISSINGSLASHES = 1u << 11u, + RSPAMD_URL_FLAG_IDN = 1u << 12u, + RSPAMD_URL_FLAG_HAS_PORT = 1u << 13u, + RSPAMD_URL_FLAG_HAS_USER = 1u << 14u, + RSPAMD_URL_FLAG_SCHEMALESS = 1u << 15u, + RSPAMD_URL_FLAG_UNNORMALISED = 1u << 16u, + RSPAMD_URL_FLAG_ZW_SPACES = 1u << 17u, + RSPAMD_URL_FLAG_DISPLAY_URL = 1u << 18u, + RSPAMD_URL_FLAG_IMAGE = 1u << 19u, + RSPAMD_URL_FLAG_QUERY = 1u << 20u, + RSPAMD_URL_FLAG_CONTENT = 1u << 21u, + RSPAMD_URL_FLAG_NO_TLD = 1u << 22u, + RSPAMD_URL_FLAG_TRUNCATED = 1u << 23u, + RSPAMD_URL_FLAG_REDIRECT_TARGET = 1u << 24u, + RSPAMD_URL_FLAG_INVISIBLE = 1u << 25u, + RSPAMD_URL_FLAG_SPECIAL = 1u << 26u, + +}; +#define RSPAMD_URL_MAX_FLAG_SHIFT (26u) + +struct rspamd_url_tag { + const gchar *data; + struct rspamd_url_tag *prev, *next; +}; + +struct rspamd_url_ext; +/** + * URL structure + */ +struct rspamd_url { + char *string; + char *raw; + struct rspamd_url_ext *ext; + + uint32_t flags; + + uint8_t protocol; + uint8_t protocollen; + + uint16_t hostshift; + uint16_t datashift; + uint16_t queryshift; + uint16_t fragmentshift; + uint16_t tldshift; + guint16 usershift; + guint16 userlen; + + uint16_t hostlen; + uint16_t datalen; + uint16_t querylen; + uint16_t fragmentlen; + uint16_t tldlen; + uint16_t count; + uint16_t urllen; + uint16_t rawlen; + + /* Absolute order of the URL in a message */ + uint16_t order; + /* Order of the URL in a specific part of message */ + uint16_t part_order; +}; + +/** + * Rarely used url fields + */ +struct rspamd_url_ext { + gchar *visible_part; + struct rspamd_url *linked_url; + + guint16 port; +}; + +#define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL) +#define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift) + +#define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL) +#define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift) +#define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift) + +#define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift) +#define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift) +#define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift) + +enum uri_errno { + URI_ERRNO_OK = 0, /* Parsing went well */ + URI_ERRNO_EMPTY, /* The URI string was empty */ + URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */ + URI_ERRNO_INVALID_PORT, /* Port number is bad */ + URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */ + URI_ERRNO_BAD_FORMAT, + URI_ERRNO_TLD_MISSING, + URI_ERRNO_HOST_MISSING, + URI_ERRNO_TOO_LONG, +}; + +enum rspamd_url_protocol { + PROTOCOL_FILE = 1u << 0u, + PROTOCOL_FTP = 1u << 1u, + PROTOCOL_HTTP = 1u << 2u, + PROTOCOL_HTTPS = 1u << 3u, + PROTOCOL_MAILTO = 1u << 4u, + PROTOCOL_TELEPHONE = 1u << 5u, + PROTOCOL_UNKNOWN = 1u << 7u, +}; + +enum rspamd_url_parse_flags { + RSPAMD_URL_PARSE_TEXT = 0u, + RSPAMD_URL_PARSE_HREF = (1u << 0u), + RSPAMD_URL_PARSE_CHECK = (1u << 1u), +}; + +enum rspamd_url_find_type { + RSPAMD_URL_FIND_ALL = 0, + RSPAMD_URL_FIND_STRICT, +}; + +/** + * Initialize url library + * @param cfg + */ +void rspamd_url_init(const gchar *tld_file); + +void rspamd_url_deinit(void); + +/* + * Parse urls inside text + * @param pool memory pool + * @param task task object + * @param part current text part + * @param is_html turn on html heuristic + */ +void rspamd_url_text_extract(rspamd_mempool_t *pool, + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + uint16_t *cur_order, + enum rspamd_url_find_type how); + +/* + * Parse a single url into an uri structure + * @param pool memory pool + * @param uristring text form of url + * @param uri url object, must be pre allocated + */ +enum uri_errno rspamd_url_parse(struct rspamd_url *uri, + gchar *uristring, + gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags flags); + +/* + * Try to extract url from a text + * @param pool memory pool + * @param begin begin of text + * @param len length of text + * @param start storage for start position of url found (or NULL) + * @param end storage for end position of url found (or NULL) + * @param url_str storage for url string(or NULL) + * @return TRUE if url is found in specified text + */ +gboolean rspamd_url_find(rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added); + +/* + * Return text representation of url parsing error + */ +const gchar *rspamd_url_strerror(int err); + + +/** + * Find TLD for a specified host string + * @param in input host + * @param inlen length of input + * @param out output rspamd_ftok_t with tld position + * @return TRUE if tld has been found + */ +gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out); + +typedef gboolean (*url_insert_function)(struct rspamd_url *url, + gsize start_offset, gsize end_offset, void *ud); + +/** + * Search for multiple urls in text and call `func` for each url found + * @param pool + * @param in + * @param inlen + * @param is_html + * @param func + * @param ud + */ +void rspamd_url_find_multiple(rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud); + +/** + * Search for a single url in text and call `func` for each url found + * @param pool + * @param in + * @param inlen + * @param is_html + * @param func + * @param ud + */ +void rspamd_url_find_single(rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud); + +/** + * Generic callback to insert URLs into rspamd_task + * @param url + * @param start_offset + * @param end_offset + * @param ud + */ +gboolean rspamd_url_task_subject_callback(struct rspamd_url *url, + gsize start_offset, + gsize end_offset, gpointer ud); + +/** + * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated + * @param dst + * @param src + * @param size + * @return + */ +gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size); + +/** + * Encode url if needed. In this case, memory is allocated from the specific pool. + * Returns pointer to begin and encoded length in `dlen` + * @param url + * @param pool + * @return + */ +const gchar *rspamd_url_encode(struct rspamd_url *url, gsize *dlen, + rspamd_mempool_t *pool); + + +/** + * Returns if a character is domain character + * @param c + * @return + */ +gboolean rspamd_url_is_domain(int c); + +/** + * Returns symbolic name for protocol + * @param proto + * @return + */ +const gchar *rspamd_url_protocol_name(enum rspamd_url_protocol proto); + + +/** + * Converts string to a numeric protocol + * @param str + * @return + */ +enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str); + +/** + * Converts string to a url flag + * @param str + * @param flag + * @return + */ +bool rspamd_url_flag_from_string(const gchar *str, gint *flag); + +/** + * Converts url flag to a string + * @param flag + * @return + */ +const gchar *rspamd_url_flag_to_string(int flag); + +/* Defines sets of urls indexed by url as is */ +KHASH_DECLARE(rspamd_url_hash, struct rspamd_url *, char); +KHASH_DECLARE(rspamd_url_host_hash, struct rspamd_url *, char); + +/* Convenience functions for url sets */ +/** + * Add an url to set or increase the existing url count + * @param set + * @param u + * @return true if a new url has been added + */ +bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u, + bool enforce_replace); + +/** + * Same as rspamd_url_set_add_or_increase but returns the existing url if found + * @param set + * @param u + * @return + */ +struct rspamd_url *rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set, + struct rspamd_url *u); +/** + * Helper for url host set + * @param set + * @param u + * @return + */ +bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set, + struct rspamd_url *u); +/** + * Checks if a url is in set + * @param set + * @param u + * @return + */ +bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u); + +bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u); + +/** + * Compares two urls (similar to C comparison functions) lexicographically + * @param u1 + * @param u2 + * @return + */ +int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2); + +/** + * Same but used for qsort to sort `struct rspamd_url *[]` array + * @param u1 + * @param u2 + * @return + */ +int rspamd_url_cmp_qsort(const void *u1, const void *u2); + +/** + * Returns a port for some url + * @param u + * @return + */ +static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port(struct rspamd_url *u) +{ + if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) { + return u->ext->port; + } + else { + /* Assume standard port */ + if (u->protocol == PROTOCOL_HTTPS) { + return 443; + } + else { + return 80; + } + } +} + +/** + * Returns a port for some url if it is set + * @param u + * @return + */ +static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port_if_special(struct rspamd_url *u) +{ + if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) { + return u->ext->port; + } + + return 0; +} + +/** + * Normalize unicode input and set out url flags as appropriate + * @param pool + * @param input + * @param len_out (must be &var) + * @param url_flags_out (must be just a var with no dereference) + */ +#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \ + do { \ + enum rspamd_utf8_normalise_result norm_res; \ + norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \ + if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \ + url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \ + } \ + if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { \ + url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES; \ + } \ + if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) { \ + url_flags_out |= RSPAMD_URL_FLAG_OBSCURED; \ + } \ + } while (0) +#ifdef __cplusplus +} +#endif + +#endif |