summaryrefslogtreecommitdiffstats
path: root/src/lua/lua_url.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/lua/lua_url.c1481
1 files changed, 1481 insertions, 0 deletions
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c
new file mode 100644
index 0000000..913469f
--- /dev/null
+++ b/src/lua/lua_url.c
@@ -0,0 +1,1481 @@
+/*
+ * Copyright 2023 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "lua_common.h"
+#include "lua_url.h"
+
+
+/***
+ * @module rspamd_url
+ * This module provides routines to handle URLs and extract URLs from text.
+ * Objects of this class are returned, for example, by `task:get_urls()` or `task:get_emails()`.
+ * You can also create `rspamd_url` from any text.
+ * @example
+local url = require "rspamd_url"
+local mpool = require "rspamd_mempool"
+
+url.init("/usr/share/rspamd/effective_tld_names.dat")
+local pool = mpool.create()
+local res = url.create(pool, 'Look at: http://user@test.example.com/test?query')
+local t = res:to_table()
+-- Content of t:
+-- url = ['http://test.example.com/test?query']
+-- host = ['test.example.com']
+-- user = ['user']
+-- path = ['test']
+-- tld = ['example.com']
+
+pool:destroy() -- res is destroyed here, so you should not use it afterwards
+
+local mistake = res:to_table() -- INVALID! as pool is destroyed
+ */
+
+/* URL methods
+ * Forward declarations for the Lua-callable handlers implemented below as
+ * lua_url_<name>. NOTE(review): LUA_FUNCTION_DEF is defined outside this
+ * file; it presumably expands to a static prototype -- confirm in lua_common.h.
+ */
+LUA_FUNCTION_DEF(url, get_length);
+LUA_FUNCTION_DEF(url, get_host);
+LUA_FUNCTION_DEF(url, get_port);
+LUA_FUNCTION_DEF(url, get_user);
+LUA_FUNCTION_DEF(url, get_path);
+LUA_FUNCTION_DEF(url, get_query);
+LUA_FUNCTION_DEF(url, get_fragment);
+LUA_FUNCTION_DEF(url, get_text);
+LUA_FUNCTION_DEF(url, tostring);
+LUA_FUNCTION_DEF(url, get_raw);
+LUA_FUNCTION_DEF(url, get_tld);
+LUA_FUNCTION_DEF(url, get_flags);
+LUA_FUNCTION_DEF(url, get_flags_num);
+LUA_FUNCTION_DEF(url, get_protocol);
+LUA_FUNCTION_DEF(url, to_table);
+LUA_FUNCTION_DEF(url, is_phished);
+LUA_FUNCTION_DEF(url, is_redirected);
+LUA_FUNCTION_DEF(url, is_obscured);
+LUA_FUNCTION_DEF(url, is_html_displayed);
+LUA_FUNCTION_DEF(url, is_subject);
+LUA_FUNCTION_DEF(url, get_phished);
+LUA_FUNCTION_DEF(url, set_redirected);
+LUA_FUNCTION_DEF(url, get_count);
+LUA_FUNCTION_DEF(url, get_visible);
+LUA_FUNCTION_DEF(url, create);
+LUA_FUNCTION_DEF(url, init);
+LUA_FUNCTION_DEF(url, all);
+LUA_FUNCTION_DEF(url, lt);
+LUA_FUNCTION_DEF(url, eq);
+LUA_FUNCTION_DEF(url, get_order);
+LUA_FUNCTION_DEF(url, get_part_order);
+
+/*
+ * Method table of the `rspamd{url}` class (registered in luaopen_url).
+ * Entries written as explicit {name, function} pairs are aliases or
+ * metamethods whose Lua name differs from the C handler name.
+ */
+static const struct luaL_reg urllib_m[] = {
+	LUA_INTERFACE_DEF(url, get_length),
+	LUA_INTERFACE_DEF(url, get_host),
+	LUA_INTERFACE_DEF(url, get_port),
+	LUA_INTERFACE_DEF(url, get_user),
+	LUA_INTERFACE_DEF(url, get_path),
+	LUA_INTERFACE_DEF(url, get_query),
+	LUA_INTERFACE_DEF(url, get_fragment),
+	LUA_INTERFACE_DEF(url, get_text),
+	LUA_INTERFACE_DEF(url, get_tld),
+	LUA_INTERFACE_DEF(url, get_raw),
+	LUA_INTERFACE_DEF(url, get_protocol),
+	LUA_INTERFACE_DEF(url, to_table),
+	LUA_INTERFACE_DEF(url, is_phished),
+	LUA_INTERFACE_DEF(url, is_redirected),
+	LUA_INTERFACE_DEF(url, is_obscured),
+	LUA_INTERFACE_DEF(url, is_html_displayed),
+	LUA_INTERFACE_DEF(url, is_subject),
+	LUA_INTERFACE_DEF(url, get_phished),
+
+	LUA_INTERFACE_DEF(url, get_visible),
+	LUA_INTERFACE_DEF(url, get_count),
+	LUA_INTERFACE_DEF(url, get_flags),
+	LUA_INTERFACE_DEF(url, get_flags_num),
+	LUA_INTERFACE_DEF(url, get_order),
+	LUA_INTERFACE_DEF(url, get_part_order),
+	/* get_redirected shares its implementation with get_phished: both return the linked url */
+	{"get_redirected", lua_url_get_phished},
+	LUA_INTERFACE_DEF(url, set_redirected),
+	/* `url:tostring()` is a documented method, so expose it by name as well as via the metamethod */
+	{"tostring", lua_url_tostring},
+	{"__tostring", lua_url_tostring},
+	{"__eq", lua_url_eq},
+	{"__lt", lua_url_lt},
+	{NULL, NULL}};
+
+static const struct luaL_reg urllib_f[] = { /* module-level functions, registered by lua_load_url */
+ LUA_INTERFACE_DEF(url, init),
+ LUA_INTERFACE_DEF(url, create),
+ LUA_INTERFACE_DEF(url, all),
+ {NULL, NULL}}; /* sentinel terminating the registration list */
+
+/*
+ * Returns the `rspamd{url}` userdata stored at stack slot `pos`.
+ * A value of any other kind is reported through luaL_argcheck.
+ */
+struct rspamd_lua_url *
+lua_check_url(lua_State *L, gint pos)
+{
+	struct rspamd_lua_url *checked = rspamd_lua_check_udata(L, pos, "rspamd{url}");
+
+	luaL_argcheck(L, checked != NULL, pos, "'url' expected");
+
+	return checked;
+}
+
+/*
+ * URL finder callback: wraps the found url into a fresh `rspamd{url}`
+ * userdata and leaves that userdata on top of the Lua stack passed in `ud`.
+ * The offsets are not used. Always returns TRUE.
+ */
+static gboolean
+lua_url_single_inserter(struct rspamd_url *url, gsize start_offset,
+						gsize end_offset, gpointer ud)
+{
+	lua_State *L = (lua_State *) ud;
+	struct rspamd_lua_url *wrapper = lua_newuserdata(L, sizeof(*wrapper));
+
+	rspamd_lua_setclass(L, "rspamd{url}", -1);
+	wrapper->url = url;
+
+	return TRUE;
+}
+
+/***
+ * @method url:get_length()
+ * Get length of the url
+ * @return {number} length of url in bytes
+ */
+static gint
+lua_url_get_length(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* No wrapper: answer nil */
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	lua_pushinteger(L, lurl->url->urllen);
+	return 1;
+}
+
+/***
+ * @method url:get_host()
+ * Get domain part of the url
+ * @return {string} domain part of URL
+ */
+static gint
+lua_url_get_host(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	struct rspamd_url *u = (lurl != NULL) ? lurl->url : NULL;
+
+	/* Push the host only when there is a url with a non-empty host part */
+	if (u != NULL && u->hostlen > 0) {
+		lua_pushlstring(L, rspamd_url_host(u), u->hostlen);
+		return 1;
+	}
+
+	lua_pushnil(L);
+	return 1;
+}
+
+/***
+ * @method url:get_port()
+ * Get port of the url
+ * @return {number} url port
+ */
+static gint
+lua_url_get_port(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	lua_Integer port;
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* A zero result is reported to Lua as nil rather than as a number */
+	port = rspamd_url_get_port_if_special(lurl->url);
+
+	if (port == 0) {
+		lua_pushnil(L);
+	}
+	else {
+		lua_pushinteger(L, port);
+	}
+
+	return 1;
+}
+
+/***
+ * @method url:get_user()
+ * Get user part of the url (e.g. username in email)
+ * @return {string} user part of URL
+ */
+static gint
+lua_url_get_user(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	const gchar *user = (lurl != NULL) ? rspamd_url_user(lurl->url) : NULL;
+
+	/* nil when there is no wrapper or the url carries no user part */
+	if (user == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	lua_pushlstring(L, user, lurl->url->userlen);
+
+	return 1;
+}
+
+/***
+ * @method url:get_path()
+ * Get path of the url
+ * @return {string} path part of URL
+ */
+static gint
+lua_url_get_path(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* Non-empty path: push it and finish */
+	if (lurl != NULL && lurl->url->datalen > 0) {
+		lua_pushlstring(L, rspamd_url_data_unsafe(lurl->url), lurl->url->datalen);
+		return 1;
+	}
+
+	lua_pushnil(L);
+
+	return 1;
+}
+
+/***
+ * @method url:get_query()
+ * Get query of the url
+ * @return {string} query part of URL
+ */
+static gint
+lua_url_get_query(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* Non-empty query: push it and finish */
+	if (lurl != NULL && lurl->url->querylen > 0) {
+		lua_pushlstring(L, rspamd_url_query_unsafe(lurl->url), lurl->url->querylen);
+		return 1;
+	}
+
+	lua_pushnil(L);
+
+	return 1;
+}
+
+/***
+ * @method url:get_fragment()
+ * Get fragment of the url
+ * @return {string} fragment part of URL
+ */
+static gint
+lua_url_get_fragment(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* Non-empty fragment: push it and finish */
+	if (lurl != NULL && lurl->url->fragmentlen > 0) {
+		lua_pushlstring(L, rspamd_url_fragment_unsafe(lurl->url), lurl->url->fragmentlen);
+		return 1;
+	}
+
+	lua_pushnil(L);
+
+	return 1;
+}
+
+/***
+ * @method url:get_text()
+ * Get full content of the url
+ * @return {string} url string
+ */
+static gint
+lua_url_get_text(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* The whole url string, length-delimited */
+	lua_pushlstring(L, lurl->url->string, lurl->url->urllen);
+
+	return 1;
+}
+
+/***
+ * @method url:tostring()
+ * Get full content of the url or user@domain in case of email
+ * @return {string} url as a string
+ */
+static gint
+lua_url_tostring(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	struct rspamd_url *u = (lurl != NULL) ? lurl->url : NULL;
+
+	if (u == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	if (u->protocol != PROTOCOL_MAILTO) {
+		/* Anything that is not an email address is rendered verbatim */
+		lua_pushlstring(L, u->string, u->urllen);
+		return 1;
+	}
+
+	/* Email address: assemble "user@host" in a temporary buffer */
+	gchar *buf = g_malloc(u->userlen + 1 + u->hostlen);
+
+	if (u->userlen) {
+		memcpy(buf, u->string + u->usershift, u->userlen);
+	}
+
+	buf[u->userlen] = '@';
+	memcpy(buf + u->userlen + 1, rspamd_url_host_unsafe(u), u->hostlen);
+
+	lua_pushlstring(L, buf, u->userlen + 1 + u->hostlen);
+	g_free(buf);
+
+	return 1;
+}
+
+/***
+ * @method url:get_raw()
+ * Get full content of the url as it was parsed (e.g. with urldecode)
+ * @return {string} url string
+ */
+static gint
+lua_url_get_raw(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* The raw buffer of the url, length-delimited */
+	lua_pushlstring(L, lurl->url->raw, lurl->url->rawlen);
+
+	return 1;
+}
+
+/***
+ * @method url:is_phished()
+ * Check whether URL is treated as phished
+ * @return {boolean} `true` if URL is phished
+ */
+static gint
+lua_url_is_phished(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Test the PHISHED bit of the url flags */
+	lua_pushboolean(L, lurl->url->flags & RSPAMD_URL_FLAG_PHISHED);
+
+	return 1;
+}
+
+/***
+ * @method url:is_redirected()
+ * Check whether URL was redirected
+ * @return {boolean} `true` if URL is redirected
+ */
+static gint
+lua_url_is_redirected(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Test the REDIRECTED bit of the url flags */
+	lua_pushboolean(L, lurl->url->flags & RSPAMD_URL_FLAG_REDIRECTED);
+
+	return 1;
+}
+
+/***
+ * @method url:is_obscured()
+ * Check whether URL is treated as obscured or obfuscated (e.g. numbers in IP address or other hacks)
+ * @return {boolean} `true` if URL is obscured
+ */
+static gint
+lua_url_is_obscured(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Test the OBSCURED bit of the url flags */
+	lua_pushboolean(L, lurl->url->flags & RSPAMD_URL_FLAG_OBSCURED);
+
+	return 1;
+}
+
+
+/***
+ * @method url:is_html_displayed()
+ * Check whether URL is just displayed in HTML (e.g. NOT a real href)
+ * @return {boolean} `true` if URL is displayed only
+ */
+static gint
+lua_url_is_html_displayed(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Test the HTML_DISPLAYED bit of the url flags */
+	lua_pushboolean(L, lurl->url->flags & RSPAMD_URL_FLAG_HTML_DISPLAYED);
+
+	return 1;
+}
+
+/***
+ * @method url:is_subject()
+ * Check whether URL is found in subject
+ * @return {boolean} `true` if URL is found in subject
+ */
+static gint
+lua_url_is_subject(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Test the SUBJECT bit of the url flags */
+	lua_pushboolean(L, lurl->url->flags & RSPAMD_URL_FLAG_SUBJECT);
+
+	return 1;
+}
+
+/***
+ * @method url:get_phished()
+ * Get another URL that pretends to be this URL (e.g. used in phishing)
+ * @return {url} phished URL
+ */
+static gint
+lua_url_get_phished(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *linked, *lurl = lua_check_url(L, 1);
+
+	/*
+	 * A linked url is returned only when the extension block holds one and
+	 * the url is flagged as phished or redirected.
+	 * XXX: in fact, this is the only possible combination of flags, so the
+	 * flags check is redundant
+	 */
+	if (lurl != NULL &&
+		lurl->url->ext != NULL &&
+		lurl->url->ext->linked_url != NULL &&
+		(lurl->url->flags & (RSPAMD_URL_FLAG_PHISHED | RSPAMD_URL_FLAG_REDIRECTED))) {
+		linked = lua_newuserdata(L, sizeof(*linked));
+		rspamd_lua_setclass(L, "rspamd{url}", -1);
+		linked->url = lurl->url->ext->linked_url;
+
+		return 1;
+	}
+
+	lua_pushnil(L);
+	return 1;
+}
+
+/***
+ * @method url:set_redirected(url, pool)
+ * Set url as redirected to another url
+ * @param {string|url} url new url that is redirecting an old one
+ * @param {pool} pool memory pool to allocate memory if needed
+ * @return {url} parsed redirected url (if needed)
+ */
+/*
+ * Lua: url:set_redirected(target, pool)
+ *
+ * Marks the url at argument 1 as redirected and links it to `target`
+ * (argument 2), which is either an url object or a string to be parsed.
+ * The mempool at argument 3 is used for parsing and for allocating the
+ * extension block when the url has none yet.
+ *
+ * Returns the linked url object, or nil when `target` is a string in which
+ * no url could be found. Raises a Lua error on invalid arguments.
+ */
+static gint
+lua_url_set_redirected(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *url = lua_check_url(L, 1), *redir;
+	rspamd_mempool_t *pool = NULL;
+
+	if (url == NULL) {
+		return luaL_error(L, "url is required as the first argument");
+	}
+
+	if (lua_type(L, 2) == LUA_TSTRING) {
+		/* Parse url */
+		if (lua_type(L, 3) != LUA_TUSERDATA) {
+			return luaL_error(L, "mempool is required as the third argument");
+		}
+
+		pool = rspamd_lua_check_mempool(L, 3);
+
+		if (pool == NULL) {
+			return luaL_error(L, "mempool is required as the third argument");
+		}
+
+		gsize len;
+		const gchar *urlstr = lua_tolstring(L, 2, &len);
+		/*
+		 * Remember the stack height: the inserter pushes a userdata only
+		 * when an url is found. Looking at the type of the top slot is not
+		 * enough, because the mempool argument is a userdata as well.
+		 */
+		const gint top = lua_gettop(L);
+
+		rspamd_url_find_single(pool, urlstr, len, RSPAMD_URL_FIND_ALL,
+							   lua_url_single_inserter, L);
+
+		if (lua_gettop(L) == top) {
+			/* URL is actually not found */
+			lua_pushnil(L);
+		}
+		else {
+			redir = lua_check_url(L, -1);
+
+			url->url->flags |= RSPAMD_URL_FLAG_REDIRECTED;
+
+			if (url->url->ext == NULL) {
+				url->url->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
+			}
+			url->url->ext->linked_url = redir->url;
+		}
+	}
+	else {
+		redir = lua_check_url(L, 2);
+
+		if (redir == NULL) {
+			return luaL_error(L, "url is required as the second argument");
+		}
+
+		pool = rspamd_lua_check_mempool(L, 3);
+
+		if (pool == NULL) {
+			return luaL_error(L, "mempool is required as the third argument");
+		}
+
+		url->url->flags |= RSPAMD_URL_FLAG_REDIRECTED;
+		if (url->url->ext == NULL) {
+			url->url->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
+		}
+		url->url->ext->linked_url = redir->url;
+
+		/* Push back on stack */
+		lua_pushvalue(L, 2);
+	}
+
+	return 1;
+}
+
+/***
+ * @method url:get_tld()
+ * Get effective second level domain part (eSLD) of the url host
+ * @return {string} effective second level domain part (eSLD) of the url host
+ */
+static gint
+lua_url_get_tld(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* Non-empty tld part: push it and finish */
+	if (lurl != NULL && lurl->url->tldlen > 0) {
+		lua_pushlstring(L, rspamd_url_tld_unsafe(lurl->url), lurl->url->tldlen);
+		return 1;
+	}
+
+	lua_pushnil(L);
+
+	return 1;
+}
+
+/***
+ * @method url:get_protocol()
+ * Get protocol name
+ * @return {string} protocol as a string
+ */
+static gint
+lua_url_get_protocol(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* An unknown protocol has no name to report */
+	if (lurl != NULL && lurl->url->protocol != PROTOCOL_UNKNOWN) {
+		lua_pushstring(L, rspamd_url_protocol_name(lurl->url->protocol));
+		return 1;
+	}
+
+	lua_pushnil(L);
+
+	return 1;
+}
+
+/***
+ * @method url:get_count()
+ * Return number of occurrences for this particular URL
+ * @return {number} number of occurrences
+ */
+static gint
+lua_url_get_count(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	struct rspamd_url *u = (lurl != NULL) ? lurl->url : NULL;
+
+	if (u == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Occurrence counter stored in the url itself */
+	lua_pushinteger(L, u->count);
+
+	return 1;
+}
+
+/***
+* @method url:get_visible()
+* Get visible part of the url with html tags stripped
+* @return {string} url string
+*/
+static gint
+lua_url_get_visible(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	/* The visible part lives in the optional extension block */
+	if (lurl != NULL && lurl->url->ext && lurl->url->ext->visible_part) {
+		lua_pushstring(L, lurl->url->ext->visible_part);
+		return 1;
+	}
+
+	lua_pushnil(L);
+
+	return 1;
+}
+
+/***
+ * @method url:to_table()
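+ * Return url as a table with the following fields (`url` and `protocol` are
+ * always present, the other fields are set only when the corresponding part
+ * of the URL is not empty):
+ *
+ * - `url`: full content
+ * - `host`: hostname part
+ * - `port`: port number (if set)
+ * - `user`: user part
+ * - `path`: path part
+ * - `query`: query part
+ * - `fragment`: fragment part
+ * - `tld`: effective second level domain (eSLD) of the host
+ * - `protocol`: url protocol
+ * @return {table} URL as a table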
+ */
+static gint
+lua_url_to_table(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	struct rspamd_url *u;
+	lua_Integer port;
+
+	if (lurl == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	u = lurl->url;
+	lua_createtable(L, 0, 12);
+
+	/* The full url text is always present */
+	lua_pushlstring(L, u->string, u->urllen);
+	lua_setfield(L, -2, "url");
+
+	/* Optional components are added only when they are not empty */
+	if (u->hostlen > 0) {
+		lua_pushlstring(L, rspamd_url_host_unsafe(u), u->hostlen);
+		lua_setfield(L, -2, "host");
+	}
+
+	port = rspamd_url_get_port_if_special(u);
+	if (port != 0) {
+		lua_pushinteger(L, port);
+		lua_setfield(L, -2, "port");
+	}
+
+	if (u->tldlen > 0) {
+		lua_pushlstring(L, rspamd_url_tld_unsafe(u), u->tldlen);
+		lua_setfield(L, -2, "tld");
+	}
+
+	if (u->userlen > 0) {
+		lua_pushlstring(L, rspamd_url_user(u), u->userlen);
+		lua_setfield(L, -2, "user");
+	}
+
+	if (u->datalen > 0) {
+		lua_pushlstring(L, rspamd_url_data_unsafe(u), u->datalen);
+		lua_setfield(L, -2, "path");
+	}
+
+	if (u->querylen > 0) {
+		lua_pushlstring(L, rspamd_url_query_unsafe(u), u->querylen);
+		lua_setfield(L, -2, "query");
+	}
+
+	if (u->fragmentlen > 0) {
+		lua_pushlstring(L, rspamd_url_fragment_unsafe(u), u->fragmentlen);
+		lua_setfield(L, -2, "fragment");
+	}
+
+	/* The protocol name is always present */
+	lua_pushstring(L, rspamd_url_protocol_name(u->protocol));
+	lua_setfield(L, -2, "protocol");
+
+	return 1;
+}
+
+static rspamd_mempool_t *static_lua_url_pool; /* fallback pool used by lua_url_create when no mempool is passed */
+
+RSPAMD_CONSTRUCTOR(rspamd_urls_static_pool_ctor)
+{ /* creates the fallback pool; presumably run at load time by RSPAMD_CONSTRUCTOR -- TODO confirm */
+ static_lua_url_pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
+ "static_lua_url", 0);
+}
+
+RSPAMD_DESTRUCTOR(rspamd_urls_static_pool_dtor)
+{ /* releases the fallback pool; presumably run at unload/exit -- TODO confirm */
+ rspamd_mempool_delete(static_lua_url_pool);
+}
+
+/***
+ * @function url.create([mempool,] str, [{flags_table}])
+ * @param {rspamd_mempool} memory pool for URL, e.g. `task:get_mempool()`; when omitted, a static module-level pool is used
+ * @param {string} text that contains URL (can also contain other stuff)
+ * @param {table} flags_table optional list of flag names to be set on the created url
+ * @return {url} new url object that exists as long as the corresponding mempool exists, or nil if no url is found
+ */
+static gint
+lua_url_create(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	rspamd_mempool_t *pool;
+	struct rspamd_lua_text *t;
+	struct rspamd_lua_url *u;
+	gint text_pos = 2, flags_pos, top;
+
+	if (lua_type(L, 1) == LUA_TUSERDATA) {
+		pool = rspamd_lua_check_mempool(L, 1);
+	}
+	else {
+		pool = static_lua_url_pool;
+
+		/*
+		 * The mempool is documented as optional, so accept the text as the
+		 * first argument. The legacy layout with a placeholder in slot 1 and
+		 * the text in slot 2 keeps working: slot 1 is used only when slot 2
+		 * cannot hold the text.
+		 */
+		if (lua_type(L, 1) == LUA_TSTRING &&
+			lua_type(L, 2) != LUA_TSTRING &&
+			lua_type(L, 2) != LUA_TUSERDATA) {
+			text_pos = 1;
+		}
+	}
+
+	t = lua_check_text_or_string(L, text_pos);
+	flags_pos = text_pos + 1;
+
+	if (pool == NULL || t == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/*
+	 * The inserter pushes a userdata only when an url is found, so compare
+	 * stack heights: the type of the top slot is ambiguous because the
+	 * arguments themselves may be userdata.
+	 */
+	top = lua_gettop(L);
+	rspamd_url_find_single(pool, t->start, t->len, RSPAMD_URL_FIND_ALL,
+						   lua_url_single_inserter, L);
+
+	if (lua_gettop(L) == top) {
+		/* URL is actually not found */
+		lua_pushnil(L);
+
+		return 1;
+	}
+
+	u = (struct rspamd_lua_url *) lua_touserdata(L, -1);
+
+	if (lua_type(L, flags_pos) == LUA_TTABLE) {
+		/* Add flags */
+		for (lua_pushnil(L); lua_next(L, flags_pos); lua_pop(L, 1)) {
+			int nmask = 0;
+			const gchar *fname = lua_tostring(L, -1);
+
+			if (fname == NULL) {
+				/* Value is neither a string nor a number: nothing to look up */
+				lua_pop(L, 1);
+				return luaL_error(L, "invalid flag: string expected");
+			}
+
+			if (rspamd_url_flag_from_string(fname, &nmask)) {
+				u->url->flags |= nmask;
+			}
+			else {
+				lua_pop(L, 1);
+				return luaL_error(L, "invalid flag: %s", fname);
+			}
+		}
+	}
+
+	/* The url userdata is on top of the stack again at this point */
+	return 1;
+}
+
+/***
+ * @function url.init(tld_file)
+ * Initialize url library if not initialized yet by Rspamd
+ * @param {string} tld_file path to effective_tld_names.dat file (public suffix list)
+ * @return nothing
+ */
+static gint
+lua_url_init(lua_State *L)
+{
+	/* luaL_checkstring raises a Lua error unless argument 1 is a string */
+	const gchar *tld_path = luaL_checkstring(L, 1);
+
+	rspamd_url_init(tld_path);
+
+	/* Nothing is returned to Lua */
+	return 0;
+}
+
+/*
+ * URL finder callback: appends the found url, wrapped as `rspamd{url}`
+ * userdata, to the table on top of the Lua stack passed in `ud`.
+ * The offsets are not used. Always returns TRUE.
+ */
+static gboolean
+lua_url_table_inserter(struct rspamd_url *url, gsize start_offset,
+					   gsize end_offset, gpointer ud)
+{
+	lua_State *L = (lua_State *) ud;
+	/* Size is taken before the userdata is pushed, while the table is on top */
+	const gint next_idx = rspamd_lua_table_size(L, -1) + 1;
+	struct rspamd_lua_url *wrapper = lua_newuserdata(L, sizeof(*wrapper));
+
+	rspamd_lua_setclass(L, "rspamd{url}", -1);
+	wrapper->url = url;
+	lua_rawseti(L, -2, next_idx);
+
+	return TRUE;
+}
+
+
+/***
+ * @function url.all(mempool, str)
+ * Extract all URLs from the string specified
+ * @param {rspamd_mempool} memory pool for URLs
+ * @param {string} text to search for URLs
+ * @return {table} array of url objects found, or nil on invalid arguments
+ */
+static gint
+lua_url_all(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	rspamd_mempool_t *pool = rspamd_lua_check_mempool(L, 1);
+	const gchar *text;
+	size_t length;
+
+	if (pool == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	text = luaL_checklstring(L, 2, &length);
+
+	if (text == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Result table stays on top of the stack while the inserter fills it */
+	lua_newtable(L);
+	rspamd_url_find_multiple(pool, text, length,
+							 RSPAMD_URL_FIND_ALL, NULL,
+							 lua_url_table_inserter, L);
+
+	return 1;
+}
+
+/***
+ * @method url:get_flags()
+ * Return flags for a specified URL as map 'flag'->true for all flags set,
+ * possible flags are:
+ *
+ * - `phished`: URL is likely phished
+ * - `numeric`: URL is numeric (e.g. IP address)
+ * - `obscured`: URL was obscured
+ * - `redirected`: URL comes from redirector
+ * - `html_displayed`: URL is used just for displaying purposes
+ * - `text`: URL comes from the text
+ * - `subject`: URL comes from the subject
+ * - `host_encoded`: URL host part is encoded
+ * - `schema_encoded`: URL schema part is encoded
+ * - `query_encoded`: URL query part is encoded
+ * - `missing_slashes`: URL has some slashes missing
+ * - `idn`: URL has international characters
+ * - `has_port`: URL has port
+ * - `has_user`: URL has user part
+ * - `schemaless`: URL has no schema
+ * - `unnormalised`: URL has some unicode unnormalities
+ * - `zw_spaces`: URL has some zero width spaces
+ * - `url_displayed`: URL has some other url-like string in visible part
+ * - `image`: URL is from src attribute of img HTML tag
+ * @return {table} URL flags
+ */
+static gint
+lua_url_get_flags(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+	enum rspamd_url_flags flags;
+
+	if (lurl == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	flags = lurl->url->flags;
+
+	lua_createtable(L, 0, 4);
+
+	/* Walk every flag bit and record the names of the bits that are set */
+	for (gint shift = 0; shift < RSPAMD_URL_MAX_FLAG_SHIFT; shift++) {
+		const guint bit = 1u << shift;
+
+		if (flags & bit) {
+			lua_pushstring(L, rspamd_url_flag_to_string(bit));
+			lua_pushboolean(L, true);
+			lua_settable(L, -3);
+		}
+	}
+
+	return 1;
+}
+
+/* Lua: url:get_flags_num() -- returns the raw flags bitmask as a number */
+static gint
+lua_url_get_flags_num(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (!lurl) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	lua_pushinteger(L, lurl->url->flags);
+
+	return 1;
+}
+
+/* Lua: url:get_order() -- returns the `order` field, or nil when it holds the (uint16_t) -1 marker */
+static gint
+lua_url_get_order(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (!lurl) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	if (lurl->url->order == (uint16_t) -1) {
+		lua_pushnil(L);
+	}
+	else {
+		lua_pushinteger(L, lurl->url->order);
+	}
+
+	return 1;
+}
+
+/* Lua: url:get_part_order() -- returns the `part_order` field, or nil when it holds the (uint16_t) -1 marker */
+static gint
+lua_url_get_part_order(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lurl = lua_check_url(L, 1);
+
+	if (!lurl) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	if (lurl->url->part_order == (uint16_t) -1) {
+		lua_pushnil(L);
+	}
+	else {
+		lua_pushinteger(L, lurl->url->part_order);
+	}
+
+	return 1;
+}
+
+/*
+ * Per-url callback: filters `value` (a struct rspamd_url) by the protocol
+ * and flag masks held in `ud` (a struct lua_tree_cb_data) and, if it passes,
+ * appends it as userdata to the table on top of the callback's Lua stack.
+ * `key` is not used.
+ */
+void lua_tree_url_callback(gpointer key, gpointer value, gpointer ud)
+{
+	struct rspamd_url *url = (struct rspamd_url *) value;
+	struct lua_tree_cb_data *cb = ud;
+	struct rspamd_lua_url *wrapper;
+
+	/* Protocol filter: every protocol bit of the url must be allowed */
+	if ((url->protocol & cb->protocols_mask) != url->protocol) {
+		return;
+	}
+
+	/* Handle different flags application logic */
+	switch (cb->flags_mode) {
+	case url_flags_mode_include_any:
+		/* Reject urls carrying any flag outside of the mask */
+		if (url->flags != (url->flags & cb->flags_mask)) {
+			return;
+		}
+		break;
+	case url_flags_mode_include_explicit:
+		/* Reject urls lacking any flag of the mask */
+		if ((url->flags & cb->flags_mask) != cb->flags_mask) {
+			return;
+		}
+		break;
+	case url_flags_mode_exclude_include:
+		/* Reject on any excluded flag, or when no included flag is set */
+		if ((url->flags & cb->flags_exclude_mask) != 0 ||
+			(url->flags & cb->flags_mask) == 0) {
+			return;
+		}
+		break;
+	}
+
+	/* Probabilistic thinning, driven by the seed stored in the callback data */
+	if (cb->skip_prob > 0) {
+		gdouble coin = rspamd_random_double_fast_seed(&cb->random_seed);
+
+		if (coin < cb->skip_prob) {
+			return;
+		}
+	}
+
+	wrapper = lua_newuserdata(cb->L, sizeof(*wrapper));
+	lua_pushvalue(cb->L, cb->metatable_pos);
+	lua_setmetatable(cb->L, -2);
+	wrapper->url = url;
+	lua_rawseti(cb->L, -2, cb->i++);
+}
+
+/*
+ * Fills `cbd` from the Lua argument at stack index `pos`, which may be:
+ *  - a boolean: default protocols, plus mailto when the boolean is true;
+ *  - a table: either an options table (`flags`, `flags_mode`, `protocols`,
+ *    `emails`, `images`, `content`, `max_urls`, `sort`) or a plain list of
+ *    protocol names;
+ *  - a string: protocol names separated by ',' or ';';
+ *  - nil / absent: default protocols and flags.
+ * A boolean at `pos + 1` switches the image flag on or off.
+ *
+ * On success the `rspamd{url}` metatable is left on the Lua stack, its
+ * index is stored in `cbd->metatable_pos` (to be removed later with
+ * lua_url_cbdata_dtor) and TRUE is returned. On failure FALSE is returned
+ * and `cbd->metatable_pos` stays at -1, so lua_url_cbdata_dtor is a no-op.
+ *
+ * NOTE(review): the early `return FALSE` paths inside the lua_next loops
+ * leave intermediate values on the Lua stack.
+ */
+gboolean
+lua_url_cbdata_fill(lua_State *L,
+					gint pos,
+					struct lua_tree_cb_data *cbd,
+					guint default_protocols,
+					guint default_flags,
+					gsize max_urls)
+{
+	gint protocols_mask = 0;
+
+	gint pos_arg_type = lua_type(L, pos);
+	guint flags_mask = default_flags;
+	gboolean seen_flags = FALSE, seen_protocols = FALSE;
+
+	memset(cbd, 0, sizeof(*cbd));
+	cbd->flags_mode = url_flags_mode_include_any;
+	/* Nothing pushed yet: keep the sentinel checked by lua_url_cbdata_dtor */
+	cbd->metatable_pos = -1;
+
+	if (pos_arg_type == LUA_TBOOLEAN) {
+		protocols_mask = default_protocols;
+		/* Read the very argument whose type was inspected above */
+		if (lua_toboolean(L, pos)) {
+			protocols_mask |= PROTOCOL_MAILTO;
+		}
+	}
+	else if (pos_arg_type == LUA_TTABLE) {
+		/*
+		 * NOTE(review): the (1, pos) argument order looks swapped if
+		 * rspamd_lua_geti follows lua_geti(L, index, i); the helper is not
+		 * visible here, so the call is left as is -- confirm.
+		 */
+		if (rspamd_lua_geti(L, 1, pos) == LUA_TNIL) {
+			/* New method: indexed table */
+
+			lua_getfield(L, pos, "flags");
+			if (lua_istable(L, -1)) {
+				gint top = lua_gettop(L);
+
+				lua_getfield(L, pos, "flags_mode");
+				if (lua_isstring(L, -1)) {
+					const gchar *mode_str = lua_tostring(L, -1);
+
+					if (strcmp(mode_str, "explicit") == 0) {
+						cbd->flags_mode = url_flags_mode_include_explicit;
+						/*
+						 * Ignore default flags in this mode and include
+						 * merely flags specified by a caller
+						 */
+						flags_mask = 0;
+					}
+				}
+				lua_pop(L, 1);
+
+				for (lua_pushnil(L); lua_next(L, top); lua_pop(L, 1)) {
+					int nmask = 0;
+
+
+					if (lua_type(L, -1) == LUA_TSTRING) {
+						const gchar *fname = lua_tostring(L, -1);
+
+
+						if (rspamd_url_flag_from_string(fname, &nmask)) {
+							flags_mask |= nmask;
+						}
+						else {
+							msg_info("bad url flag: %s", fname);
+							return FALSE;
+						}
+					}
+					else {
+						flags_mask |= lua_tointeger(L, -1);
+					}
+				}
+
+				seen_flags = TRUE;
+			}
+			else {
+				flags_mask |= default_flags;
+			}
+			lua_pop(L, 1);
+
+			lua_getfield(L, pos, "protocols");
+			if (lua_istable(L, -1)) {
+				gint top = lua_gettop(L);
+
+				for (lua_pushnil(L); lua_next(L, top); lua_pop(L, 1)) {
+					int nmask;
+					const gchar *pname = lua_tostring(L, -1);
+
+					nmask = rspamd_url_protocol_from_string(pname);
+
+					if (nmask != PROTOCOL_UNKNOWN) {
+						protocols_mask |= nmask;
+					}
+					else {
+						msg_info("bad url protocol: %s", pname);
+						return FALSE;
+					}
+				}
+				seen_protocols = TRUE;
+			}
+			else {
+				protocols_mask = default_protocols;
+			}
+			lua_pop(L, 1);
+
+			/* The convenience switches apply only without explicit lists */
+			if (!seen_protocols) {
+				lua_getfield(L, pos, "emails");
+				if (lua_isboolean(L, -1)) {
+					if (lua_toboolean(L, -1)) {
+						protocols_mask |= PROTOCOL_MAILTO;
+					}
+				}
+				lua_pop(L, 1);
+			}
+
+			if (!seen_flags) {
+				lua_getfield(L, pos, "images");
+				if (lua_isboolean(L, -1)) {
+					if (lua_toboolean(L, -1)) {
+						flags_mask |= RSPAMD_URL_FLAG_IMAGE;
+					}
+					else {
+						flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+					}
+				}
+				else {
+					flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+				}
+				lua_pop(L, 1);
+			}
+
+			if (!seen_flags) {
+				lua_getfield(L, pos, "content");
+				if (lua_isboolean(L, -1)) {
+					if (lua_toboolean(L, -1)) {
+						flags_mask |= RSPAMD_URL_FLAG_CONTENT;
+					}
+					else {
+						flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
+					}
+				}
+				else {
+					flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
+				}
+				lua_pop(L, 1);
+			}
+
+			lua_getfield(L, pos, "max_urls");
+			if (lua_isnumber(L, -1)) {
+				max_urls = lua_tonumber(L, -1);
+			}
+			lua_pop(L, 1);
+
+			lua_getfield(L, pos, "sort");
+			if (lua_isboolean(L, -1)) {
+				/* Honour the value: `sort = false` must not enable sorting */
+				cbd->sort = lua_toboolean(L, -1) ? TRUE : FALSE;
+			}
+			lua_pop(L, 1);
+		}
+		else {
+			/* Plain table of the protocols */
+			for (lua_pushnil(L); lua_next(L, pos); lua_pop(L, 1)) {
+				int nmask;
+				const gchar *pname = lua_tostring(L, -1);
+
+				nmask = rspamd_url_protocol_from_string(pname);
+
+				if (nmask != PROTOCOL_UNKNOWN) {
+					protocols_mask |= nmask;
+				}
+				else {
+					msg_info("bad url protocol: %s", pname);
+					return FALSE;
+				}
+			}
+		}
+
+		lua_pop(L, 1); /* After rspamd_lua_geti */
+	}
+	else if (pos_arg_type == LUA_TSTRING) {
+		const gchar *plist = lua_tostring(L, pos);
+		gchar **strvec;
+		gchar *const *cvec;
+
+		strvec = g_strsplit_set(plist, ",;", -1);
+		cvec = strvec;
+
+		while (*cvec) {
+			int nmask;
+
+			nmask = rspamd_url_protocol_from_string(*cvec);
+
+			if (nmask != PROTOCOL_UNKNOWN) {
+				protocols_mask |= nmask;
+			}
+			else {
+				msg_info("bad url protocol: %s", *cvec);
+				g_strfreev(strvec);
+
+				return FALSE;
+			}
+
+			cvec++;
+		}
+
+		g_strfreev(strvec);
+	}
+	else if (pos_arg_type == LUA_TNONE || pos_arg_type == LUA_TNIL) {
+		protocols_mask = default_protocols;
+		flags_mask = default_flags;
+	}
+	else {
+		return FALSE;
+	}
+
+	if (lua_type(L, pos + 1) == LUA_TBOOLEAN) {
+		if (lua_toboolean(L, pos + 1)) {
+			flags_mask |= RSPAMD_URL_FLAG_IMAGE;
+		}
+		else {
+			flags_mask &= ~RSPAMD_URL_FLAG_IMAGE;
+		}
+	}
+
+	cbd->i = 1;
+	cbd->L = L;
+	cbd->max_urls = max_urls;
+	cbd->protocols_mask = protocols_mask;
+	cbd->flags_mask = flags_mask;
+
+	/* This needs to be removed from the stack */
+	rspamd_lua_class_metatable(L, "rspamd{url}");
+	cbd->metatable_pos = lua_gettop(L);
+	(void) lua_checkstack(L, cbd->metatable_pos + 4);
+
+	return TRUE;
+}
+
+gboolean
+lua_url_cbdata_fill_exclude_include(lua_State *L,
+ gint pos,
+ struct lua_tree_cb_data *cbd,
+ guint default_protocols,
+ gsize max_urls)
+{ /* Fills cbd from three Lua arguments: include flags at pos, exclude flags at pos + 1, protocols at pos + 2.
+ * Returns TRUE on success (leaving the rspamd{url} metatable on the Lua stack) and FALSE on a bad argument.
+ * NOTE(review): the `return FALSE` paths inside the lua_next loops leave the current key/value on the stack. */
+ guint protocols_mask = default_protocols;
+ guint include_flags_mask, exclude_flags_mask;
+
+ gint pos_arg_type = lua_type(L, pos);
+
+ memset(cbd, 0, sizeof(*cbd));
+ cbd->flags_mode = url_flags_mode_exclude_include;
+
+ /* Include flags */
+ if (pos_arg_type == LUA_TTABLE) {
+ include_flags_mask = 0; /* Reset to no flags */
+
+ for (lua_pushnil(L); lua_next(L, pos); lua_pop(L, 1)) {
+ int nmask = 0;
+
+ if (lua_type(L, -1) == LUA_TSTRING) { /* string values are flag names */
+ const gchar *fname = lua_tostring(L, -1);
+
+ if (rspamd_url_flag_from_string(fname, &nmask)) {
+ include_flags_mask |= nmask;
+ }
+ else {
+ msg_info("bad url include flag: %s", fname);
+ return FALSE;
+ }
+ }
+ else { /* any other value is OR-ed in as an integer bitmask */
+ include_flags_mask |= lua_tointeger(L, -1);
+ }
+ }
+ }
+ else if (pos_arg_type == LUA_TNIL || pos_arg_type == LUA_TNONE) {
+ /* Include all flags */
+ include_flags_mask = ~0U;
+ }
+ else {
+ msg_info("bad arguments: wrong include mask");
+ return FALSE;
+ }
+
+ /* Exclude flags */
+ pos_arg_type = lua_type(L, pos + 1);
+ if (pos_arg_type == LUA_TTABLE) {
+ exclude_flags_mask = 0; /* Reset to no flags */
+
+ for (lua_pushnil(L); lua_next(L, pos + 1); lua_pop(L, 1)) {
+ int nmask = 0;
+
+ if (lua_type(L, -1) == LUA_TSTRING) { /* string values are flag names */
+ const gchar *fname = lua_tostring(L, -1);
+
+ if (rspamd_url_flag_from_string(fname, &nmask)) {
+ exclude_flags_mask |= nmask;
+ }
+ else {
+ msg_info("bad url exclude flag: %s", fname);
+ return FALSE;
+ }
+ }
+ else { /* any other value is OR-ed in as an integer bitmask */
+ exclude_flags_mask |= lua_tointeger(L, -1);
+ }
+ }
+ }
+ else if (pos_arg_type == LUA_TNIL || pos_arg_type == LUA_TNONE) {
+ /* Empty all exclude flags */
+ exclude_flags_mask = 0U;
+ }
+ else {
+ msg_info("bad arguments: wrong exclude mask");
+ return FALSE;
+ }
+
+ if (lua_type(L, pos + 2) == LUA_TTABLE) {
+ protocols_mask = 0U; /* Reset all protocols */
+
+ for (lua_pushnil(L); lua_next(L, pos + 2); lua_pop(L, 1)) {
+ int nmask;
+ const gchar *pname = lua_tostring(L, -1); /* NOTE(review): NULL for values that are neither string nor number -- confirm the callee tolerates it */
+
+ nmask = rspamd_url_protocol_from_string(pname);
+
+ if (nmask != PROTOCOL_UNKNOWN) {
+ protocols_mask |= nmask;
+ }
+ else {
+ msg_info("bad url protocol: %s", pname);
+ return FALSE;
+ }
+ }
+ }
+ else { /* any non-table value keeps the caller supplied defaults */
+ protocols_mask = default_protocols;
+ }
+
+ cbd->i = 1; /* index used by lua_tree_url_callback for the first inserted url */
+ cbd->L = L;
+ cbd->max_urls = max_urls;
+ cbd->protocols_mask = protocols_mask;
+ cbd->flags_mask = include_flags_mask;
+ cbd->flags_exclude_mask = exclude_flags_mask;
+
+ /* This needs to be removed from the stack: lua_url_cbdata_dtor removes the slot recorded in metatable_pos */
+ rspamd_lua_class_metatable(L, "rspamd{url}");
+ cbd->metatable_pos = lua_gettop(L);
+ (void) lua_checkstack(L, cbd->metatable_pos + 4);
+
+ return TRUE;
+}
+
+
+/* Removes the metatable slot recorded in `cbd` from the Lua stack, unless the slot is marked as -1 */
+void lua_url_cbdata_dtor(struct lua_tree_cb_data *cbd)
+{
+	if (cbd->metatable_pos == -1) {
+		return;
+	}
+
+	lua_remove(cbd->L, cbd->metatable_pos);
+}
+
+/*
+ * When a limit is set in `cb->max_urls` and `sz` exceeds it, configures the
+ * skip probability and the random seed in `cb` and returns the limit;
+ * otherwise returns `sz` unchanged and leaves `cb` untouched.
+ */
+gsize lua_url_adjust_skip_prob(float timestamp,
+							   guchar digest[16],
+							   struct lua_tree_cb_data *cb,
+							   gsize sz)
+{
+	unsigned char *seed_bytes;
+
+	if (!(cb->max_urls > 0 && sz > cb->max_urls)) {
+		return sz;
+	}
+
+	cb->skip_prob = 1.0 - ((gdouble) cb->max_urls) / (gdouble) sz;
+	/*
+	 * Use task dependent probabilistic seed to ensure that
+	 * consequent task:get_urls return the same list of urls
+	 * We use both digest and timestamp here to avoid attack surface
+	 * based just on digest.
+	 */
+	seed_bytes = (unsigned char *) &cb->random_seed;
+	memcpy(seed_bytes, digest, 4);
+	memcpy(seed_bytes + 4, &timestamp, 4);
+
+	return cb->max_urls;
+}
+
+/* `__eq` metamethod: true when rspamd_url_cmp reports both urls as equal */
+static gint
+lua_url_eq(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *lhs = lua_check_url(L, 1),
+						  *rhs = lua_check_url(L, 2);
+	gboolean same = FALSE;
+
+	if (lhs && rhs) {
+		same = (rspamd_url_cmp(lhs->url, rhs->url) == 0);
+	}
+
+	lua_pushboolean(L, same);
+
+	return 1;
+}
+
+/*
+ * `__lt` metamethod: true when the first url sorts strictly before the
+ * second one according to rspamd_url_cmp.
+ *
+ * Lua converts the result of `__lt` to a boolean, and every number
+ * (including 0 and negative values) is truthy, so the comparison result
+ * must be pushed as a boolean rather than as the raw integer.
+ */
+static gint
+lua_url_lt(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_url *u1 = lua_check_url(L, 1),
+						  *u2 = lua_check_url(L, 2);
+
+	if (u1 && u2) {
+		lua_pushboolean(L, rspamd_url_cmp(u1->url, u2->url) < 0);
+	}
+	else {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	return 1;
+}
+
+/*
+ * Loader registered for `require "rspamd_url"`: builds the module table
+ * holding the functions of urllib_f and a `flags` sub-table that maps each
+ * flag name to its numeric bit value.
+ */
+static gint
+lua_load_url(lua_State *L)
+{
+	int shift = 0;
+
+	lua_newtable(L);
+	luaL_register(L, NULL, urllib_f);
+
+	/* Push flags */
+	lua_createtable(L, 0, RSPAMD_URL_MAX_FLAG_SHIFT);
+
+	while (shift < RSPAMD_URL_MAX_FLAG_SHIFT) {
+		guint flag = 1u << shift;
+
+		lua_pushinteger(L, flag);
+		lua_setfield(L, -2, rspamd_url_flag_to_string(flag));
+		shift++;
+	}
+
+	lua_setfield(L, -2, "flags");
+
+	return 1;
+}
+
+void luaopen_url(lua_State *L)
+{
+ rspamd_lua_new_class(L, "rspamd{url}", urllib_m); /* register the url class with the method table urllib_m */
+ lua_pop(L, 1); /* presumably drops the value left on the stack by rspamd_lua_new_class -- TODO confirm */
+
+ rspamd_lua_add_preload(L, "rspamd_url", lua_load_url); /* makes `require "rspamd_url"` resolve through lua_load_url */
+}