summaryrefslogtreecommitdiffstats
path: root/src/lua/lua_parsers.c
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 21:30:40 +0000
commit133a45c109da5310add55824db21af5239951f93 (patch)
treeba6ac4c0a950a0dda56451944315d66409923918 /src/lua/lua_parsers.c
parentInitial commit. (diff)
downloadrspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1.upstream/3.8.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/lua/lua_parsers.c')
-rw-r--r--src/lua/lua_parsers.c410
1 files changed, 410 insertions, 0 deletions
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
new file mode 100644
index 0000000..1fc71db
--- /dev/null
+++ b/src/lua/lua_parsers.c
@@ -0,0 +1,410 @@
+/*-
+ * Copyright 2020 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lua_common.h"
+#include "tokenizers/tokenizers.h"
+#include "contrib/uthash/utlist.h"
+#include "libserver/html/html.h"
+#include "libmime/email_addr.h"
+#include "libmime/content_type.h"
+#include "libmime/mime_headers.h"
+#include "libmime/smtp_parsers.h"
+#include "lua_parsers.h"
+
+/***
+ * @module rspamd_parsers
+ * This module contains Lua-C interfaces to Rspamd parsers of different kind.
+ */
+
+/***
+ * @function parsers.tokenize_text(input[, exceptions])
+ * Create tokens from a text using optional exceptions list
+ * @param {text/string} input input data
+ * @param {table} exceptions, a table of pairs containing <start_pos,length> of exceptions in the input
+ * @return {table/strings} list of strings representing words in the text, or `nil` if the input is neither a string nor a text object or the tokenizer returns no result
+ */
+
+
+/***
+ * @function parsers.parse_html(input)
+ * Parses HTML and returns the corresponding text
+ * @param {string|text} in input HTML
+ * @return {rspamd_text} processed text with no HTML tags
+ */
+
+/***
+ * @function parsers.parse_mail_address(str[, pool[, max_addrs]])
+ * Parses email address and returns a table of tables in the following format:
+ *
+ * - `raw` - the original value without any processing
+ * - `name` - name of internet address in UTF8, e.g. for `Vsevolod Stakhov <blah@foo.com>` it returns `Vsevolod Stakhov`
+ * - `addr` - address part of the address
+ * - `user` - user part (if present) of the address, e.g. `blah`
+ * - `domain` - domain part (if present), e.g. `foo.com`
+ * - `flags` - table with following keys set to true if given condition fulfilled:
+ * - [valid] - valid SMTP address in conformity with https://tools.ietf.org/html/rfc5321#section-4.1.
+ * - [ip] - domain is IPv4/IPv6 address
+ * - [braced] - angled `<blah@foo.com>` address
+ * - [quoted] - quoted user part
+ * - [empty] - empty address
+ * - [backslash] - user part contains backslash
+ * - [8bit] - contains 8bit characters
+ *
+ * @param {string} str input string
+ * @param {rspamd_mempool} pool memory pool to use (if omitted, a temporary pool is created and destroyed within the call)
+ * @param {number} max_addrs limit on the number of addresses, passed to the address parser (default: 10240)
+ * @return {table/tables} parsed list of mail addresses, or `nil` if the parser returns no address list
+ */
+
+/***
+ * @function parsers.parse_content_type(ct_string, mempool)
+ * Parses content-type string to a table:
+ * - `type`
+ * - `subtype`
+ * - `charset`
+ * - `boundary`
+ * - other attributes
+ *
+ * @param {string} ct_string content type as string
+ * @param {rspamd_mempool} mempool needed to store temporary data (e.g. task pool)
+ * @return table or nil if cannot parse content type
+ */
+
+/***
+ * @function parsers.parse_smtp_date(str[, local_tz])
+ * Converts an SMTP date string to unix timestamp
+ * @param {string} str input string
+ * @param {boolean} local_tz convert to local tz if `true`
+ * @return {number} time as unix timestamp (converted to float); on parse failure returns `nil` followed by an error message string
+ */
+
+/*
+ * Function table of the parsers module: it is registered into a fresh Lua
+ * table by lua_load_parsers(). Each entry is produced by the
+ * LUA_INTERFACE_DEF macro (defined outside this file) and the list ends with
+ * a {NULL, NULL} sentinel entry.
+ */
+static const struct luaL_reg parserslib_f[] = {
+ LUA_INTERFACE_DEF(parsers, tokenize_text),
+ LUA_INTERFACE_DEF(parsers, parse_html),
+ LUA_INTERFACE_DEF(parsers, parse_mail_address),
+ LUA_INTERFACE_DEF(parsers, parse_content_type),
+ LUA_INTERFACE_DEF(parsers, parse_smtp_date),
+
+ {NULL, NULL}};
+
+/**
+ * Lua binding for `parsers.tokenize_text(input[, exceptions])`.
+ *
+ * Argument 1 is either a Lua string or a text userdata; argument 2 is an
+ * optional table whose elements are `{start_pos, length}` pairs describing
+ * ranges to be treated as exceptions by the tokenizer.
+ *
+ * Pushes a Lua array with the original text of every token, or `nil` when
+ * the input is not usable, the ICU text wrapper cannot be opened, or the
+ * tokenizer returns no result. Raises a Lua error if an exception pair does
+ * not consist of numbers or has a negative start position.
+ *
+ * @param L Lua state
+ * @return number of values pushed on the Lua stack (always 1)
+ */
+gint lua_parsers_tokenize_text(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	const gchar *in = NULL;
+	gsize len = 0, i;
+	GList *exceptions = NULL;
+	struct rspamd_lua_text *t;
+	struct rspamd_process_exception *ex;
+	UText utxt = UTEXT_INITIALIZER;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	GArray *res;
+	rspamd_stat_token_t *w;
+
+	if (lua_type(L, 1) == LUA_TSTRING) {
+		in = luaL_checklstring(L, 1, &len);
+	}
+	else if (lua_type(L, 1) == LUA_TUSERDATA) {
+		t = lua_check_text(L, 1);
+
+		if (t) {
+			in = t->start;
+			len = t->len;
+		}
+	}
+
+	if (in == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	if (lua_gettop(L) > 1 && lua_type(L, 2) == LUA_TTABLE) {
+		lua_pushvalue(L, 2);
+		lua_pushnil(L);
+
+		while (lua_next(L, -2) != 0) {
+			if (lua_type(L, -1) == LUA_TTABLE) {
+				lua_Number npos, nlen;
+				gboolean both_numbers;
+
+				/* Stack after the two pushes: ..., pair, pair[1], pair[2] */
+				lua_rawgeti(L, -1, 1);
+				lua_rawgeti(L, -2, 2);
+				both_numbers = lua_isnumber(L, -2) && lua_isnumber(L, -1);
+				npos = lua_tonumber(L, -2);
+				nlen = lua_tonumber(L, -1);
+				lua_pop(L, 2);
+
+				/*
+				 * Validate before raising: luaL_error() does not return, so
+				 * the exceptions collected so far must be released first.
+				 * `!(npos >= 0)` also rejects NaN, whose conversion to an
+				 * integer type would be undefined.
+				 */
+				if (!both_numbers || !(npos >= 0)) {
+					g_list_free_full(exceptions, g_free);
+
+					return luaL_error(L,
+							"invalid exception: expected {start_pos, length} "
+							"with a non-negative numeric start_pos");
+				}
+
+				/*
+				 * Lengths below 1 truncate to an empty range and are skipped,
+				 * as are negative and NaN lengths.
+				 */
+				if (nlen >= 1) {
+					ex = g_malloc0(sizeof(*ex));
+					ex->pos = (gsize) npos;
+					ex->len = (gsize) nlen;
+					ex->type = RSPAMD_EXCEPTION_GENERIC;
+					exceptions = g_list_prepend(exceptions, ex);
+				}
+			}
+
+			/* Drop the value, keep the key for the next lua_next() */
+			lua_pop(L, 1);
+		}
+
+		/* Drop the copy of the exceptions table */
+		lua_pop(L, 1);
+	}
+
+	/* Elements were prepended, so restore the order given by the caller */
+	exceptions = g_list_reverse(exceptions);
+
+	utext_openUTF8(&utxt, in, len, &uc_err);
+
+	if (U_FAILURE(uc_err)) {
+		/* Do not hand an unopened UText to the tokenizer */
+		g_list_free_full(exceptions, g_free);
+		lua_pushnil(L);
+
+		return 1;
+	}
+
+	res = rspamd_tokenize_text((gchar *) in, len,
+			&utxt,
+			RSPAMD_TOKENIZE_UTF, NULL,
+			exceptions,
+			NULL, NULL, NULL);
+
+	if (res == NULL) {
+		lua_pushnil(L);
+	}
+	else {
+		lua_createtable(L, res->len, 0);
+
+		for (i = 0; i < res->len; i++) {
+			w = &g_array_index(res, rspamd_stat_token_t, i);
+			lua_pushlstring(L, w->original.begin, w->original.len);
+			lua_rawseti(L, -2, i + 1);
+		}
+
+		/*
+		 * Every token has been copied into a Lua string above. Neither a
+		 * memory pool nor a pre-existing array was passed to the tokenizer,
+		 * so the array is released here.
+		 * NOTE(review): assumes the caller owns the returned array in this
+		 * configuration - confirm against rspamd_tokenize_text().
+		 */
+		g_array_free(res, TRUE);
+	}
+
+	g_list_free_full(exceptions, g_free);
+	utext_close(&utxt);
+
+	return 1;
+}
+
+/**
+ * Lua binding for `parsers.parse_html(input)`.
+ *
+ * Accepts a Lua string or a text userdata as argument 1, runs it through
+ * rspamd_html_process_part() using a temporary memory pool and pushes the
+ * parsed content as a new text object. Pushes `nil` when no input data could
+ * be obtained from the argument.
+ *
+ * @param L Lua state
+ * @return number of values pushed on the Lua stack (always 1)
+ */
+gint lua_parsers_parse_html(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	const gchar *html = NULL;
+	gsize html_len = 0;
+	gint arg_type = lua_type(L, 1);
+
+	if (arg_type == LUA_TSTRING) {
+		html = luaL_checklstring(L, 1, &html_len);
+	}
+	else if (arg_type == LUA_TUSERDATA) {
+		struct rspamd_lua_text *txt = lua_check_text(L, 1);
+
+		if (txt != NULL) {
+			html = txt->start;
+			html_len = txt->len;
+		}
+	}
+
+	if (html == NULL) {
+		lua_pushnil(L);
+
+		return 1;
+	}
+
+	/* Scratch pool and byte buffer that live only for this call */
+	rspamd_mempool_t *tmp_pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), NULL, 0);
+	GByteArray *buf = g_byte_array_sized_new(html_len);
+
+	g_byte_array_append(buf, html, html_len);
+
+	void *parsed = rspamd_html_process_part(tmp_pool, buf);
+	rspamd_ftok_t content;
+
+	rspamd_html_get_parsed_content(parsed, &content);
+	/*
+	 * NOTE(review): the trailing TRUE presumably makes the new text object
+	 * own its data, which matters because the pool is deleted right below -
+	 * confirm against lua_new_text().
+	 */
+	lua_new_text(L, content.begin, content.len, TRUE);
+
+	g_byte_array_free(buf, TRUE);
+	rspamd_mempool_delete(tmp_pool);
+
+	return 1;
+}
+
+/**
+ * Lua binding for `parsers.parse_mail_address(str[, pool[, max_addrs]])`.
+ *
+ * Argument 1 is the string to parse. Argument 2, when it is a userdata, must
+ * be a memory pool (otherwise a Lua error "invalid arguments" is raised);
+ * when it is anything else a temporary pool is created and deleted before
+ * returning. Argument 3 is an optional integer limit (default 10240) that is
+ * forwarded to rspamd_email_address_from_mime().
+ *
+ * Pushes the address list produced by lua_push_emails_address_list(), or
+ * `nil` when the parser returns no list.
+ *
+ * @param L Lua state
+ * @return number of values pushed on the Lua stack (always 1)
+ */
+gint lua_parsers_parse_mail_address(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	gsize input_len;
+	const gchar *input = luaL_checklstring(L, 1, &input_len);
+	gint limit = luaL_optinteger(L, 3, 10240);
+	rspamd_mempool_t *mp;
+	gboolean temporary_pool;
+	GPtrArray *parsed;
+
+	if (input == NULL) {
+		lua_pushnil(L);
+
+		return 1;
+	}
+
+	/* A pool is borrowed only when the caller passed a userdata */
+	temporary_pool = (lua_type(L, 2) != LUA_TUSERDATA);
+
+	if (temporary_pool) {
+		mp = rspamd_mempool_new(rspamd_mempool_suggest_size(),
+				"lua parsers", 0);
+	}
+	else {
+		mp = rspamd_lua_check_mempool(L, 2);
+
+		if (mp == NULL) {
+			return luaL_error(L, "invalid arguments");
+		}
+	}
+
+	parsed = rspamd_email_address_from_mime(mp, input, input_len, NULL, limit);
+
+	if (parsed != NULL) {
+		lua_push_emails_address_list(L, parsed, 0);
+	}
+	else {
+		lua_pushnil(L);
+	}
+
+	/* Only a pool created by this call is released here */
+	if (temporary_pool) {
+		rspamd_mempool_delete(mp);
+	}
+
+	return 1;
+}
+
+/**
+ * Lua binding for `parsers.parse_content_type(ct_string, mempool)`.
+ *
+ * Parses argument 1 with rspamd_content_type_parse() using the memory pool
+ * given as argument 2. Raises a Lua error "invalid arguments" if either
+ * argument is missing. On success pushes a table with the non-empty fields
+ * `type`, `subtype`, `charset` and `boundary`, plus one entry per extra
+ * attribute whose value is an array of all values recorded for that
+ * attribute name. Pushes `nil` if the content type cannot be parsed.
+ *
+ * @param L Lua state
+ * @return number of values pushed on the Lua stack (always 1)
+ */
+gint lua_parsers_parse_content_type(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	gsize ct_len;
+	const gchar *ct_str = luaL_checklstring(L, 1, &ct_len);
+	rspamd_mempool_t *pool = rspamd_lua_check_mempool(L, 2);
+	struct rspamd_content_type *parsed;
+
+	if (ct_str == NULL || pool == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	parsed = rspamd_content_type_parse(ct_str, ct_len, pool);
+
+	if (parsed == NULL) {
+		lua_pushnil(L);
+
+		return 1;
+	}
+
+	/* Reserve hash slots for the four fixed keys plus every attribute */
+	lua_createtable(L, 0,
+			4 + (parsed->attrs ? g_hash_table_size(parsed->attrs) : 0));
+
+	if (parsed->type.len > 0) {
+		lua_pushlstring(L, parsed->type.begin, parsed->type.len);
+		lua_setfield(L, -2, "type");
+	}
+
+	if (parsed->subtype.len > 0) {
+		lua_pushlstring(L, parsed->subtype.begin, parsed->subtype.len);
+		lua_setfield(L, -2, "subtype");
+	}
+
+	if (parsed->charset.len > 0) {
+		lua_pushlstring(L, parsed->charset.begin, parsed->charset.len);
+		lua_setfield(L, -2, "charset");
+	}
+
+	/* The `boundary` key is filled from the `orig_boundary` field */
+	if (parsed->orig_boundary.len > 0) {
+		lua_pushlstring(L, parsed->orig_boundary.begin,
+				parsed->orig_boundary.len);
+		lua_setfield(L, -2, "boundary");
+	}
+
+	if (parsed->attrs != NULL) {
+		GHashTableIter iter;
+		gpointer key, value;
+
+		g_hash_table_iter_init(&iter, parsed->attrs);
+
+		while (g_hash_table_iter_next(&iter, &key, &value)) {
+			struct rspamd_content_type_param *head =
+					(struct rspamd_content_type_param *) value;
+			struct rspamd_content_type_param *elt;
+			guint idx = 0;
+
+			/* Key: attribute name; value: array of all linked values */
+			lua_pushlstring(L, head->name.begin, head->name.len);
+			lua_createtable(L, 1, 0);
+
+			DL_FOREACH(head, elt)
+			{
+				lua_pushlstring(L, elt->value.begin, elt->value.len);
+				lua_rawseti(L, -2, ++idx);
+			}
+
+			lua_settable(L, -3);
+		}
+	}
+
+	return 1;
+}
+
+/**
+ * Lua binding for `parsers.parse_smtp_date(str[, local_tz])`.
+ *
+ * Parses argument 1 with rspamd_parse_smtp_date(). Raises an argument error
+ * if argument 1 cannot be read as a string. On a parse error pushes `nil`
+ * and the error message. Otherwise pushes the timestamp as a number; when
+ * argument 2 is the boolean `true`, the timestamp is first broken down with
+ * rspamd_localtime() and re-assembled with mktime().
+ *
+ * @param L Lua state
+ * @return number of values pushed on the Lua stack (1 on success, 2 on a
+ * parse error)
+ */
+int lua_parsers_parse_smtp_date(lua_State *L)
+{
+	gsize date_len;
+	const gchar *date_str = lua_tolstring(L, 1, &date_len);
+	GError *parse_err = NULL;
+	time_t ts;
+
+	if (date_str == NULL) {
+		return luaL_argerror(L, 1, "invalid argument");
+	}
+
+	ts = rspamd_parse_smtp_date(date_str, date_len, &parse_err);
+
+	if (parse_err != NULL) {
+		lua_pushnil(L);
+		lua_pushstring(L, parse_err->message);
+		g_error_free(parse_err);
+
+		return 2;
+	}
+
+	if (lua_isboolean(L, 2) && lua_toboolean(L, 2)) {
+		struct tm broken_down;
+
+		rspamd_localtime(ts, &broken_down);
+#if !defined(__sun)
+		broken_down.tm_gmtoff = 0;
+#endif
+		broken_down.tm_isdst = 0;
+		ts = mktime(&broken_down);
+	}
+
+	lua_pushnumber(L, ts);
+
+	return 1;
+}
+
+/**
+ * Loader for the parsers module: builds a new table, fills it with the
+ * functions listed in `parserslib_f` and leaves it on the stack.
+ *
+ * @param L Lua state
+ * @return number of values pushed on the Lua stack (always 1)
+ */
+static gint
+lua_load_parsers(lua_State *L)
+{
+	/* Empty table with no preallocated array or hash slots */
+	lua_createtable(L, 0, 0);
+	luaL_register(L, NULL, parserslib_f);
+
+	return 1;
+}
+
+/**
+ * Hooks the parsers module into the given Lua state by passing
+ * lua_load_parsers() to rspamd_lua_add_preload() under the name
+ * "rspamd_parsers".
+ * NOTE(review): presumably this makes `require "rspamd_parsers"` build the
+ * module table on first use - confirm against rspamd_lua_add_preload().
+ *
+ * @param L Lua state
+ */
+void luaopen_parsers(lua_State *L)
+{
+ rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers);
+} \ No newline at end of file