summaryrefslogtreecommitdiffstats
path: root/test/lua/unit/html.lua
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit 133a45c109da5310add55824db21af5239951f93 (patch)
tree ba6ac4c0a950a0dda56451944315d66409923918 /test/lua/unit/html.lua
parent Initial commit. (diff)
downloadrspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
Adding upstream version 3.8.1. (refs: upstream/3.8.1, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'test/lua/unit/html.lua')
-rw-r--r-- test/lua/unit/html.lua 113
1 files changed, 113 insertions, 0 deletions
diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua
new file mode 100644
index 0000000..81c52ec
--- /dev/null
+++ b/test/lua/unit/html.lua
@@ -0,0 +1,113 @@
+-- Feeds each {html_source, expected_text} case to rspamd_util.parse_html and compares the text.
+context("HTML processing", function()
+ local rspamd_util = require("rspamd_util")
+ local cases = {
+ -- Entities
+ {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
+ [[.firebaseapp.com]]},
+ {[[
+<?xml version="1.0" encoding="iso-8859-1"?>
+ <!DOCTYPE html
+ PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <title>
+ Wikibooks
+ </title>
+ </head>
+ <body>
+ <p>
+ Hello, world!
+
+ </p>
+ </body>
+ </html>]], 'Hello, world!\n'},
+ {[[
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ <style><!--
+- -a -a -a -- --- -
+ --></head>
+ <body>
+ <!-- page content -->
+ Hello, world!
+ </body>
+</html>
+ ]], 'Hello, world!'},
+ {[[
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ </head>
+ <body>
+ <!-- page content -->
+ Hello, world!<br>test</br><br>content</hr>more content<br>
+ <div>
+ content inside div
+ </div>
+ </body>
+</html>
+ ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
+ {[[
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ </head>
+ <body>
+ <!-- tabular content -->
+ <table>
+ content
+ </table>
+ <table>
+ <tr>
+ <th>heada</th>
+ <th>headb</th>
+ </tr>
+ <tr>
+ <td>data1</td>
+ <td>data2</td>
+ </tr>
+ </table>
+
+ </body>
+</html>
+ ]], 'content\nheada headb\ndata1 data2\n'},
+ {[[
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ </head>
+ <body>
+ <!-- escape content -->
+ a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
+ </body>
+</html>
+ ]], 'a b a > b a < b a & b \'a "a"'},
+ }
+
+ for i,c in ipairs(cases) do
+ test("Extract text from HTML " .. tostring(i), function()
+ local t = rspamd_util.parse_html(c[1])
+ assert_not_nil(t)
+ -- Stringify once: '%s' in string.format raises on non-string values under PUC Lua 5.1.
+ local text = tostring(t)
+ assert_equal(c[2], text, string.format("'%s' doesn't match with '%s'",
+ c[2], text))
+ end)
+ end
+end)