summaryrefslogtreecommitdiffstats
path: root/test/lua/unit/html.lua
diff options
context:
space:
mode:
Diffstat (limited to 'test/lua/unit/html.lua')
-rw-r--r--test/lua/unit/html.lua113
1 files changed, 113 insertions, 0 deletions
diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua
new file mode 100644
index 0000000..81c52ec
--- /dev/null
+++ b/test/lua/unit/html.lua
@@ -0,0 +1,113 @@
+context("HTML processing", function()
+ local rspamd_util = require("rspamd_util")
+ local logger = require("rspamd_logger")
+ local cases = {
+ -- Entities
+ {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
+ [[.firebaseapp.com]]},
+ {[[
+<?xml version="1.0" encoding="iso-8859-1"?>
+ <!DOCTYPE html
+ PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <title>
+ Wikibooks
+ </title>
+ </head>
+ <body>
+ <p>
+ Hello, world!
+
+ </p>
+ </body>
+ </html>]], 'Hello, world!\n'},
+ {[[
+<!DOCTYPE html>
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ <style><!--
+- -a -a -a -- --- -
+ --></head>
+ <body>
+ <!-- page content -->
+ Hello, world!
+ </body>
+</html>
+ ]], 'Hello, world!'},
+ {[[
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ </head>
+ <body>
+ <!-- page content -->
+ Hello, world!<br>test</br><br>content</hr>more content<br>
+ <div>
+ content inside div
+ </div>
+ </body>
+</html>
+ ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
+ {[[
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ </head>
+ <body>
+ <!-- tabular content -->
+ <table>
+ content
+ </table>
+ <table>
+ <tr>
+ <th>heada</th>
+ <th>headb</th>
+ </tr>
+ <tr>
+ <td>data1</td>
+ <td>data2</td>
+ </tr>
+ </table>
+
+ </body>
+</html>
+ ]], 'content\nheada headb\ndata1 data2\n'},
+ {[[
+<html lang="en">
+ <head>
+ <meta charset="utf-8">
+ <title>title</title>
+ <link rel="stylesheet" href="style.css">
+ <script src="script.js"></script>
+ </head>
+ <body>
+ <!-- escape content -->
+ a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
+ </body>
+</html>
+ ]], 'a b a > b a < b a & b \'a "a"'},
+ }
+
+ for i,c in ipairs(cases) do
+ test("Extract text from HTML " .. tostring(i), function()
+ local t = rspamd_util.parse_html(c[1])
+
+ assert_not_nil(t)
+ assert_equal(c[2], tostring(t), string.format("'%s' doesn't match with '%s'",
+ c[2], t))
+
+ end)
+ end
+end)