Adding upstream version 3.8.1. (refs: upstream/3.8.1, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 21:30:40 +0000
commit: 133a45c109da5310add55824db21af5239951f93 (patch)
tree: ba6ac4c0a950a0dda56451944315d66409923918 /test/lua/unit/html.lua
parent: Initial commit. (diff)
download: rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz
rspamd-133a45c109da5310add55824db21af5239951f93.zip
1 files changed, 113 insertions, 0 deletions
diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua
new file mode 100644
index 0000000..81c52ec
--- /dev/null
+++ b/test/lua/unit/html.lua
@@ -0,0 +1,113 @@
+context("HTML processing", function()
+  local rspamd_util = require("rspamd_util")
+  -- Each case is a pair: {HTML input, text expected from parse_html}.
+  local cases = {
+      -- Entities
+      {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
+       [[.firebaseapp.com]]},
+      {[[
+<?xml version="1.0" encoding="iso-8859-1"?>
+ <!DOCTYPE html
+   PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+   <head>
+     <title>
+       Wikibooks
+     </title>
+   </head>
+   <body>
+     <p>
+       Hello,          world!
+
+     </p>
+   </body>
+ </html>]], 'Hello, world!\n'},
+       {[[
+<!DOCTYPE html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>title</title>
+    <link rel="stylesheet" href="style.css">
+    <script src="script.js"></script>
+    <style><!--
+- -a -a -a -- --- -
+  --></head>
+  <body>
+    <!-- page content -->
+    Hello, world!
+  </body>
+</html>
+      ]], 'Hello, world!'},
+      {[[
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>title</title>
+    <link rel="stylesheet" href="style.css">
+    <script src="script.js"></script>
+  </head>
+  <body>
+    <!-- page content -->
+    Hello, world!<br>test</br><br>content</hr>more content<br>
+    <div>
+      content inside div
+    </div>
+  </body>
+</html>
+      ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
+      {[[
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>title</title>
+    <link rel="stylesheet" href="style.css">
+    <script src="script.js"></script>
+  </head>
+  <body>
+    <!-- tabular content -->
+    <table>
+      content
+    </table>
+    <table>
+      <tr>
+        <th>heada</th>
+        <th>headb</th>
+      </tr>
+      <tr>
+        <td>data1</td>
+        <td>data2</td>
+      </tr>
+    </table>
+
+  </body>
+</html>
+      ]], 'content\nheada headb\ndata1 data2\n'},
+      {[[
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>title</title>
+    <link rel="stylesheet" href="style.css">
+    <script src="script.js"></script>
+  </head>
+  <body>
+    <!-- escape content -->
+    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
+  </body>
+</html>
+      ]], 'a b a > b a < b a & b \'a "a"'},
+  }
+
+  for i, c in ipairs(cases) do
+    test("Extract text from HTML " .. tostring(i), function()
+      local t = rspamd_util.parse_html(c[1])
+      assert_not_nil(t)
+      -- Stringify once; if t is not a plain string, Lua 5.1's %s would reject it.
+      local actual = tostring(t)
+      assert_equal(c[2], actual, string.format("'%s' doesn't match with '%s'",
+          c[2], actual))
+    end)
+  end
+end)
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 21:30:40 +0000
commit	133a45c109da5310add55824db21af5239951f93 (patch)
tree	ba6ac4c0a950a0dda56451944315d66409923918 /test/lua/unit/html.lua
parent	Initial commit. (diff)
download	rspamd-133a45c109da5310add55824db21af5239951f93.tar.xz rspamd-133a45c109da5310add55824db21af5239951f93.zip