Adding upstream version 4.2.2.upstream/4.2.2

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 20:34:10 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-10 20:34:10 +0000
commit: e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc (patch)
tree: 68cb5ef9081156392f1dd62a00c6ccc1451b93df /tools/html2text.py
parent: Initial commit. (diff)
download: wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.tar.xz
wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.zip
1 files changed, 249 insertions, 0 deletions
diff --git a/tools/html2text.py b/tools/html2text.py
new file mode 100755
index 0000000..da290b1
--- /dev/null
+++ b/tools/html2text.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+#
+# html2text.py - converts HTML to text
+#
+# Wireshark - Network traffic analyzer
+# By Gerald Combs <gerald@wireshark.org>
+# Copyright 1998 Gerald Combs
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+from __future__ import unicode_literals
+
+__author__      = "Peter Wu <peter@lekensteyn.nl>"
+__copyright__   = "Copyright 2015, Peter Wu"
+__license__     = "GPL (v2 or later)"
+
+# TODO:
+#   multiple list indentation levels (modify bullets?)
+#   maybe allow for ascii output instead of utf-8?
+
+import sys
+from textwrap import TextWrapper
+try:
+    from HTMLParser import HTMLParser
+    from htmlentitydefs import name2codepoint
+except ImportError: # Python 3
+    from html.parser import HTMLParser
+    from html.entities import name2codepoint
+    unichr = chr # for html entity handling
+
+class TextHTMLParser(HTMLParser):
+    """Converts a HTML document to text."""
+    def __init__(self):
+        try:
+            # Python 3.4
+            HTMLParser. __init__(self, convert_charrefs=True)
+        except Exception:
+            HTMLParser. __init__(self)
+        # All text, concatenated
+        self.output_buffer = ''
+        # The current text block which is being constructed
+        self.text_block = ''
+        # Whether the previous element was terminated with whitespace
+        self.need_space = False
+        # Whether to prevent word-wrapping the contents (for "pre" tag)
+        self.skip_wrap = False
+        # Quoting
+        self.need_quote = False
+        self.quote_stack = []
+        # Suffixes
+        self.need_suffix = False
+        self.suffix_stack = []
+        # track list items
+        self.list_item_prefix = None
+        self.ordered_list_index = None
+        self.stack_list_item_prefix = []
+        self.stack_ordered_list_index = []
+        self.list_indent_level = 0
+        self.list_item_indent = ""
+        # Indentation (for heading and paragraphs)
+        self.indent_levels = [0, 0]
+        # Don't dump CSS, scripts, etc.
+        self.ignore_tags = ('head', 'style', 'script')
+        self.ignore_level = 0
+        # href footnotes.
+        self.footnotes = []
+        self.href = None
+
+    def _wrap_text(self, text):
+        """Wraps text, but additionally indent list items."""
+        initial_indent = indent = sum(self.indent_levels) * ' '
+        if self.list_item_prefix:
+            initial_indent += self.list_item_prefix
+            indent += '    '
+        kwargs = {
+            'width': 72,
+            'initial_indent': initial_indent,
+            'subsequent_indent': indent
+        }
+        kwargs['break_on_hyphens'] = False
+        wrapper = TextWrapper(**kwargs)
+        return '\n'.join(wrapper.wrap(text))
+
+    def _commit_block(self, newline='\n\n'):
+        text = self.text_block
+        if text:
+            if not self.skip_wrap:
+                text = self._wrap_text(text)
+            self.output_buffer += text + newline
+            self.text_block = ''
+        self.need_space = False
+
+    def handle_starttag(self, tag, attrs):
+        # end a block of text on <br>, but also flush list items which are not
+        # terminated.
+        if tag == 'br' or tag == 'li':
+            self._commit_block('\n')
+        if tag == 'code':
+            self.need_quote = True
+            self.quote_stack.append('`')
+        if tag == 'pre':
+            self.skip_wrap = True
+        if tag in ('ol', 'ul'):
+            self.list_indent_level += 1
+            self.list_item_indent = "   " * (self.list_indent_level - 1)
+            self.stack_ordered_list_index.append(self.ordered_list_index)
+            self.stack_list_item_prefix.append(self.list_item_prefix)
+        # Following list items are numbered.
+        if tag == 'ol':
+            self.ordered_list_index = 1
+        if tag == 'ul':
+            self.list_item_prefix = self.list_item_indent + '  • '
+        if tag == 'li' and self.ordered_list_index:
+            self.list_item_prefix =  self.list_item_indent + ' %d. ' % (self.ordered_list_index)
+            self.ordered_list_index += 1
+        if tag[0] == 'h' and len(tag) == 2 and \
+            (tag[1] >= '1' and tag[1] <= '6'):
+            self.indent_levels = [int(tag[1]) - 1, 0]
+        if tag == 'p':
+            self.indent_levels[1] = 1
+        if tag == 'a':
+            try:
+                href = [attr[1] for attr in attrs if attr[0] == 'href'][0]
+                if '://' in href: # Skip relative URLs and links.
+                    self.href = href
+            except IndexError:
+                self.href = None
+        if tag == 'span':
+            try:
+                el_class = [attr[1] for attr in attrs if attr[0] == 'class'][0]
+                if 'menuseq' in el_class:
+                    self.need_quote = True
+                    self.quote_stack.append('"')
+            except IndexError:
+                pass
+        if tag == 'div':
+            try:
+                el_class = [attr[1] for attr in attrs if attr[0] == 'class'][0]
+                if 'title' in el_class.split(' '):
+                    self.need_suffix = True
+                    self.suffix_stack.append(':')
+            except IndexError:
+                pass
+        if tag in self.ignore_tags:
+            self.ignore_level += 1
+
+    def handle_data(self, data):
+        quote = ''
+        if self.need_quote:
+            quote = self.quote_stack[-1]
+        suffix = ''
+        if self.need_suffix:
+            suffix = self.suffix_stack.pop()
+        if self.ignore_level > 0:
+            return
+        elif self.skip_wrap:
+            block = data
+        else:
+            if self.href and data == self.href:
+                # This is a self link. Don't create a footnote.
+                self.href = None
+
+            # For normal text, fold multiple whitespace and strip
+            # leading and trailing spaces for the whole block (but
+            # keep spaces in the middle).
+            block = quote
+            if data.strip() and data[:1].isspace():
+                # Keep spaces in the middle
+                self.need_space = True
+            if self.need_space and data.strip() and self.text_block:
+                block = ' ' + quote
+            block += ' '.join(data.split()) + suffix
+            self.need_space = data[-1:].isspace()
+        self.text_block += block
+        self.need_quote = False
+        self.need_suffix = False
+
+    def handle_endtag(self, tag):
+        block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6 tr'
+        #block_elements += ' dl dd dt'
+        if tag in block_elements.split():
+            self._commit_block()
+        if tag in ('code', 'span'):
+            # XXX This span isn't guaranteed to match its opening.
+            self.text_block += self.quote_stack.pop()
+        if tag in ('ol', 'ul'):
+            self.list_indent_level -= 1
+            self.list_item_indent = "   " * (self.list_indent_level - 1)
+            self.ordered_list_index = self.stack_ordered_list_index.pop()
+            self.list_item_prefix = self.stack_list_item_prefix.pop()
+        if tag == 'pre':
+            self.skip_wrap = False
+        if tag == 'a' and self.href:
+            self.footnotes.append(self.href)
+            self.text_block += '[{0}]'.format(len(self.footnotes))
+        if tag in self.ignore_tags:
+            self.ignore_level -= 1
+
+    def handle_charref(self, name):
+        self.handle_data(unichr(int(name)))
+
+    def handle_entityref(self, name):
+        self.handle_data(unichr(name2codepoint[name]))
+
+    def close(self):
+        HTMLParser.close(self)
+        self._commit_block()
+
+        if len(self.footnotes) > 0:
+            self.list_item_prefix = None
+            self.indent_levels = [1, 0]
+            self.text_block = 'References'
+            self._commit_block()
+            self.indent_levels = [1, 1]
+            footnote_num = 1
+            for href in self.footnotes:
+                self.text_block += '{0:>2}. {1}\n'.format(footnote_num, href)
+                footnote_num += 1
+                self._commit_block('\n')
+
+
+        byte_output = self.output_buffer.encode('utf-8')
+        if hasattr(sys.stdout, 'buffer'):
+            sys.stdout.buffer.write(byte_output)
+        else:
+            sys.stdout.write(byte_output)
+
+
+def main():
+    htmlparser = TextHTMLParser()
+    if len(sys.argv) > 1 and sys.argv[1] != '-':
+        filename = sys.argv[1]
+        f = open(filename, 'rb')
+    else:
+        filename = None
+        f = sys.stdin
+    try:
+        if hasattr(f, 'buffer'):
+            # Access raw (byte) buffer in Python 3 instead of decoded one
+            f = f.buffer
+        # Read stdin as a Unicode string
+        htmlparser.feed(f.read().decode('utf-8'))
+    finally:
+        if filename is not None:
+            f.close()
+    htmlparser.close()
+
+if __name__ == '__main__':
+    sys.exit(main())
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 20:34:10 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-10 20:34:10 +0000
commit	e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc (patch)
tree	68cb5ef9081156392f1dd62a00c6ccc1451b93df /tools/html2text.py
parent	Initial commit. (diff)
download	wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.tar.xz wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.zip