diff options
Diffstat (limited to 'tools/html2text.py')
-rwxr-xr-x | tools/html2text.py | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/tools/html2text.py b/tools/html2text.py new file mode 100755 index 00000000..da290b1b --- /dev/null +++ b/tools/html2text.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +# +# html2text.py - converts HTML to text +# +# Wireshark - Network traffic analyzer +# By Gerald Combs <gerald@wireshark.org> +# Copyright 1998 Gerald Combs +# +# SPDX-License-Identifier: GPL-2.0-or-later + +from __future__ import unicode_literals + +__author__ = "Peter Wu <peter@lekensteyn.nl>" +__copyright__ = "Copyright 2015, Peter Wu" +__license__ = "GPL (v2 or later)" + +# TODO: +# multiple list indentation levels (modify bullets?) +# maybe allow for ascii output instead of utf-8? + +import sys +from textwrap import TextWrapper +try: + from HTMLParser import HTMLParser + from htmlentitydefs import name2codepoint +except ImportError: # Python 3 + from html.parser import HTMLParser + from html.entities import name2codepoint + unichr = chr # for html entity handling + +class TextHTMLParser(HTMLParser): + """Converts a HTML document to text.""" + def __init__(self): + try: + # Python 3.4 + HTMLParser. __init__(self, convert_charrefs=True) + except Exception: + HTMLParser. __init__(self) + # All text, concatenated + self.output_buffer = '' + # The current text block which is being constructed + self.text_block = '' + # Whether the previous element was terminated with whitespace + self.need_space = False + # Whether to prevent word-wrapping the contents (for "pre" tag) + self.skip_wrap = False + # Quoting + self.need_quote = False + self.quote_stack = [] + # Suffixes + self.need_suffix = False + self.suffix_stack = [] + # track list items + self.list_item_prefix = None + self.ordered_list_index = None + self.stack_list_item_prefix = [] + self.stack_ordered_list_index = [] + self.list_indent_level = 0 + self.list_item_indent = "" + # Indentation (for heading and paragraphs) + self.indent_levels = [0, 0] + # Don't dump CSS, scripts, etc. + self.ignore_tags = ('head', 'style', 'script') + self.ignore_level = 0 + # href footnotes. + self.footnotes = [] + self.href = None + + def _wrap_text(self, text): + """Wraps text, but additionally indent list items.""" + initial_indent = indent = sum(self.indent_levels) * ' ' + if self.list_item_prefix: + initial_indent += self.list_item_prefix + indent += ' ' + kwargs = { + 'width': 72, + 'initial_indent': initial_indent, + 'subsequent_indent': indent + } + kwargs['break_on_hyphens'] = False + wrapper = TextWrapper(**kwargs) + return '\n'.join(wrapper.wrap(text)) + + def _commit_block(self, newline='\n\n'): + text = self.text_block + if text: + if not self.skip_wrap: + text = self._wrap_text(text) + self.output_buffer += text + newline + self.text_block = '' + self.need_space = False + + def handle_starttag(self, tag, attrs): + # end a block of text on <br>, but also flush list items which are not + # terminated. + if tag == 'br' or tag == 'li': + self._commit_block('\n') + if tag == 'code': + self.need_quote = True + self.quote_stack.append('`') + if tag == 'pre': + self.skip_wrap = True + if tag in ('ol', 'ul'): + self.list_indent_level += 1 + self.list_item_indent = " " * (self.list_indent_level - 1) + self.stack_ordered_list_index.append(self.ordered_list_index) + self.stack_list_item_prefix.append(self.list_item_prefix) + # Following list items are numbered. + if tag == 'ol': + self.ordered_list_index = 1 + if tag == 'ul': + self.list_item_prefix = self.list_item_indent + ' • ' + if tag == 'li' and self.ordered_list_index: + self.list_item_prefix = self.list_item_indent + ' %d. ' % (self.ordered_list_index) + self.ordered_list_index += 1 + if tag[0] == 'h' and len(tag) == 2 and \ + (tag[1] >= '1' and tag[1] <= '6'): + self.indent_levels = [int(tag[1]) - 1, 0] + if tag == 'p': + self.indent_levels[1] = 1 + if tag == 'a': + try: + href = [attr[1] for attr in attrs if attr[0] == 'href'][0] + if '://' in href: # Skip relative URLs and links. + self.href = href + except IndexError: + self.href = None + if tag == 'span': + try: + el_class = [attr[1] for attr in attrs if attr[0] == 'class'][0] + if 'menuseq' in el_class: + self.need_quote = True + self.quote_stack.append('"') + except IndexError: + pass + if tag == 'div': + try: + el_class = [attr[1] for attr in attrs if attr[0] == 'class'][0] + if 'title' in el_class.split(' '): + self.need_suffix = True + self.suffix_stack.append(':') + except IndexError: + pass + if tag in self.ignore_tags: + self.ignore_level += 1 + + def handle_data(self, data): + quote = '' + if self.need_quote: + quote = self.quote_stack[-1] + suffix = '' + if self.need_suffix: + suffix = self.suffix_stack.pop() + if self.ignore_level > 0: + return + elif self.skip_wrap: + block = data + else: + if self.href and data == self.href: + # This is a self link. Don't create a footnote. + self.href = None + + # For normal text, fold multiple whitespace and strip + # leading and trailing spaces for the whole block (but + # keep spaces in the middle). + block = quote + if data.strip() and data[:1].isspace(): + # Keep spaces in the middle + self.need_space = True + if self.need_space and data.strip() and self.text_block: + block = ' ' + quote + block += ' '.join(data.split()) + suffix + self.need_space = data[-1:].isspace() + self.text_block += block + self.need_quote = False + self.need_suffix = False + + def handle_endtag(self, tag): + block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6 tr' + #block_elements += ' dl dd dt' + if tag in block_elements.split(): + self._commit_block() + if tag in ('code', 'span'): + # XXX This span isn't guaranteed to match its opening. + self.text_block += self.quote_stack.pop() + if tag in ('ol', 'ul'): + self.list_indent_level -= 1 + self.list_item_indent = " " * (self.list_indent_level - 1) + self.ordered_list_index = self.stack_ordered_list_index.pop() + self.list_item_prefix = self.stack_list_item_prefix.pop() + if tag == 'pre': + self.skip_wrap = False + if tag == 'a' and self.href: + self.footnotes.append(self.href) + self.text_block += '[{0}]'.format(len(self.footnotes)) + if tag in self.ignore_tags: + self.ignore_level -= 1 + + def handle_charref(self, name): + self.handle_data(unichr(int(name))) + + def handle_entityref(self, name): + self.handle_data(unichr(name2codepoint[name])) + + def close(self): + HTMLParser.close(self) + self._commit_block() + + if len(self.footnotes) > 0: + self.list_item_prefix = None + self.indent_levels = [1, 0] + self.text_block = 'References' + self._commit_block() + self.indent_levels = [1, 1] + footnote_num = 1 + for href in self.footnotes: + self.text_block += '{0:>2}. {1}\n'.format(footnote_num, href) + footnote_num += 1 + self._commit_block('\n') + + + byte_output = self.output_buffer.encode('utf-8') + if hasattr(sys.stdout, 'buffer'): + sys.stdout.buffer.write(byte_output) + else: + sys.stdout.write(byte_output) + + +def main(): + htmlparser = TextHTMLParser() + if len(sys.argv) > 1 and sys.argv[1] != '-': + filename = sys.argv[1] + f = open(filename, 'rb') + else: + filename = None + f = sys.stdin + try: + if hasattr(f, 'buffer'): + # Access raw (byte) buffer in Python 3 instead of decoded one + f = f.buffer + # Read stdin as a Unicode string + htmlparser.feed(f.read().decode('utf-8')) + finally: + if filename is not None: + f.close() + htmlparser.close() + +if __name__ == '__main__': + sys.exit(main()) |