#!/usr/bin/env python3
#
# html2text.py - converts HTML to text
#
# Wireshark - Network traffic analyzer
# By Gerald Combs
# Copyright 1998 Gerald Combs
#
# SPDX-License-Identifier: GPL-2.0-or-later
from __future__ import unicode_literals
__author__ = "Peter Wu "
__copyright__ = "Copyright 2015, Peter Wu"
__license__ = "GPL (v2 or later)"
# TODO:
# multiple list indentation levels (modify bullets?)
# maybe allow for ascii output instead of utf-8?
import sys
from textwrap import TextWrapper
try:
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
except ImportError: # Python 3
from html.parser import HTMLParser
from html.entities import name2codepoint
unichr = chr # for html entity handling
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Feed the document with feed() and finish with close(). close() appends
    a "References" section listing the absolute link targets that were
    collected as footnotes and writes the UTF-8 encoded result to standard
    output. The text committed so far is kept in ``output_buffer``.
    """

    # End tags that terminate the current block of text.
    # ('dl', 'dd' and 'dt' are currently not treated as block elements.)
    _BLOCK_ELEMENTS = frozenset(
        ('p', 'li', 'ul', 'pre', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
         'tr'))

    def __init__(self):
        try:
            # Python 3.4 and later: let the parser decode character
            # references before the text reaches handle_data().
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Older parsers reject the keyword; references then arrive
            # through handle_charref() and handle_entityref().
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # Quoting: need_quote is set while an opening quote is pending,
        # quote_stack holds the closing quotes of the open elements.
        self.need_quote = False
        self.quote_stack = []
        # One flag per open <span>: whether it pushed onto quote_stack.
        self._span_quoted = []
        # Suffixes
        self.need_suffix = False
        self.suffix_stack = []
        # track list items
        self.list_item_prefix = None
        self.ordered_list_index = None
        self.stack_list_item_prefix = []
        self.stack_ordered_list_index = []
        self.list_indent_level = 0
        self.list_item_indent = ""
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]
        # Don't dump CSS, scripts, etc.
        self.ignore_tags = ('head', 'style', 'script')
        self.ignore_level = 0
        # href footnotes.
        self.footnotes = []
        self.href = None

    @staticmethod
    def _first_attr(attrs, name):
        """Return the value of the first attribute called name, or None.

        None is also returned for attributes given without a value
        (such as ``<a href>``).
        """
        for key, value in attrs:
            if key == name:
                return value
        return None

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items."""
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            indent += ' '
        wrapper = TextWrapper(
            width=72,
            initial_indent=initial_indent,
            subsequent_indent=indent,
            break_on_hyphens=False)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Append the pending text block to the output, followed by newline.

        The block is word-wrapped unless it comes from a "pre" element.
        An empty block adds nothing, but the pending-space flag is
        always cleared.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def _close_quote(self):
        """Emit the closing quote of the innermost quoted element."""
        if not self.quote_stack:
            # Stray end tag without a matching start tag.
            return
        quote = self.quote_stack.pop()
        if self.need_quote:
            # No text was seen since the start tag, so the opening quote
            # was never written; drop the closing one as well.
            self.need_quote = False
        else:
            self.text_block += quote

    def handle_starttag(self, tag, attrs):
        # End a block of text on <br>, but also flush list items which are
        # not terminated.
        if tag in ('br', 'li'):
            self._commit_block('\n')
        if tag == 'code':
            self.need_quote = True
            self.quote_stack.append('`')
        if tag == 'pre':
            self.skip_wrap = True
        if tag in ('ol', 'ul'):
            # Flush text that precedes the nested list so it is wrapped
            # with the prefix of the item it belongs to.
            self._commit_block('\n')
            self.list_indent_level += 1
            self.list_item_indent = " " * (self.list_indent_level - 1)
            self.stack_ordered_list_index.append(self.ordered_list_index)
            self.stack_list_item_prefix.append(self.list_item_prefix)
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            # Items of a bullet list are never numbered, even inside an
            # ordered list; the outer counter is restored on </ul>.
            self.ordered_list_index = None
            self.list_item_prefix = self.list_item_indent + ' • '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = \
                self.list_item_indent + ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1
        if tag == 'a':
            href = self._first_attr(attrs, 'href')
            # Skip relative URLs and links. Always overwrite the previous
            # value so an earlier link cannot leak into this one.
            if href and '://' in href:
                self.href = href
            else:
                self.href = None
        if tag == 'span':
            el_class = self._first_attr(attrs, 'class') or ''
            quoted = 'menuseq' in el_class
            self._span_quoted.append(quoted)
            if quoted:
                self.need_quote = True
                self.quote_stack.append('"')
        if tag == 'div':
            el_class = self._first_attr(attrs, 'class') or ''
            if 'title' in el_class.split(' '):
                self.need_suffix = True
                self.suffix_stack.append(':')
        if tag in self.ignore_tags:
            self.ignore_level += 1

    def handle_data(self, data):
        if self.ignore_level > 0:
            # Leave pending quotes and suffixes for the next visible text.
            return
        quote = ''
        if self.need_quote and self.quote_stack:
            quote = self.quote_stack[-1]
        suffix = ''
        if self.need_suffix and self.suffix_stack:
            suffix = self.suffix_stack.pop()
        if self.skip_wrap:
            block = data
        else:
            if self.href and data == self.href:
                # This is a self link. Don't create a footnote.
                self.href = None
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = quote
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' ' + quote
            block += ' '.join(data.split()) + suffix
            self.need_space = data[-1:].isspace()
        self.text_block += block
        self.need_quote = False
        self.need_suffix = False

    def handle_endtag(self, tag):
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag == 'code':
            self._close_quote()
        elif tag == 'span':
            # Only spans that opened a quote may close one.
            quoted = self._span_quoted.pop() if self._span_quoted else False
            if quoted:
                self._close_quote()
        if tag in ('ol', 'ul') and self.list_indent_level > 0:
            self.list_indent_level -= 1
            self.list_item_indent = " " * (self.list_indent_level - 1)
            self.ordered_list_index = self.stack_ordered_list_index.pop()
            self.list_item_prefix = self.stack_list_item_prefix.pop()
        if tag == 'pre':
            self.skip_wrap = False
        if tag == 'a':
            if self.href:
                self.footnotes.append(self.href)
                self.text_block += '[{0}]'.format(len(self.footnotes))
            # The link is finished; forget its target.
            self.href = None
        if tag in self.ignore_tags and self.ignore_level > 0:
            self.ignore_level -= 1

    def handle_charref(self, name):
        """Handle a numeric reference; name is e.g. '65' or 'x41'."""
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a valid code point: keep the reference as written.
            text = '&#{0};'.format(name)
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handle a named reference such as 'amp'."""
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: keep the reference as written.
            text = '&{0};'.format(name)
        else:
            text = unichr(codepoint)
        self.handle_data(text)

    def close(self):
        """Finish parsing and write the converted text to standard output.

        Pending text is flushed; if any footnotes were collected, a
        "References" section with one numbered line per link is appended.
        The result is written as UTF-8 encoded bytes.
        """
        HTMLParser.close(self)
        self._commit_block()
        if self.footnotes:
            self.list_item_prefix = None
            self.indent_levels = [1, 0]
            self.text_block = 'References'
            self._commit_block()
            self.indent_levels = [1, 1]
            for footnote_num, href in enumerate(self.footnotes, 1):
                self.text_block += '{0:>2}. {1}\n'.format(footnote_num, href)
                # Commit per footnote so each one stays on its own line.
                self._commit_block('\n')
        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: bytes must go to the underlying binary stream.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
def main():
    """Convert a HTML document to text on standard output.

    The document is read from the file named by the first command-line
    argument, or from standard input when the argument is missing or is
    '-'. The input is decoded as UTF-8 before it is handed to the parser.
    """
    parser = TextHTMLParser()
    args = sys.argv[1:]
    if args and args[0] != '-':
        path = args[0]
        stream = open(path, 'rb')
    else:
        path = None
        stream = sys.stdin
    try:
        # Python 3 text streams expose their raw byte stream as .buffer;
        # use it so the input can be decoded explicitly below.
        stream = getattr(stream, 'buffer', stream)
        # Read the whole document as a Unicode string
        document = stream.read().decode('utf-8')
        parser.feed(document)
    finally:
        # Only close what was opened here; stdin is left alone.
        if path is not None:
            stream.close()
        parser.close()
if __name__ == '__main__':
    # Run the converter only when executed as a script and hand main()'s
    # return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)