From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 src/etc/htmldocck.py | 655 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 655 insertions(+)
 create mode 100644 src/etc/htmldocck.py

(limited to 'src/etc/htmldocck.py')

diff --git a/src/etc/htmldocck.py b/src/etc/htmldocck.py
new file mode 100644
index 000000000..d02ac9d9c
--- /dev/null
+++ b/src/etc/htmldocck.py
@@ -0,0 +1,655 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+r"""
+htmldocck.py is a custom checker script for Rustdoc HTML outputs.
+
+# How and why?
+
+The principle is simple: This script receives a path to generated HTML
+documentation and a "template" script, which has a series of check
+commands like `@has` or `@matches`. Each command is used to check if
+some pattern is present or not present in the particular file or in
+a particular node of the HTML tree. In many cases, the template script
+happens to be the source code given to rustdoc.
+
+While it indeed is possible to test in smaller portions, it has been
+hard to construct tests in this fashion and major rendering errors were
+discovered much later. This script is designed to make black-box and
+regression testing of Rustdoc easy. This does not preclude the need for
+unit testing, but can be used to complement related tests by quickly
+showing the expected renderings.
+
+In order to avoid one-off dependencies for this task, this script uses
+a reasonably working HTML parser and the existing XPath implementation
+from Python's standard library. Hopefully, we won't render
+non-well-formed HTML.
+
+# Commands
+
+Commands start with an `@` followed by a command name (letters and
+hyphens), and zero or more arguments separated by one or more whitespace
+characters and optionally delimited with single or double quotes. The `@`
+mark cannot be preceded by a non-whitespace character. Other lines
+(including every text up to the first `@`) are ignored, but it is
+recommended to avoid the use of `@` in the template file.
+
+There are a number of supported commands:
+
+* `@has PATH` checks for the existence of the given file.
+
+  `PATH` is relative to the output directory. It can be given as `-`
+  which repeats the most recently used `PATH`.
+
+* `@has PATH PATTERN` and `@matches PATH PATTERN` check for
+  the occurrence of the given pattern `PATTERN` in the specified file.
+  Only one occurrence of the pattern is enough.
+
+  For `@has`, `PATTERN` is a whitespace-normalized (every consecutive
+  whitespace being replaced by one single space character) string.
+  The entire file is also whitespace-normalized including newlines.
+
+  For `@matches`, `PATTERN` is a Python-supported regular expression.
+  The file remains intact but the regexp is matched without the `MULTILINE`
+  and `IGNORECASE` options. You can still use a prefix `(?m)` or `(?i)`
+  to override them, and `\A` and `\Z` for definitely matching
+  the beginning and end of the file.
+
+  (The same distinction goes to other variants of these commands.)
+
+* `@has PATH XPATH PATTERN` and `@matches PATH XPATH PATTERN` check for
+  the presence of the given XPath `XPATH` in the specified HTML file,
+  and also the occurrence of the given pattern `PATTERN` in the matching
+  node or attribute. Only one occurrence of the pattern in the match
+  is enough.
+
+  `PATH` should be a valid and well-formed HTML file. It does *not*
+  accept arbitrary HTML5; it should have matching open and close tags
+  and correct entity references at least.
+
+  `XPATH` is an XPath expression to match. The XPath is fairly limited:
+  `tag`, `*`, `.`, `//`, `..`, `[@attr]`, `[@attr='value']`, `[tag]`,
+  `[POS]` (element located in given `POS`), `[last()-POS]`, `text()`
+  and `@attr` (both as the last segment) are supported. Some examples:
+
+  - `//pre` or `.//pre` matches any element with a name `pre`.
+  - `//a[@href]` matches any element with an `href` attribute.
+  - `//*[@class="impl"]//code` matches any element with a name `code`,
+    which is a descendant of some element whose `class` attr is `impl`.
+  - `//h1[@class="fqn"]/span[1]/a[last()]/@class` matches a value of
+    `class` attribute in the last `a` element (can be followed by more
+    elements that are not `a`) inside the first `span` in the `h1` with
+    a class of `fqn`. Note that there cannot be any additional elements
+    between them due to the use of `/` instead of `//`.
+
+  Do not try to use non-absolute paths; they won't work due to the flawed
+  ElementTree implementation. The script rejects them.
+
+  For the text matches (i.e. paths not ending with `@attr`), any
+  subelements are flattened into one string; this is handy for ignoring
+  highlights for example. If you want to simply check for the presence of
+  a given node or attribute, use an empty string (`""`) as a `PATTERN`.
+
+* `@count PATH XPATH COUNT` checks for the occurrence of the given XPath
+  in the specified file. The number of occurrences must match the given
+  count.
+
+* `@count PATH XPATH TEXT COUNT` checks for the occurrence of the given XPath
+  with the given text in the specified file. The number of occurrences must
+  match the given count.
+
+* `@snapshot NAME PATH XPATH` creates a snapshot test named NAME.
+  A snapshot test captures a subtree of the DOM, at the location
+  determined by the XPath, and compares it to a pre-recorded value
+  in a file. The file's name is the test's name with the `.rs` extension
+  replaced with `.NAME.html`, where NAME is the snapshot's name.
+
+  htmldocck supports the `--bless` option to accept the current subtree
+  as expected, saving it to the file determined by the snapshot's name.
+  compiletest's `--bless` flag is forwarded to htmldocck.
+
+* `@has-dir PATH` checks for the existence of the given directory.
+
+All conditions can be negated with `!`. `@!has foo/type.NoSuch.html`
+checks if the given file does not exist, for example.
+
+"""
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+import codecs
+import io
+import sys
+import os.path
+import re
+import shlex
+from collections import namedtuple
+try:
+    from html.parser import HTMLParser
+except ImportError:
+    from HTMLParser import HTMLParser
+try:
+    from xml.etree import cElementTree as ET
+except ImportError:
+    from xml.etree import ElementTree as ET
+
+try:
+    from html.entities import name2codepoint
+except ImportError:
+    from htmlentitydefs import name2codepoint
+
+# "void elements" (no closing tag) from the HTML Standard section 12.1.2
+VOID_ELEMENTS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
+                     'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr'}  # closed by the parser right after opening
+
+# Python 2 -> 3 compatibility: fall back to `chr` where `unichr` does not exist
+try:
+    unichr
+except NameError:
+    unichr = chr
+
+
+channel = os.environ["DOC_RUST_LANG_ORG_CHANNEL"]  # replaces `{{channel}}` in patterns; KeyError at import if unset
+
+# Initialized in main
+rust_test_path = None  # path of the template file given on the command line
+bless = None  # True when `--bless` was passed as the last argument
+
+class CustomHTMLParser(HTMLParser):
+    """Minimal HTML parser feeding an ElementTree builder.
+
+    rustdoc output is regular enough that only i) void elements and
+    ii) empty attributes need special care; parse events are forwarded
+    to `target`, or to a fresh `ET.TreeBuilder` when none is given."""
+    def __init__(self, target=None):
+        HTMLParser.__init__(self)
+        self.__sink = ET.TreeBuilder() if not target else target
+
+    def handle_starttag(self, tag, attrs):
+        # falsy attribute values (e.g. a bare attribute) are stored as ''
+        self.__sink.start(tag, dict((key, val or '') for key, val in attrs))
+        if tag in VOID_ELEMENTS:
+            self.__sink.end(tag)
+
+    def handle_endtag(self, tag):
+        self.__sink.end(tag)
+
+    def handle_startendtag(self, tag, attrs):
+        # self-closing tag: open and close the element in one go
+        self.__sink.start(tag, dict((key, val or '') for key, val in attrs))
+        self.__sink.end(tag)
+
+    def handle_data(self, data):
+        self.__sink.data(data)
+
+    def handle_entityref(self, name):
+        self.__sink.data(unichr(name2codepoint[name]))
+
+    def handle_charref(self, name):
+        code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
+        self.__sink.data(unichr(code))
+
+    def close(self):
+        HTMLParser.close(self)
+        return self.__sink.close()
+
+
+Command = namedtuple('Command', 'negated cmd args lineno context')  # one parsed `@` directive; `lineno` is 1-based, `context` is the source line
+
+
+class FailedCheck(Exception):
+    """Raised when a check was carried out and its expectation did not hold."""
+
+
+class InvalidCheck(Exception):
+    """Raised when a command is malformed or cannot be carried out as written."""
+
+
+def concat_multi_lines(f):
+    """Yield `(lineno, text)` pairs from the file object `f`, joining each run
+    of lines ending in a backslash: the backslash and newline are removed,
+    then the prefix shared with the first line of the run, then leading
+    whitespace. `lineno` is the 0-based number of the first physical line.
+    An unfinished run at end of file is reported through `print_err`."""
+    lastline = None  # first line of the current run (backslash removed), else None
+    firstlineno = None
+    catenated = ''
+    for lineno, line in enumerate(f):
+        line = line.rstrip('\r\n')
+
+        # strip the common prefix from the current line if needed
+        if lastline is not None:
+            common_prefix = os.path.commonprefix([line, lastline])
+            line = line[len(common_prefix):].lstrip()
+
+        firstlineno = lineno if firstlineno is None else firstlineno  # 0 is a valid start
+        if line.endswith('\\'):
+            if lastline is None:
+                lastline = line[:-1]
+            catenated += line[:-1]
+        else:
+            yield firstlineno, catenated + line
+            lastline = None
+            firstlineno = None
+            catenated = ''
+
+    if lastline is not None:
+        print_err(lineno, line, 'Trailing backslash at the end of the file')
+
+
+LINE_PATTERN = re.compile(r'''
+    (?<=(?<!\S))(?P<invalid>!?)@(?P<negated>!?)
+    (?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
+    (?P<args>.*)$
+''', re.X | re.UNICODE)  # one directive: `@`, optional `!`, command name, raw argument text; `invalid` captures a misplaced `!` before `@`
+
+
+def get_commands(template):
+    """Parse the template file at path `template` (read as UTF-8) and yield a
+    `Command` for every `@` directive found in it.
+
+    Malformed directives are reported through `print_err` and skipped. All
+    line numbers, both reported and stored in `Command.lineno`, are 1-based."""
+    with io.open(template, encoding='utf-8') as f:
+        for lineno, line in concat_multi_lines(f):
+            m = LINE_PATTERN.search(line)
+            if not m:
+                continue
+
+            lineno += 1  # concat_multi_lines counts from 0; reports count from 1
+            negated = (m.group('negated') == '!')
+            cmd = m.group('cmd')
+            if m.group('invalid') == '!':
+                help_msg = 'Invalid command: `!@{0}{1}`, (help: try with `@!{1}`)'
+                print_err(lineno, line, help_msg.format('!' if negated else '', cmd))
+                continue
+            args = m.group('args')
+            if args and not args[:1].isspace():
+                print_err(lineno, line, 'Invalid template syntax')
+                continue
+            try:
+                args = shlex.split(args)
+            except UnicodeEncodeError:
+                args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
+            yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno, context=line)
+
+
+def _flatten(node, acc):
+    """Append to the list `acc`, in document order, every text fragment found
+    inside `node` (its own text, then each child's content and tail)."""
+    acc.extend([node.text] if node.text else [])
+    for child in node:
+        _flatten(child, acc)
+        acc.extend([child.tail] if child.tail else [])
+
+
+def flatten(node):  # returns all text inside `node` as one string, subelement markup dropped
+    acc = []  # filled in place by _flatten
+    _flatten(node, acc)
+    return ''.join(acc)
+
+
+def make_xml(text):
+    """Parse the markup `text`, wrapped in an `<xml>` root, into an element."""
+    return ET.XML('<xml>%s</xml>' % text)
+
+
+def normalize_xpath(path):
+    """Expand `{{channel}}` and return `path` in `.//` form, else raise InvalidCheck."""
+    path = path.replace("{{channel}}", channel)
+    if path.startswith('.//'):
+        return path
+    if path.startswith('//'):
+        return '.' + path  # avoid warnings
+    raise InvalidCheck('Non-absolute XPath is not supported due to implementation issues')
+
+
+class CachedFiles(object):
+    """Lazy loader for the files of one documentation directory.
+
+    Paths are taken relative to `root`. File contents and parsed trees
+    are cached per normalized path, and the path `-` stands for the most
+    recently requested one."""
+    def __init__(self, root):
+        self.root = root
+        self.files = {}
+        self.trees = {}
+        self.last_path = None
+
+    def resolve_path(self, path):
+        """Normalize `path`, or return the remembered one when it is `-`."""
+        if path == '-':
+            if self.last_path is None:
+                raise InvalidCheck('Tried to use the previous path in the first command')
+            return self.last_path
+        self.last_path = os.path.normpath(path)
+        return self.last_path
+
+    def get_file(self, path):
+        """Return the UTF-8 decoded contents of `path`; FailedCheck if absent."""
+        path = self.resolve_path(path)
+        if path not in self.files:
+            abspath = os.path.join(self.root, path)
+            if not os.path.isfile(abspath):
+                raise FailedCheck('File does not exist {!r}'.format(path))
+            with io.open(abspath, encoding='utf-8') as f:
+                self.files[path] = f.read()
+        return self.files[path]
+
+    def get_tree(self, path):
+        """Return the parsed element tree of `path`; FailedCheck if absent."""
+        path = self.resolve_path(path)
+        if path not in self.trees:
+            abspath = os.path.join(self.root, path)
+            if not os.path.isfile(abspath):
+                raise FailedCheck('File does not exist {!r}'.format(path))
+            with io.open(abspath, encoding='utf-8') as f:
+                # a parse failure surfaces as RuntimeError, not as FailedCheck
+                try:
+                    parsed = ET.fromstringlist(f.readlines(), CustomHTMLParser())
+                except Exception as e:
+                    raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
+            self.trees[path] = parsed
+        return self.trees[path]
+
+    def get_dir(self, path):
+        """Raise FailedCheck unless `path` is an existing directory."""
+        path = self.resolve_path(path)
+        if not os.path.isdir(os.path.join(self.root, path)):
+            raise FailedCheck('Directory does not exist {!r}'.format(path))
+
+
+def check_string(data, pat, regexp):
+    """True if `pat` occurs in `data`: as a regex search when `regexp` is set,
+    else as a whitespace-normalized substring. An empty `pat` always matches."""
+    pat = pat.replace("{{channel}}", channel)
+    if not pat:
+        return True  # special case a presence testing
+    if regexp:
+        return bool(re.search(pat, data, flags=re.UNICODE))
+    data, pat = ' '.join(data.split()), ' '.join(pat.split())
+    return pat in data
+
+
+def check_tree_attr(tree, path, attr, pat, regexp):
+    """Tell whether an attribute value in the tree satisfies a pattern.
+
+    Looks at every element matched by the XPath `path`; elements lacking
+    the attribute `attr` are skipped. Returns True as soon as one value
+    is accepted by `check_string(value, pat, regexp)`, else False."""
+    path = normalize_xpath(path)
+    values = (
+        element.attrib[attr]
+        for element in tree.findall(path)
+        if attr in element.attrib
+    )
+    return any(check_string(value, pat, regexp) for value in values)
+
+
+# Returns how many elements matched by `path` have flattened text accepted by `check_string`.
+def check_tree_text(tree, path, pat, regexp, stop_at_first):
+    path = normalize_xpath(path)
+    hits = 0
+    try:
+        for element in tree.findall(path):
+            try:
+                text = flatten(element)
+            except KeyError:
+                continue
+            if not check_string(text, pat, regexp):
+                continue
+            hits += 1
+            if stop_at_first:
+                break
+    except Exception:
+        print('Failed to get path "{}"'.format(path))
+        raise
+    return hits
+
+
+def get_tree_count(tree, path):
+    """Return how many elements of `tree` the XPath `path` matches."""
+    return len(tree.findall(normalize_xpath(path)))
+
+
+def check_snapshot(snapshot_name, actual_tree, normalize_to_text):
+    """Compare `actual_tree` with the snapshot file `<test>.<snapshot_name>.html`,
+    always treated as UTF-8. Flattened text is compared when `normalize_to_text`
+    is set, trees otherwise. A missing or different snapshot raises FailedCheck,
+    unless `bless` is set, in which case the snapshot file is (re)written."""
+    assert rust_test_path.endswith('.rs')
+    snapshot_path = '{}.{}.{}'.format(rust_test_path[:-3], snapshot_name, 'html')
+    try:
+        with io.open(snapshot_path, 'r', encoding='utf-8') as snapshot_file:
+            expected_str = snapshot_file.read().replace("{{channel}}", channel)
+    except FileNotFoundError:
+        if bless:
+            expected_str = None
+        else:
+            raise FailedCheck('No saved snapshot value')
+
+    if not normalize_to_text:
+        actual_str = ET.tostring(actual_tree).decode('utf-8')
+    else:
+        actual_str = flatten(actual_tree)
+
+    # Rewrite/fail when: the snapshot is missing or empty, the trees differ, or the text differs
+    if not expected_str \
+        or (not normalize_to_text and \
+            not compare_tree(make_xml(actual_str), make_xml(expected_str), stderr)) \
+        or (normalize_to_text and actual_str != expected_str):
+        if bless:
+            with io.open(snapshot_path, 'w', encoding='utf-8') as snapshot_file:
+                actual_str = actual_str.replace(channel, "{{channel}}")
+                snapshot_file.write(actual_str)
+        else:
+            print('--- expected ---\n')
+            print(expected_str)
+            print('\n\n--- actual ---\n')
+            print(actual_str)
+            print()
+            raise FailedCheck('Actual snapshot value is different than expected')
+
+
+# Adapted from https://github.com/formencode/formencode/blob/3a1ba9de2fdd494dd945510a4568a3afeddb0b2e/formencode/doctest_xml_compare.py#L72-L120
+def compare_tree(x1, x2, reporter=None):  # True if elements x1 and x2 match recursively; `reporter`, if given, is called with a description of each mismatch
+    if x1.tag != x2.tag:
+        if reporter:
+            reporter('Tags do not match: %s and %s' % (x1.tag, x2.tag))
+        return False
+    for name, value in x1.attrib.items():  # every attribute of x1 must have the same value in x2
+        if x2.attrib.get(name) != value:
+            if reporter:
+                reporter('Attributes do not match: %s=%r, %s=%r'
+                         % (name, value, name, x2.attrib.get(name)))
+            return False
+    for name in x2.attrib:  # and x2 must not carry attributes that x1 lacks
+        if name not in x1.attrib:
+            if reporter:
+                reporter('x2 has an attribute x1 is missing: %s'
+                         % name)
+            return False
+    if not text_compare(x1.text, x2.text):  # text before the first child
+        if reporter:
+            reporter('text: %r != %r' % (x1.text, x2.text))
+        return False
+    if not text_compare(x1.tail, x2.tail):  # text following the element's closing tag
+        if reporter:
+            reporter('tail: %r != %r' % (x1.tail, x2.tail))
+        return False
+    cl1 = list(x1)  # direct children of x1
+    cl2 = list(x2)  # direct children of x2
+    if len(cl1) != len(cl2):
+        if reporter:
+            reporter('children length differs, %i != %i'
+                     % (len(cl1), len(cl2)))
+        return False
+    i = 0  # 1-based child position, used only in the report
+    for c1, c2 in zip(cl1, cl2):  # equal lengths were established just above
+        i += 1
+        if not compare_tree(c1, c2, reporter=reporter):
+            if reporter:
+                reporter('children %i do not match: %s'
+                         % (i, c1.tag))
+            return False
+    return True
+
+
+def text_compare(t1, t2):
+    """Loose text equality: `*` on either side matches anything, missing text
+    equals empty text, and surrounding whitespace is ignored."""
+    if '*' in (t1, t2):
+        return True
+    return (t1 or '').strip() == (t2 or '').strip()
+
+
+def stderr(*args):
+    """Print `args` to standard error (through a UTF-8 writer on Python 2)."""
+    if sys.version_info.major >= 3:
+        out = sys.stderr
+    else:
+        out = codecs.getwriter('utf-8')(sys.stderr)
+    print(*args, file=out)
+
+
+def print_err(lineno, context, err, message=None):
+    """Count one error in `ERR_COUNT` and report it on stderr.
+    The first line is `lineno: message or err`; indented lines follow with
+    `err` (only when a message was also given) and with `context`."""
+    global ERR_COUNT
+    ERR_COUNT += 1
+    stderr("{}: {}".format(lineno, message or err))
+    for extra in ([err] if message and err else []) + ([context] if context else []):
+        stderr("\t{}".format(extra))
+
+
+def get_nb_matching_elements(cache, c, regexp, stop_at_first):
+    """Count the matches of `c.args[2]` at XPath `c.args[1]` in file `c.args[0]`.
+    An XPath containing `/@attr` tests that attribute and yields 0 or 1; other
+    XPaths (a trailing `/text()` is dropped) are counted by `check_tree_text`.
+    `regexp` selects regular-expression matching in both cases."""
+    tree = cache.get_tree(c.args[0])
+    xpath, sep, attr = c.args[1].partition('/@')
+    if sep:  # attribute: forward `regexp` so that `@matches` really is a regex test
+        return int(check_tree_attr(tree, xpath, attr, c.args[2], regexp))
+    xpath = c.args[1][:-7] if c.args[1].endswith('/text()') else c.args[1]
+    return check_tree_text(tree, xpath, c.args[2], regexp, stop_at_first)
+
+
+ERR_COUNT = 0  # number of errors reported through print_err; a non-zero value makes the script exit with status 1
+
+
+def check_command(c, cache):  # evaluate one Command against `cache`; FailedCheck/InvalidCheck are reported via print_err
+    try:
+        cerr = ""  # explanation attached to the report if this check fails
+        if c.cmd == 'has' or c.cmd == 'matches':  # string test
+            regexp = (c.cmd == 'matches')
+            if len(c.args) == 1 and not regexp:  # @has <path> = file existence
+                try:
+                    cache.get_file(c.args[0])
+                    ret = True
+                except FailedCheck as err:
+                    cerr = str(err)
+                    ret = False
+            elif len(c.args) == 2:  # @has/matches <path> <pat> = string test
+                cerr = "`PATTERN` did not match"
+                ret = check_string(cache.get_file(c.args[0]), c.args[1], regexp)
+            elif len(c.args) == 3:  # @has/matches <path> <pat> <match> = XML tree test
+                cerr = "`XPATH PATTERN` did not match"
+                ret = get_nb_matching_elements(cache, c, regexp, True) != 0
+            else:
+                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
+
+        elif c.cmd == 'count':  # count test
+            if len(c.args) == 3:  # @count <path> <pat> <count> = count test
+                expected = int(c.args[2])
+                found = get_tree_count(cache.get_tree(c.args[0]), c.args[1])
+                cerr = "Expected {} occurrences but found {}".format(expected, found)
+                ret = expected == found
+            elif len(c.args) == 4:  # @count <path> <pat> <text> <count> = count test
+                expected = int(c.args[3])
+                found = get_nb_matching_elements(cache, c, False, False)
+                cerr = "Expected {} occurrences but found {}".format(expected, found)
+                ret = found == expected
+            else:
+                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
+
+        elif c.cmd == 'snapshot':  # snapshot test
+            if len(c.args) == 3:  # @snapshot <snapshot-name> <html-path> <xpath>
+                [snapshot_name, html_path, pattern] = c.args
+                tree = cache.get_tree(html_path)
+                xpath = normalize_xpath(pattern)
+                normalize_to_text = False
+                if xpath.endswith('/text()'):  # compare flattened text instead of the subtree
+                    xpath = xpath[:-7]
+                    normalize_to_text = True
+
+                subtrees = tree.findall(xpath)
+                if len(subtrees) == 1:  # a snapshot needs exactly one matching node
+                    [subtree] = subtrees
+                    try:
+                        check_snapshot(snapshot_name, subtree, normalize_to_text)
+                        ret = True
+                    except FailedCheck as err:
+                        cerr = str(err)
+                        ret = False
+                elif len(subtrees) == 0:
+                    raise FailedCheck('XPATH did not match')
+                else:
+                    raise FailedCheck('Expected 1 match, but found {}'.format(len(subtrees)))
+            else:
+                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
+
+        elif c.cmd == 'has-dir':  # has-dir test
+            if len(c.args) == 1:  # @has-dir <path> = has-dir test
+                try:
+                    cache.get_dir(c.args[0])
+                    ret = True
+                except FailedCheck as err:
+                    cerr = str(err)
+                    ret = False
+            else:
+                raise InvalidCheck('Invalid number of @{} arguments'.format(c.cmd))
+
+        elif c.cmd == 'valid-html':
+            raise InvalidCheck('Unimplemented @valid-html')
+
+        elif c.cmd == 'valid-links':
+            raise InvalidCheck('Unimplemented @valid-links')
+
+        else:
+            raise InvalidCheck('Unrecognized @{}'.format(c.cmd))
+
+        if ret == c.negated:  # the outcome is the opposite of what the command demands
+            raise FailedCheck(cerr)
+
+    except FailedCheck as err:  # the check ran but did not hold
+        message = '@{}{} check failed'.format('!' if c.negated else '', c.cmd)
+        print_err(c.lineno, c.context, str(err), message)
+    except InvalidCheck as err:  # the command itself is malformed or unsupported
+        print_err(c.lineno, c.context, str(err))
+
+
+def check(target, commands):  # run every command against the documentation directory `target`
+    cache = CachedFiles(target)  # one cache shared by all commands, so `-` can refer to the previous path
+    for c in commands:
+        check_command(c, cache)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) not in [3, 4]:
+        stderr('Usage: {} <doc dir> <template> [--bless]'.format(sys.argv[0]))
+        raise SystemExit(1)
+
+    rust_test_path = sys.argv[2]
+    if len(sys.argv) > 3 and sys.argv[3] == '--bless':
+        bless = True
+    else:
+        # `--bless` must come last; a real error (unlike `assert`) survives `python -O`.
+        if '--bless' in sys.argv:
+            raise SystemExit('`--bless` is only supported as the last argument')
+        bless = False
+    check(sys.argv[1], get_commands(rust_test_path))
+    if ERR_COUNT:  # every failed or invalid check was counted by print_err
+        stderr("\nEncountered {} errors".format(ERR_COUNT))
+        raise SystemExit(1)
-- 
cgit v1.2.3