firefox/tools/lint/fluent-lint/__init__.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import bisect
import os
import re
from html.parser import HTMLParser

import mozpack.path as mozpath
import yaml
from fluent.syntax import ast, parse, visitor
from mozlint import result
from mozlint.pathutils import expand_exclusions


class TextElementHTMLParser(HTMLParser):
    """Collect the plain-text portions of a Fluent TextElement.

    A TextElement can carry inline HTML markup whose attributes contain
    quote characters. Feeding the value through this parser separates the
    text between tags from the markup, so that only the text is linted.
    """

    def __init__(self) -> None:
        HTMLParser.__init__(self)
        # Text chunks in the order the parser reported them.
        self.extracted_text: list[str] = list()

    def handle_data(self, data: str) -> None:
        """Record one chunk of text reported by the parser."""
        self.extracted_text += [data]


class Linter(visitor.Visitor):
    """Fluent linter implementation.

    This subclasses the Fluent AST visitor. Methods are called corresponding
    to each type of node in the Fluent AST. It is possible to control
    whether a node is recursed into by calling the generic_visit method on
    the superclass.

    Every problem found is appended to ``self.results``.

    See the documentation here:
    https://www.projectfluent.org/python-fluent/fluent.syntax/stable/usage.html
    """

    def __init__(
        self, path, config, exclusions, contents, offsets_and_lines, brand_names=None
    ):
        """Set up the linter for a single file.

        :param path: Path of the file being linted; reported with each result
            and compared against the per-rule "files" exclusions.
        :param config: Linter configuration, forwarded to
            ``result.from_config`` for every reported issue.
        :param exclusions: Mapping of rule id (e.g. "ID01") to a mapping with
            "files" and "messages" entries listing what is exempt from
            that rule.
        :param contents: Full text of the file; used to count the newlines
            surrounding resource and group comments.
        :param offsets_and_lines: Sorted list of ``(offset, line)`` tuples
            used to translate span offsets into line and column numbers.
        :param brand_names: Optional list of brand names that must not be
            hard-coded in messages. Defaults to no brand names.
        """
        super().__init__()
        self.path = path
        self.config = config
        self.exclusions = exclusions
        self.contents = contents
        self.offsets_and_lines = offsets_and_lines

        self.results = []
        self.identifier_re = re.compile(r"[a-z0-9-]+")
        self.apostrophe_re = re.compile(r"\w'")
        self.incorrect_apostrophe_re = re.compile(r"\w\u2018\w")
        self.single_quote_re = re.compile(r"'(.+)'")
        self.double_quote_re = re.compile(r"\".+\"")
        self.ellipsis_re = re.compile(r"\.\.\.")

        # Avoid a shared mutable default: each linter without explicit brand
        # names gets its own empty list.
        self.brand_names = brand_names if brand_names is not None else []
        self.minimum_id_length = 9

        # Id of the message currently being visited, or None while inside a
        # term (or before any message or term has been seen).
        self.last_message_id = None

        self.state = {
            # The resource comment should be at the top of the page after the license.
            "node_can_be_resource_comment": True,
            # Group comments must be followed by a message. Two group comments are not
            # allowed in a row.
            "can_have_group_comment": True,
            # Comment bound to the current message
            "comment": "",
            # The current group comment
            "group_comment": "",
            # Variables in the current message
            "variables": [],
        }

        attributes = [
            "label",
            "value",
            "accesskey",
            "alt",
            "title",
            "tooltiptext",
            "placeholder",
            "aria-label",
            "aria-description",
            "aria-valuetext",
            "style",
            # For XUL key/command setup.
            "key",
            "keycode",
            # For download filenames:
            "download",
            # Used in the Firefox prefs
            "searchkeywords",
            # Used by search-textbox.js
            "searchbuttonlabel",
            # Used in toolbar customization.
            "toolbarname",
            # Used in moz-message-bar.
            "message",
            # Used in dialogs (should be moved to using fluent IDs though)
            "buttonlabelaccept",
            "buttonaccesskeyaccept",
            "buttonlabelcancel",
            "buttonaccesskeycancel",
            "buttonlabelextra2",
            "buttonaccesskeyextra2",
            # Used in app menu notifications (should be moved to use fluent IDs)
            "buttonlabel",
            "buttonaccesskey",
            "secondarybuttonlabel",
            "secondarybuttonaccesskey",
            # Commonly used in Lit-based web components
            "heading",
            "description",
        ]
        # Attribute names are compared case-insensitively (see visit_Message).
        self.known_attribute_list = [a.lower() for a in attributes]

        # Set this to true to debug print the root node's json. This is useful for
        # writing new lint rules, or debugging existing ones.
        self.debug_print_json = False

    def generic_visit(self, node):
        """Visit ``node`` and track whether a resource comment is still allowed.

        A resource comment is only permitted while every node seen so far was
        the root Resource, a Span or a plain Comment.
        """
        node_name = type(node).__name__
        self.state["node_can_be_resource_comment"] = self.state[
            "node_can_be_resource_comment"
        ] and (
            # This is the root node.
            node_name == "Resource"
            # Empty space is allowed.
            or node_name == "Span"
            # Comments are allowed
            or node_name == "Comment"
        )

        if self.debug_print_json:
            import json

            print(json.dumps(node.to_json(), indent=2))
            # Only debug print the root node.
            self.debug_print_json = False

        super().generic_visit(node)

    def visit_Attribute(self, node):
        """Lint only the value of an attribute, not its identifier."""
        # Only visit values for Attribute nodes, the identifier comes from dom.
        super().generic_visit(node.value)

    def visit_FunctionReference(self, node):
        """Skip function references entirely."""
        # We don't recurse into function references, the identifiers there are
        # allowed to be free form.
        pass

    def visit_Message(self, node):
        """Lint a message, then check its attributes and variable comments.

        Reports VA01 (as a warning) for attributes that are neither in the
        known attribute list nor mentioned as ``.name`` in the message or
        group comment, and VC01 for variables that are not referenced as
        ``$name`` in those comments.
        """
        # There must be at least one message or term between group comments.
        self.state["can_have_group_comment"] = True
        self.last_message_id = node.id.name

        super().generic_visit(node)

        # Message comment followed by the current group comment; both checks
        # below search this combined text.
        comments = self.state["comment"] + self.state["group_comment"]

        # Do this here instead as visit_Attribute doesn't have access to the
        # message's comment.
        for attr in node.attributes:
            if attr.id.name.lower() in self.known_attribute_list:
                continue
            if f".{attr.id.name}" not in comments:
                self.add_error(
                    attr,
                    "VA01",
                    "Use attributes designed for localized content directly."
                    " If script-based processing is necessary, add a comment"
                    f" explaining why. The linter didn't recognize: .{attr.id.name}",
                    "warning",
                )

        # Check if variables are referenced in comments
        if self.state["variables"]:
            missing_references = [
                v for v in self.state["variables"] if f"${v}" not in comments
            ]
            if missing_references:
                self.add_error(
                    node,
                    "VC01",
                    "Messages including variables should have a comment "
                    "explaining what will replace the variable. "
                    "Missing references: "
                    + ", ".join([f"${m}" for m in missing_references]),
                )

        # Reset current comment and variable references after reading the
        # message.
        self.state["comment"] = ""
        self.state["variables"] = []

    def visit_Term(self, node):
        """Lint a term; brand-name checks are disabled while inside it."""
        # There must be at least one message or term between group comments.
        self.state["can_have_group_comment"] = True
        self.last_message_id = None

        super().generic_visit(node)

        # Reset current comment and variable references after reading the term.
        self.state["comment"] = ""
        self.state["variables"] = []

    def visit_MessageReference(self, node):
        """Skip message references entirely."""
        # We don't recurse into message references, the identifiers are either
        # checked elsewhere or are attributes and come from DOM.
        pass

    def visit_Identifier(self, node):
        """Check an identifier's characters (ID01) and minimum length (ID02)."""
        if (
            self.path not in self.exclusions["ID01"]["files"]
            and node.name not in self.exclusions["ID01"]["messages"]
            and not self.identifier_re.fullmatch(node.name)
        ):
            self.add_error(
                node,
                "ID01",
                f"Identifiers may only contain lowercase characters and - (ID: {node.name})",
            )
        if (
            len(node.name) < self.minimum_id_length
            and self.path not in self.exclusions["ID02"]["files"]
            and node.name not in self.exclusions["ID02"]["messages"]
        ):
            self.add_error(
                node,
                "ID02",
                f"Identifiers must be at least {self.minimum_id_length} characters long (ID: {node.name})",
            )

    def visit_TextElement(self, node):
        """Check the text of a TextElement for typography and brand names.

        The value is first run through TextElementHTMLParser so that only
        the text between HTML tags is checked (TE01-TE05, CO01).
        """
        parser = TextElementHTMLParser()
        parser.feed(node.value)
        for text in parser.extracted_text:
            # To check for apostrophes, first remove pairs of straight quotes
            # used as delimiters. The replacement must be a raw string so
            # that it is a backreference to the quoted text rather than the
            # control character "\x01", and the check runs on the extracted
            # text so that quotes in HTML attributes are ignored and each
            # chunk is only reported once.
            cleaned_str = self.single_quote_re.sub(r"\1", text)
            if self.apostrophe_re.search(cleaned_str):
                self.add_error(
                    node,
                    "TE01",
                    "Strings with apostrophes should use foo\u2019s instead of foo's.",
                )
            if self.incorrect_apostrophe_re.search(text):
                self.add_error(
                    node,
                    "TE02",
                    "Strings with apostrophes should use foo\u2019s instead of foo\u2018s.",
                )
            if self.single_quote_re.search(text):
                self.add_error(
                    node,
                    "TE03",
                    "Single-quoted strings should use Unicode \u2018foo\u2019 instead of 'foo'.",
                )
            if self.double_quote_re.search(text):
                self.add_error(
                    node,
                    "TE04",
                    'Double-quoted strings should use Unicode \u201cfoo\u201d instead of "foo".',
                )
            if self.ellipsis_re.search(text):
                self.add_error(
                    node,
                    "TE05",
                    "Strings with an ellipsis should use the Unicode \u2026 character"
                    " instead of three periods",
                )

            # If part of a message, check for brand names
            if (
                self.last_message_id is not None
                and self.path not in self.exclusions["CO01"]["files"]
                and self.last_message_id not in self.exclusions["CO01"]["messages"]
            ):
                found_brands = [
                    brand
                    for brand in self.brand_names
                    if re.search(rf"\b{re.escape(brand)}\b", text)
                ]
                if found_brands:
                    self.add_error(
                        node,
                        "CO01",
                        "Strings should use the corresponding terms instead of"
                        f" hard-coded brand names ({', '.join(found_brands)})",
                    )

    def visit_ResourceComment(self, node):
        """Check the placement and spacing of a resource comment (RC01-RC03)."""
        # This node is a comment with: "###"
        if not self.state["node_can_be_resource_comment"]:
            self.add_error(
                node,
                "RC01",
                "Resource comments (###) should be placed at the top of the file, just "
                "after the license header. There should only be one resource comment "
                "per file.",
            )
            return

        lines_after = get_newlines_count_after(node.span, self.contents)
        lines_before = get_newlines_count_before(node.span, self.contents)

        if node.span.end == len(self.contents) - 1:
            # This file only contains a resource comment.
            return

        # Two consecutive newlines correspond to exactly one empty line.
        if lines_after != 2:
            self.add_error(
                node,
                "RC02",
                "Resource comments (###) should be followed by one empty line.",
            )
            return

        if lines_before != 2:
            self.add_error(
                node,
                "RC03",
                "Resource comments (###) should have one empty line above them.",
            )

    def visit_SelectExpression(self, node):
        """Lint the variant values and record the selector variable, if any."""
        # We only want to visit the variant values, the identifiers in selectors
        # and keys are allowed to be free form.
        for variant in node.variants:
            super().generic_visit(variant.value)

        # Store the variable used for the SelectExpression, excluding functions
        # like PLATFORM()
        if (
            isinstance(node.selector, ast.VariableReference)
            and node.selector.id.name not in self.state["variables"]
        ):
            self.state["variables"].append(node.selector.id.name)

    def visit_Comment(self, node):
        """Remember the content of the most recent "#" comment."""
        # This node is a comment with: "#"

        # Store the comment
        self.state["comment"] = node.content

    def visit_GroupComment(self, node):
        """Store a group comment and check its placement (GC01-GC04)."""
        # This node is a comment with: "##"

        # Store the group comment
        self.state["group_comment"] = node.content

        if not self.state["can_have_group_comment"]:
            self.add_error(
                node,
                "GC04",
                "Group comments (##) must be followed by at least one message "
                "or term. Make sure that a single group comment with multiple "
                "paragraphs is not separated by whitespace, as it will be "
                "interpreted as two different comments.",
            )
            return

        self.state["can_have_group_comment"] = False

        lines_after = get_newlines_count_after(node.span, self.contents)
        lines_before = get_newlines_count_before(node.span, self.contents)

        if node.span.end == len(self.contents) - 1:
            # The group comment is the last thing in the file.

            if node.content == "":
                # Empty comments are allowed at the end of the file.
                return

            self.add_error(
                node,
                "GC01",
                "Group comments (##) should not be at the end of the file, they should "
                "always be above a message. Only an empty group comment is allowed at "
                "the end of a file.",
            )
            return

        # Two consecutive newlines correspond to exactly one empty line.
        if lines_after != 2:
            self.add_error(
                node,
                "GC02",
                "Group comments (##) should be followed by one empty line.",
            )
            return

        if lines_before != 2:
            self.add_error(
                node,
                "GC03",
                "Group comments (##) should have an empty line before them.",
            )

    def visit_VariableReference(self, node):
        """Record a referenced variable for the comment check in visit_Message."""
        # Identifiers are allowed to be free form, but need to store them
        # for comment checks.

        if node.id.name not in self.state["variables"]:
            self.state["variables"].append(node.id.name)

    def add_error(self, node, rule, msg, level=None):
        """Append a lint result for ``node`` to ``self.results``.

        :param node: AST node whose span locates the issue.
        :param rule: Rule id, e.g. "TE01".
        :param msg: Human readable description of the issue.
        :param level: Optional level (e.g. "warning"); when omitted no
            "level" is passed to ``result.from_config``.
        """
        (col, line) = self.span_to_line_and_col(node.span)
        res = {
            "path": self.path,
            "lineno": line,
            "column": col,
            "rule": rule,
            "message": msg,
        }
        if level:
            res["level"] = level

        self.results.append(result.from_config(self.config, **res))

    def span_to_line_and_col(self, span):
        """Translate the start of ``span`` into a ``(col, line)`` tuple.

        Note that, despite the method name, the column comes first in the
        returned tuple.
        """
        offsets = self.offsets_and_lines
        i = bisect.bisect_left(offsets, (span.start, 0))
        if i > 0:
            col = span.start - offsets[i - 1][0]
        else:
            col = 1 + span.start

        if i < len(offsets):
            line = offsets[i][1]
        elif offsets:
            # The span starts after the last recorded newline (e.g. the file
            # has no trailing newline), so it is on the line that follows.
            line = offsets[-1][1] + 1
        else:
            # No newline was recorded at all: everything is on the first line.
            line = 1
        return (col, line)


def get_offsets_and_lines(contents):
    """Build the offset-to-line lookup table for ``contents``.

    Spans in the Fluent AST are expressed as character offsets into the
    file. The returned list holds one ``(offset, line)`` tuple per newline
    character, where ``offset`` is the position of that newline and ``line``
    is the 1-based number of the line it terminates, so that errors can be
    reported using line and column.
    """
    return [
        (newline.start(), line_number)
        for line_number, newline in enumerate(re.finditer(r"\n", contents), start=1)
    ]


def get_newlines_count_after(span, contents):
    """Return how many consecutive newlines start at ``span.end``.

    Scanning stops at the first character that is not a newline or at the
    end of ``contents``. Carriage returns are not supported and trip an
    assertion.
    """
    total = len(contents)
    cursor = span.end
    while cursor < total:
        char = contents[cursor]
        assert char != "\r", "This linter does not handle \\r characters."
        if char != "\n":
            break
        cursor += 1

    # Every position the cursor advanced over was a newline.
    return cursor - span.end


def get_newlines_count_before(span, contents):
    """Return how many consecutive newlines immediately precede ``span.start``.

    Scanning walks backwards from the character just before the span and
    stops at the first character that is not a newline or at the start of
    ``contents``. Carriage returns are not supported and trip an assertion.
    """
    count = 0
    # The stop value is -1 so that the character at index 0 is inspected too;
    # otherwise a newline at the very start of the file would not be counted.
    for i in range(span.start - 1, -1, -1):
        assert contents[i] != "\r", "This linter does not handle \\r characters."
        if contents[i] != "\n":
            break
        count += 1

    return count


def get_exclusions(root):
    """Load the per-rule exclusions from the linter's ``exclusions.yml``.

    The file is read from ``tools/lint/fluent-lint/exclusions.yml`` below
    ``root`` and only its first YAML document is used.

    :param root: Root directory that the exclusions file and the excluded
        file paths are relative to.
    :returns: The loaded mapping of rule id to exclusion data, where each
        rule's "files" entry has been replaced by a set of paths joined onto
        ``root`` and each rule's "messages" entry is always a list.
    """
    exclusions_path = mozpath.join(
        root, "tools", "lint", "fluent-lint", "exclusions.yml"
    )
    # Read as UTF-8 explicitly, like the other files opened in this module,
    # instead of depending on the platform's default encoding.
    with open(exclusions_path, encoding="utf-8") as f:
        exclusions = list(yaml.safe_load_all(f))[0]

    for rule in exclusions.values():
        # An omitted or empty YAML entry is treated as "no exclusions" rather
        # than failing here or later during the membership checks.
        rule["files"] = {mozpath.join(root, x) for x in rule.get("files") or []}
        rule["messages"] = rule.get("messages") or []
    return exclusions


def get_branding_list(root, brand_files):
    """Collect brand names from the terms defined in the given Fluent files.

    Each existing file is parsed and, for every term, the value of the first
    text element visited after the term starts is recorded as a brand name.

    :param root: Root directory that the entries of ``brand_files`` are
        relative to.
    :param brand_files: Iterable of relative paths to Fluent files; paths
        that do not exist are skipped. ``None`` is treated as empty.
    :returns: Sorted list of the unique brand names found.
    """

    class MessageExtractor(visitor.Visitor):
        """Visitor recording the first text element following each term."""

        def __init__(self):
            super().__init__()
            self.brands = []
            self.last_message_id = None

        def visit_Term(self, node):
            # Arm the extractor: the next text element visited is recorded.
            self.last_message_id = node.id.name
            self.generic_visit(node)

        def visit_TextElement(self, node):
            if self.last_message_id:
                self.brands.append(node.value)
                # Only the first text element after a term is recorded.
                self.last_message_id = None
            self.generic_visit(node)

    extractor = MessageExtractor()

    for brand_path in brand_files or []:
        brand_file = mozpath.join(root, brand_path)
        if os.path.exists(brand_file):
            with open(brand_file, encoding="utf-8") as f:
                messages = parse(f.read())
                extractor.visit(messages)

    # Sort so that the result (and any message listing several brands) is
    # stable between runs; plain set iteration order is not.
    return sorted(set(extractor.brands))


def lint(paths, config, fix=None, **lintargs):
    """Entry point: lint every Fluent file selected by ``paths``.

    :param paths: Paths to lint, expanded with ``expand_exclusions``.
    :param config: Linter configuration; its optional "brand-files" entry
        lists the Fluent files that brand names are collected from.
    :param fix: Accepted for interface compatibility; not used here.
    :param lintargs: Extra arguments; ``lintargs["root"]`` must hold the
        root directory.
    :returns: List with the results collected from all linted files.
    """
    root = lintargs["root"]
    files = list(expand_exclusions(paths, config, root))
    exclusions = get_exclusions(root)
    # A configuration without "brand-files" simply means no brand names.
    brand_files = config.get("brand-files") or []
    brand_names = get_branding_list(root, brand_files)
    results = []
    for path in files:
        # Use a context manager so the file handle is closed right away.
        with open(path, encoding="utf-8") as f:
            contents = f.read()
        linter = Linter(
            path,
            config,
            exclusions,
            contents,
            get_offsets_and_lines(contents),
            brand_names,
        )
        linter.visit(parse(contents))
        results.extend(linter.results)
    return results