1 files changed, 470 insertions, 0 deletions
diff --git a/tools/lint/fluent-lint/__init__.py b/tools/lint/fluent-lint/__init__.py
new file mode 100644
index 0000000000..3d0373ea01
--- /dev/null
+++ b/tools/lint/fluent-lint/__init__.py
@@ -0,0 +1,470 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+import bisect
+import os
+import re
+from html.parser import HTMLParser
+
+import mozpack.path as mozpath
+import yaml
+from fluent.syntax import ast, parse, visitor
+from mozlint import result
+from mozlint.pathutils import expand_exclusions
+
+
+class TextElementHTMLParser(HTMLParser):
+    """HTML Parser for TextElement.
+
+    TextElements may contain embedded html tags, which can include
+    quotes in attributes. We only want to check the actual text.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.extracted_text = []
+
+    def handle_data(self, data):
+        self.extracted_text.append(data)
+
+
+class Linter(visitor.Visitor):
+    """Fluent linter implementation.
+
+    This subclasses the Fluent AST visitor. Methods are called corresponding
+    to each type of node in the Fluent AST. It is possible to control
+    whether a node is recursed into by calling the generic_visit method on
+    the superclass.
+
+    See the documentation here:
+    https://www.projectfluent.org/python-fluent/fluent.syntax/stable/usage.html
+    """
+
+    def __init__(
+        self, path, config, exclusions, contents, offsets_and_lines, brand_names=[]
+    ):
+        super().__init__()
+        self.path = path
+        self.config = config
+        self.exclusions = exclusions
+        self.contents = contents
+        self.offsets_and_lines = offsets_and_lines
+
+        self.results = []
+        self.identifier_re = re.compile(r"[a-z0-9-]+")
+        self.apostrophe_re = re.compile(r"\w'")
+        self.incorrect_apostrophe_re = re.compile(r"\w\u2018\w")
+        self.single_quote_re = re.compile(r"'(.+)'")
+        self.double_quote_re = re.compile(r"\".+\"")
+        self.ellipsis_re = re.compile(r"\.\.\.")
+
+        self.brand_names = brand_names
+        self.minimum_id_length = 9
+
+        self.state = {
+            # The resource comment should be at the top of the page after the license.
+            "node_can_be_resource_comment": True,
+            # Group comments must be followed by a message. Two group comments are not
+            # allowed in a row.
+            "can_have_group_comment": True,
+            # Comment bound to the current message
+            "comment": "",
+            # The current group comment
+            "group_comment": "",
+            # Variables in the current message
+            "variables": [],
+        }
+
+        # Set this to true to debug print the root node's json. This is useful for
+        # writing new lint rules, or debugging existing ones.
+        self.debug_print_json = False
+
+    def generic_visit(self, node):
+        node_name = type(node).__name__
+        self.state["node_can_be_resource_comment"] = self.state[
+            "node_can_be_resource_comment"
+        ] and (
+            # This is the root node.
+            node_name == "Resource"
+            # Empty space is allowed.
+            or node_name == "Span"
+            # Comments are allowed
+            or node_name == "Comment"
+        )
+
+        if self.debug_print_json:
+            import json
+
+            print(json.dumps(node.to_json(), indent=2))
+            # Only debug print the root node.
+            self.debug_print_json = False
+
+        super(Linter, self).generic_visit(node)
+
+    def visit_Attribute(self, node):
+        # Only visit values for Attribute nodes, the identifier comes from dom.
+        super().generic_visit(node.value)
+
+    def visit_FunctionReference(self, node):
+        # We don't recurse into function references, the identifiers there are
+        # allowed to be free form.
+        pass
+
+    def visit_Message(self, node):
+        # There must be at least one message or term between group comments.
+        self.state["can_have_group_comment"] = True
+        self.last_message_id = node.id.name
+
+        super().generic_visit(node)
+
+        # Check if variables are referenced in comments
+        if self.state["variables"]:
+            comments = self.state["comment"] + self.state["group_comment"]
+            missing_references = [
+                v for v in self.state["variables"] if f"${v}" not in comments
+            ]
+            if missing_references:
+                self.add_error(
+                    node,
+                    "VC01",
+                    "Messages including variables should have a comment "
+                    "explaining what will replace the variable. "
+                    "Missing references: "
+                    + ", ".join([f"${m}" for m in missing_references]),
+                )
+
+        # Reset current comment and variable references after reading the
+        # message.
+        self.state["comment"] = ""
+        self.state["variables"] = []
+
+    def visit_Term(self, node):
+        # There must be at least one message or term between group comments.
+        self.state["can_have_group_comment"] = True
+        self.last_message_id = None
+
+        super().generic_visit(node)
+
+        # Reset current comment and variable references after reading the term.
+        self.state["comment"] = ""
+        self.state["variables"] = []
+
+    def visit_MessageReference(self, node):
+        # We don't recurse into message references, the identifiers are either
+        # checked elsewhere or are attributes and come from DOM.
+        pass
+
+    def visit_Identifier(self, node):
+        if (
+            self.path not in self.exclusions["ID01"]["files"]
+            and node.name not in self.exclusions["ID01"]["messages"]
+            and not self.identifier_re.fullmatch(node.name)
+        ):
+            self.add_error(
+                node,
+                "ID01",
+                "Identifiers may only contain lowercase characters and -",
+            )
+        if (
+            len(node.name) < self.minimum_id_length
+            and self.path not in self.exclusions["ID02"]["files"]
+            and node.name not in self.exclusions["ID02"]["messages"]
+        ):
+            self.add_error(
+                node,
+                "ID02",
+                f"Identifiers must be at least {self.minimum_id_length} characters long",
+            )
+
+    def visit_TextElement(self, node):
+        parser = TextElementHTMLParser()
+        parser.feed(node.value)
+        for text in parser.extracted_text:
+            # To check for apostrophes, first remove pairs of straight quotes
+            # used as delimiters.
+            cleaned_str = re.sub(self.single_quote_re, "\1", node.value)
+            if self.apostrophe_re.search(cleaned_str):
+                self.add_error(
+                    node,
+                    "TE01",
+                    "Strings with apostrophes should use foo\u2019s instead of foo's.",
+                )
+            if self.incorrect_apostrophe_re.search(text):
+                self.add_error(
+                    node,
+                    "TE02",
+                    "Strings with apostrophes should use foo\u2019s instead of foo\u2018s.",
+                )
+            if self.single_quote_re.search(text):
+                self.add_error(
+                    node,
+                    "TE03",
+                    "Single-quoted strings should use Unicode \u2018foo\u2019 instead of 'foo'.",
+                )
+            if self.double_quote_re.search(text):
+                self.add_error(
+                    node,
+                    "TE04",
+                    'Double-quoted strings should use Unicode \u201cfoo\u201d instead of "foo".',
+                )
+            if self.ellipsis_re.search(text):
+                self.add_error(
+                    node,
+                    "TE05",
+                    "Strings with an ellipsis should use the Unicode \u2026 character"
+                    " instead of three periods",
+                )
+
+            # If part of a message, check for brand names
+            if (
+                self.last_message_id is not None
+                and self.path not in self.exclusions["CO01"]["files"]
+                and self.last_message_id not in self.exclusions["CO01"]["messages"]
+            ):
+                found_brands = []
+                for brand in self.brand_names:
+                    if brand in text:
+                        found_brands.append(brand)
+                if found_brands:
+                    self.add_error(
+                        node,
+                        "CO01",
+                        "Strings should use the corresponding terms instead of"
+                        f" hard-coded brand names ({', '.join(found_brands)})",
+                    )
+
+    def visit_ResourceComment(self, node):
+        # This node is a comment with: "###"
+        if not self.state["node_can_be_resource_comment"]:
+            self.add_error(
+                node,
+                "RC01",
+                "Resource comments (###) should be placed at the top of the file, just "
+                "after the license header. There should only be one resource comment "
+                "per file.",
+            )
+            return
+
+        lines_after = get_newlines_count_after(node.span, self.contents)
+        lines_before = get_newlines_count_before(node.span, self.contents)
+
+        if node.span.end == len(self.contents) - 1:
+            # This file only contains a resource comment.
+            return
+
+        if lines_after != 2:
+            self.add_error(
+                node,
+                "RC02",
+                "Resource comments (###) should be followed by one empty line.",
+            )
+            return
+
+        if lines_before != 2:
+            self.add_error(
+                node,
+                "RC03",
+                "Resource comments (###) should have one empty line above them.",
+            )
+            return
+
+    def visit_SelectExpression(self, node):
+        # We only want to visit the variant values, the identifiers in selectors
+        # and keys are allowed to be free form.
+        for variant in node.variants:
+            super().generic_visit(variant.value)
+
+        # Store the variable used for the SelectExpression, excluding functions
+        # like PLATFORM()
+        if (
+            type(node.selector) == ast.VariableReference
+            and node.selector.id.name not in self.state["variables"]
+        ):
+            self.state["variables"].append(node.selector.id.name)
+
+    def visit_Comment(self, node):
+        # This node is a comment with: "#"
+
+        # Store the comment
+        self.state["comment"] = node.content
+
+    def visit_GroupComment(self, node):
+        # This node is a comment with: "##"
+
+        # Store the group comment
+        self.state["group_comment"] = node.content
+
+        if not self.state["can_have_group_comment"]:
+            self.add_error(
+                node,
+                "GC04",
+                "Group comments (##) must be followed by at least one message "
+                "or term. Make sure that a single group comment with multiple "
+                "paragraphs is not separated by whitespace, as it will be "
+                "interpreted as two different comments.",
+            )
+            return
+
+        self.state["can_have_group_comment"] = False
+
+        lines_after = get_newlines_count_after(node.span, self.contents)
+        lines_before = get_newlines_count_before(node.span, self.contents)
+
+        if node.span.end == len(self.contents) - 1:
+            # The group comment is the last thing in the file.
+
+            if node.content == "":
+                # Empty comments are allowed at the end of the file.
+                return
+
+            self.add_error(
+                node,
+                "GC01",
+                "Group comments (##) should not be at the end of the file, they should "
+                "always be above a message. Only an empty group comment is allowed at "
+                "the end of a file.",
+            )
+            return
+
+        if lines_after != 2:
+            self.add_error(
+                node,
+                "GC02",
+                "Group comments (##) should be followed by one empty line.",
+            )
+            return
+
+        if lines_before != 2:
+            self.add_error(
+                node,
+                "GC03",
+                "Group comments (##) should have an empty line before them.",
+            )
+            return
+
+    def visit_VariableReference(self, node):
+        # Identifiers are allowed to be free form, but need to store them
+        # for comment checks.
+
+        if node.id.name not in self.state["variables"]:
+            self.state["variables"].append(node.id.name)
+
+    def add_error(self, node, rule, msg):
+        (col, line) = self.span_to_line_and_col(node.span)
+        res = {
+            "path": self.path,
+            "lineno": line,
+            "column": col,
+            "rule": rule,
+            "message": msg,
+        }
+        self.results.append(result.from_config(self.config, **res))
+
+    def span_to_line_and_col(self, span):
+        i = bisect.bisect_left(self.offsets_and_lines, (span.start, 0))
+        if i > 0:
+            col = span.start - self.offsets_and_lines[i - 1][0]
+        else:
+            col = 1 + span.start
+        return (col, self.offsets_and_lines[i][1])
+
+
+def get_offsets_and_lines(contents):
+    """Return a list consisting of tuples of (offset, line).
+
+    The Fluent AST contains spans of start and end offsets in the file.
+    This function returns a list of offsets and line numbers so that errors
+    can be reported using line and column.
+    """
+    line = 1
+    result = []
+    for m in re.finditer(r"\n", contents):
+        result.append((m.start(), line))
+        line += 1
+    return result
+
+
+def get_newlines_count_after(span, contents):
+    # Determine the number of newlines.
+    count = 0
+    for i in range(span.end, len(contents)):
+        assert contents[i] != "\r", "This linter does not handle \\r characters."
+        if contents[i] != "\n":
+            break
+        count += 1
+
+    return count
+
+
+def get_newlines_count_before(span, contents):
+    # Determine the range of newline characters.
+    count = 0
+    for i in range(span.start - 1, 0, -1):
+        assert contents[i] != "\r", "This linter does not handle \\r characters."
+        if contents[i] != "\n":
+            break
+        count += 1
+
+    return count
+
+
+def get_exclusions(root):
+    with open(
+        mozpath.join(root, "tools", "lint", "fluent-lint", "exclusions.yml")
+    ) as f:
+        exclusions = list(yaml.safe_load_all(f))[0]
+        for error_type in exclusions:
+            exclusions[error_type]["files"] = set(
+                [mozpath.join(root, x) for x in exclusions[error_type]["files"]]
+            )
+        return exclusions
+
+
+def get_branding_list(root, brand_files):
+    class MessageExtractor(visitor.Visitor):
+        def __init__(self):
+            self.brands = []
+            self.last_message_id = None
+
+        def visit_Term(self, node):
+            self.last_message_id = node.id.name
+            self.generic_visit(node)
+
+        def visit_TextElement(self, node):
+            if self.last_message_id:
+                self.brands += [node.value]
+                self.last_message_id = None
+            self.generic_visit(node)
+
+    extractor = MessageExtractor()
+
+    for brand_path in brand_files:
+        brand_file = mozpath.join(root, brand_path)
+        if os.path.exists(brand_file):
+            with open(brand_file, encoding="utf-8") as f:
+                messages = parse(f.read())
+                extractor.visit(messages)
+
+    return list(set(extractor.brands))
+
+
+def lint(paths, config, fix=None, **lintargs):
+    root = lintargs["root"]
+    files = list(expand_exclusions(paths, config, root))
+    exclusions = get_exclusions(root)
+    brand_files = config.get("brand-files")
+    brand_names = get_branding_list(root, brand_files)
+    results = []
+    for path in files:
+        contents = open(path, "r", encoding="utf-8").read()
+        linter = Linter(
+            path,
+            config,
+            exclusions,
+            contents,
+            get_offsets_and_lines(contents),
+            brand_names,
+        )
+        linter.visit(parse(contents))
+        results.extend(linter.results)
+    return results