Diffstat:
-rw-r--r--  tools/lint/fluent-lint/__init__.py  470
1 file changed, 470 insertions, 0 deletions
diff --git a/tools/lint/fluent-lint/__init__.py b/tools/lint/fluent-lint/__init__.py
new file mode 100644
index 0000000000..3d0373ea01
--- /dev/null
+++ b/tools/lint/fluent-lint/__init__.py
@@ -0,0 +1,470 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+import bisect
+import os
+import re
+from html.parser import HTMLParser
+
+import mozpack.path as mozpath
+import yaml
+from fluent.syntax import ast, parse, visitor
+from mozlint import result
+from mozlint.pathutils import expand_exclusions
+
+
+class TextElementHTMLParser(HTMLParser):
+ """HTML Parser for TextElement.
+
+ TextElements may contain embedded HTML tags, which can include
+ quotes in attributes. We only want to check the actual text.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.extracted_text = []
+
+ def handle_data(self, data):
+ self.extracted_text.append(data)
+
+
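+# Example (for illustration only):
+#
+#     parser = TextElementHTMLParser()
+#     parser.feed('Open <a data-l10n-name="link" title="It\'s new">settings</a> now')
+#     parser.extracted_text  # ["Open ", "settings", " now"]
+#
+# The quote inside the title attribute is dropped along with the markup, so
+# only the visible text is linted.
+
+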
+class Linter(visitor.Visitor):
+ """Fluent linter implementation.
+
+ This subclasses the Fluent AST visitor. Methods are called corresponding
+ to each type of node in the Fluent AST. It is possible to control
+ whether a node is recursed into by calling the generic_visit method on
+ the superclass.
+
+ See the documentation here:
+ https://www.projectfluent.org/python-fluent/fluent.syntax/stable/usage.html
+ """
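+
+ # A rough sketch of the dispatch model, using a made-up visitor for
+ # illustration: an ast.Message node is routed to visit_Message, and calling
+ # generic_visit keeps walking into the node's children, while returning
+ # without it prunes the subtree.
+ #
+ #     class MessageCounter(visitor.Visitor):
+ #         count = 0
+ #
+ #         def visit_Message(self, node):
+ #             self.count += 1
+ #             self.generic_visit(node)  # keep walking into the children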
+
+ def __init__(
+ self, path, config, exclusions, contents, offsets_and_lines, brand_names=[]
+ ):
+ super().__init__()
+ self.path = path
+ self.config = config
+ self.exclusions = exclusions
+ self.contents = contents
+ self.offsets_and_lines = offsets_and_lines
+
+ self.results = []
+ self.identifier_re = re.compile(r"[a-z0-9-]+")
+ self.apostrophe_re = re.compile(r"\w'")
+ self.incorrect_apostrophe_re = re.compile(r"\w\u2018\w")
+ self.single_quote_re = re.compile(r"'(.+)'")
+ self.double_quote_re = re.compile(r"\".+\"")
+ self.ellipsis_re = re.compile(r"\.\.\.")
+
+ self.brand_names = brand_names
+ self.minimum_id_length = 9
+ # Identifier of the last visited message; reset to None for terms.
+ self.last_message_id = None
+
+ self.state = {
+ # The resource comment should be at the top of the file, after the license.
+ "node_can_be_resource_comment": True,
+ # Group comments must be followed by a message. Two group comments are not
+ # allowed in a row.
+ "can_have_group_comment": True,
+ # Comment bound to the current message
+ "comment": "",
+ # The current group comment
+ "group_comment": "",
+ # Variables in the current message
+ "variables": [],
+ }
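+
+ # For reference, the layout these flags enforce looks roughly like this
+ # (illustrative .ftl excerpt):
+ #
+ #     # This Source Code Form ... (license header)
+ #
+ #     ### Resource comment describing the whole file.
+ #
+ #     ## Group comment for the following messages.
+ #
+ #     example-message-id = Example value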
+
+ # Set this to true to debug print the root node's json. This is useful for
+ # writing new lint rules, or debugging existing ones.
+ self.debug_print_json = False
+
+ def generic_visit(self, node):
+ node_name = type(node).__name__
+ self.state["node_can_be_resource_comment"] = self.state[
+ "node_can_be_resource_comment"
+ ] and (
+ # This is the root node.
+ node_name == "Resource"
+ # Empty space is allowed.
+ or node_name == "Span"
+ # Comments are allowed
+ or node_name == "Comment"
+ )
+
+ if self.debug_print_json:
+ import json
+
+ print(json.dumps(node.to_json(), indent=2))
+ # Only debug print the root node.
+ self.debug_print_json = False
+
+ super(Linter, self).generic_visit(node)
+
+ def visit_Attribute(self, node):
+ # Only visit values for Attribute nodes; the identifier comes from the DOM.
+ super().generic_visit(node.value)
+
+ def visit_FunctionReference(self, node):
+ # We don't recurse into function references, the identifiers there are
+ # allowed to be free form.
+ pass
+
+ def visit_Message(self, node):
+ # There must be at least one message or term between group comments.
+ self.state["can_have_group_comment"] = True
+ self.last_message_id = node.id.name
+
+ super().generic_visit(node)
+
+ # Check if variables are referenced in comments
+ if self.state["variables"]:
+ comments = self.state["comment"] + self.state["group_comment"]
+ missing_references = [
+ v for v in self.state["variables"] if f"${v}" not in comments
+ ]
+ if missing_references:
+ self.add_error(
+ node,
+ "VC01",
+ "Messages including variables should have a comment "
+ "explaining what will replace the variable. "
+ "Missing references: "
+ + ", ".join([f"${m}" for m in missing_references]),
+ )
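+
+ # For example (illustrative), a compliant message documents every variable
+ # in its comment:
+ #
+ #     # Variables:
+ #     #   $count (Number) - Number of open tabs.
+ #     open-tabs-label = { $count } open tabs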
+
+ # Reset current comment and variable references after reading the
+ # message.
+ self.state["comment"] = ""
+ self.state["variables"] = []
+
+ def visit_Term(self, node):
+ # There must be at least one message or term between group comments.
+ self.state["can_have_group_comment"] = True
+ self.last_message_id = None
+
+ super().generic_visit(node)
+
+ # Reset current comment and variable references after reading the term.
+ self.state["comment"] = ""
+ self.state["variables"] = []
+
+ def visit_MessageReference(self, node):
+ # We don't recurse into message references, the identifiers are either
+ # checked elsewhere or are attributes and come from DOM.
+ pass
+
+ def visit_Identifier(self, node):
+ if (
+ self.path not in self.exclusions["ID01"]["files"]
+ and node.name not in self.exclusions["ID01"]["messages"]
+ and not self.identifier_re.fullmatch(node.name)
+ ):
+ self.add_error(
+ node,
+ "ID01",
+ "Identifiers may only contain lowercase characters and -",
+ )
+ if (
+ len(node.name) < self.minimum_id_length
+ and self.path not in self.exclusions["ID02"]["files"]
+ and node.name not in self.exclusions["ID02"]["messages"]
+ ):
+ self.add_error(
+ node,
+ "ID02",
+ f"Identifiers must be at least {self.minimum_id_length} characters long",
+ )
+
+ def visit_TextElement(self, node):
+ parser = TextElementHTMLParser()
+ parser.feed(node.value)
+ for text in parser.extracted_text:
+ # To check for apostrophes, first remove pairs of straight quotes
+ # used as delimiters.
+ cleaned_str = re.sub(self.single_quote_re, r"\1", text)
+ if self.apostrophe_re.search(cleaned_str):
+ self.add_error(
+ node,
+ "TE01",
+ "Strings with apostrophes should use foo\u2019s instead of foo's.",
+ )
+ if self.incorrect_apostrophe_re.search(text):
+ self.add_error(
+ node,
+ "TE02",
+ "Strings with apostrophes should use foo\u2019s instead of foo\u2018s.",
+ )
+ if self.single_quote_re.search(text):
+ self.add_error(
+ node,
+ "TE03",
+ "Single-quoted strings should use Unicode \u2018foo\u2019 instead of 'foo'.",
+ )
+ if self.double_quote_re.search(text):
+ self.add_error(
+ node,
+ "TE04",
+ 'Double-quoted strings should use Unicode \u201cfoo\u201d instead of "foo".',
+ )
+ if self.ellipsis_re.search(text):
+ self.add_error(
+ node,
+ "TE05",
+ "Strings with an ellipsis should use the Unicode \u2026 character"
+ " instead of three periods",
+ )
+
+ # If part of a message, check for brand names
+ if (
+ self.last_message_id is not None
+ and self.path not in self.exclusions["CO01"]["files"]
+ and self.last_message_id not in self.exclusions["CO01"]["messages"]
+ ):
+ found_brands = []
+ for brand in self.brand_names:
+ if brand in text:
+ found_brands.append(brand)
+ if found_brands:
+ self.add_error(
+ node,
+ "CO01",
+ "Strings should use the corresponding terms instead of"
+ f" hard-coded brand names ({', '.join(found_brands)})",
+ )
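+
+ # Illustrative examples, assuming "Firefox" is one of the extracted brand
+ # names:
+ #
+ #     example-label = Download 'Firefox'...              <- TE03, TE05, CO01
+ #     example-label = Download { -brand-short-name }…    <- preferred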
+
+ def visit_ResourceComment(self, node):
+ # This node is a comment starting with "###".
+ if not self.state["node_can_be_resource_comment"]:
+ self.add_error(
+ node,
+ "RC01",
+ "Resource comments (###) should be placed at the top of the file, just "
+ "after the license header. There should only be one resource comment "
+ "per file.",
+ )
+ return
+
+ lines_after = get_newlines_count_after(node.span, self.contents)
+ lines_before = get_newlines_count_before(node.span, self.contents)
+
+ if node.span.end == len(self.contents) - 1:
+ # This file only contains a resource comment.
+ return
+
+ if lines_after != 2:
+ self.add_error(
+ node,
+ "RC02",
+ "Resource comments (###) should be followed by one empty line.",
+ )
+ return
+
+ if lines_before != 2:
+ self.add_error(
+ node,
+ "RC03",
+ "Resource comments (###) should have one empty line above them.",
+ )
+ return
+
+ def visit_SelectExpression(self, node):
+ # We only want to visit the variant values, the identifiers in selectors
+ # and keys are allowed to be free form.
+ for variant in node.variants:
+ super().generic_visit(variant.value)
+
+ # Store the variable used for the SelectExpression, excluding functions
+ # like PLATFORM()
+ if (
+ isinstance(node.selector, ast.VariableReference)
+ and node.selector.id.name not in self.state["variables"]
+ ):
+ self.state["variables"].append(node.selector.id.name)
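+
+ # For example (illustrative), in
+ #
+ #     download-status = { $state ->
+ #         [paused] Paused
+ #        *[active] Downloading
+ #     }
+ #
+ # only "Paused" and "Downloading" are linted, and $state is recorded so that
+ # VC01 can require it to be documented in a comment.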
+
+ def visit_Comment(self, node):
+ # This node is a comment starting with "#".
+
+ # Store the comment
+ self.state["comment"] = node.content
+
+ def visit_GroupComment(self, node):
+ # This node is a comment starting with "##".
+
+ # Store the group comment
+ self.state["group_comment"] = node.content
+
+ if not self.state["can_have_group_comment"]:
+ self.add_error(
+ node,
+ "GC04",
+ "Group comments (##) must be followed by at least one message "
+ "or term. Make sure that a single group comment with multiple "
+ "paragraphs is not separated by whitespace, as it will be "
+ "interpreted as two different comments.",
+ )
+ return
+
+ self.state["can_have_group_comment"] = False
+
+ lines_after = get_newlines_count_after(node.span, self.contents)
+ lines_before = get_newlines_count_before(node.span, self.contents)
+
+ if node.span.end == len(self.contents) - 1:
+ # The group comment is the last thing in the file.
+
+ if node.content == "":
+ # Empty comments are allowed at the end of the file.
+ return
+
+ self.add_error(
+ node,
+ "GC01",
+ "Group comments (##) should not be at the end of the file, they should "
+ "always be above a message. Only an empty group comment is allowed at "
+ "the end of a file.",
+ )
+ return
+
+ if lines_after != 2:
+ self.add_error(
+ node,
+ "GC02",
+ "Group comments (##) should be followed by one empty line.",
+ )
+ return
+
+ if lines_before != 2:
+ self.add_error(
+ node,
+ "GC03",
+ "Group comments (##) should have an empty line before them.",
+ )
+ return
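+
+ # A multi-paragraph group comment must keep the "##" prefix on the blank
+ # line between paragraphs, e.g. (illustrative):
+ #
+ #     ## First paragraph of the group comment.
+ #     ##
+ #     ## Second paragraph of the same comment.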
+
+ def visit_VariableReference(self, node):
+ # Identifiers are allowed to be free form, but need to store them
+ # for comment checks.
+
+ if node.id.name not in self.state["variables"]:
+ self.state["variables"].append(node.id.name)
+
+ def add_error(self, node, rule, msg):
+ (col, line) = self.span_to_line_and_col(node.span)
+ res = {
+ "path": self.path,
+ "lineno": line,
+ "column": col,
+ "rule": rule,
+ "message": msg,
+ }
+ self.results.append(result.from_config(self.config, **res))
+
+ def span_to_line_and_col(self, span):
+ i = bisect.bisect_left(self.offsets_and_lines, (span.start, 0))
+ if i > 0:
+ col = span.start - self.offsets_and_lines[i - 1][0]
+ else:
+ col = 1 + span.start
+ return (col, self.offsets_and_lines[i][1])
+
+
+def get_offsets_and_lines(contents):
+ """Return a list consisting of tuples of (offset, line).
+
+ The Fluent AST contains spans of start and end offsets in the file.
+ This function returns a list of offsets and line numbers so that errors
+ can be reported using line and column.
+ """
+ line = 1
+ result = []
+ for m in re.finditer(r"\n", contents):
+ result.append((m.start(), line))
+ line += 1
+ return result
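+
+
+# A worked example (illustrative): for contents "abc\ndef\n" the newlines sit at
+# offsets 3 and 7, so this returns [(3, 1), (7, 2)]. A span starting at offset 4
+# (the "d") then bisects to index 1 in span_to_line_and_col and is reported as
+# line 2, column 4 - 3 = 1.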
+
+
+def get_newlines_count_after(span, contents):
+ # Determine the number of newlines.
+ count = 0
+ for i in range(span.end, len(contents)):
+ assert contents[i] != "\r", "This linter does not handle \\r characters."
+ if contents[i] != "\n":
+ break
+ count += 1
+
+ return count
+
+
+def get_newlines_count_before(span, contents):
+ # Determine the range of newline characters.
+ count = 0
+ for i in range(span.start - 1, 0, -1):
+ assert contents[i] != "\r", "This linter does not handle \\r characters."
+ if contents[i] != "\n":
+ break
+ count += 1
+
+ return count
+
+
+def get_exclusions(root):
+ with open(
+ mozpath.join(root, "tools", "lint", "fluent-lint", "exclusions.yml")
+ ) as f:
+ exclusions = list(yaml.safe_load_all(f))[0]
+ for error_type in exclusions:
+ exclusions[error_type]["files"] = set(
+ [mozpath.join(root, x) for x in exclusions[error_type]["files"]]
+ )
+ return exclusions
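+
+
+# The exclusions file is expected to map each rule code to "files" and
+# "messages" lists, roughly like this (illustrative excerpt):
+#
+#     ID01:
+#       files: []
+#       messages:
+#         - some-historic-id
+#     ID02:
+#       files: []
+#       messages: []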
+
+
+def get_branding_list(root, brand_files):
+ class MessageExtractor(visitor.Visitor):
+ def __init__(self):
+ self.brands = []
+ self.last_message_id = None
+
+ def visit_Term(self, node):
+ self.last_message_id = node.id.name
+ self.generic_visit(node)
+
+ def visit_TextElement(self, node):
+ if self.last_message_id:
+ self.brands += [node.value]
+ self.last_message_id = None
+ self.generic_visit(node)
+
+ extractor = MessageExtractor()
+
+ for brand_path in brand_files:
+ brand_file = mozpath.join(root, brand_path)
+ if os.path.exists(brand_file):
+ with open(brand_file, encoding="utf-8") as f:
+ messages = parse(f.read())
+ extractor.visit(messages)
+
+ return list(set(extractor.brands))
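+
+
+# For illustration: a brand file containing the terms
+#
+#     -brand-short-name = Firefox
+#     -vendor-short-name = Mozilla
+#
+# yields ["Firefox", "Mozilla"] (in arbitrary order); those strings are what
+# CO01 flags when they appear hard-coded in a message.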
+
+
+def lint(paths, config, fix=None, **lintargs):
+ root = lintargs["root"]
+ files = list(expand_exclusions(paths, config, root))
+ exclusions = get_exclusions(root)
+ brand_files = config.get("brand-files", [])
+ brand_names = get_branding_list(root, brand_files)
+ results = []
+ for path in files:
+ with open(path, "r", encoding="utf-8") as f:
+ contents = f.read()
+ linter = Linter(
+ path,
+ config,
+ exclusions,
+ contents,
+ get_offsets_and_lines(contents),
+ brand_names,
+ )
+ linter.visit(parse(contents))
+ results.extend(linter.results)
+ return results
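+
+
+# The surrounding mozlint configuration supplies the "brand-files" key read
+# above; an illustrative excerpt (the path is only an example):
+#
+#     brand-files:
+#       - browser/branding/official/locales/en-US/brand.ftl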