diff options
Diffstat (limited to '')
-rw-r--r-- | tools/lint/fluent-lint/__init__.py | 470 |
1 files changed, 470 insertions, 0 deletions
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import bisect
import os
import re
from html.parser import HTMLParser

import mozpack.path as mozpath
import yaml
from fluent.syntax import ast, parse, visitor
from mozlint import result
from mozlint.pathutils import expand_exclusions


class TextElementHTMLParser(HTMLParser):
    """HTML Parser for TextElement.

    TextElements may contain embedded html tags, which can include
    quotes in attributes. We only want to check the actual text.
    """

    def __init__(self):
        super().__init__()
        # Text chunks found between tags, in document order.
        self.extracted_text = []

    def handle_data(self, data):
        """Collect each run of character data found outside of tags."""
        self.extracted_text.append(data)


class Linter(visitor.Visitor):
    """Fluent linter implementation.

    This subclasses the Fluent AST visitor. Methods are called corresponding
    to each type of node in the Fluent AST. It is possible to control
    whether a node is recursed into by calling the generic_visit method on
    the superclass.

    See the documentation here:
    https://www.projectfluent.org/python-fluent/fluent.syntax/stable/usage.html
    """

    def __init__(
        self, path, config, exclusions, contents, offsets_and_lines, brand_names=None
    ):
        """Set up the linter for a single file.

        path: path of the file being linted, used in results and exclusions.
        config: linter configuration forwarded to ``result.from_config``.
        exclusions: mapping of rule id to ``{"files": ..., "messages": ...}``.
        contents: full text of the file.
        offsets_and_lines: list of ``(newline offset, line number)`` tuples
            as produced by ``get_offsets_and_lines``.
        brand_names: optional list of hard-coded brand names to flag.
        """
        super().__init__()
        self.path = path
        self.config = config
        self.exclusions = exclusions
        self.contents = contents
        self.offsets_and_lines = offsets_and_lines

        self.results = []
        self.identifier_re = re.compile(r"[a-z0-9-]+")
        self.apostrophe_re = re.compile(r"\w'")
        self.incorrect_apostrophe_re = re.compile(r"\w\u2018\w")
        self.single_quote_re = re.compile(r"'(.+)'")
        self.double_quote_re = re.compile(r"\".+\"")
        self.ellipsis_re = re.compile(r"\.\.\.")

        # Avoid a shared mutable default argument.
        self.brand_names = [] if brand_names is None else brand_names
        self.minimum_id_length = 9

        # Id of the message currently being visited; None while inside a term
        # or before any message has been seen.
        self.last_message_id = None

        self.state = {
            # The resource comment should be at the top of the page after the license.
            "node_can_be_resource_comment": True,
            # Group comments must be followed by a message. Two group comments are not
            # allowed in a row.
            "can_have_group_comment": True,
            # Comment bound to the current message
            "comment": "",
            # The current group comment
            "group_comment": "",
            # Variables in the current message
            "variables": [],
        }

        # Set this to true to debug print the root node's json. This is useful for
        # writing new lint rules, or debugging existing ones.
        self.debug_print_json = False

    def generic_visit(self, node):
        """Track whether a resource comment is still allowed, then recurse."""
        node_name = type(node).__name__
        self.state["node_can_be_resource_comment"] = self.state[
            "node_can_be_resource_comment"
        ] and (
            # This is the root node.
            node_name == "Resource"
            # Empty space is allowed.
            or node_name == "Span"
            # Comments are allowed
            or node_name == "Comment"
        )

        if self.debug_print_json:
            import json

            print(json.dumps(node.to_json(), indent=2))
            # Only debug print the root node.
            self.debug_print_json = False

        super().generic_visit(node)

    def visit_Attribute(self, node):
        # Only visit values for Attribute nodes, the identifier comes from dom.
        super().generic_visit(node.value)

    def visit_FunctionReference(self, node):
        # We don't recurse into function references, the identifiers there are
        # allowed to be free form.
        pass

    def visit_Message(self, node):
        """Visit a message and check that its variables are documented (VC01)."""
        # There must be at least one message or term between group comments.
        self.state["can_have_group_comment"] = True
        self.last_message_id = node.id.name

        super().generic_visit(node)

        # Check if variables are referenced in comments
        if self.state["variables"]:
            comments = self.state["comment"] + self.state["group_comment"]
            missing_references = [
                v for v in self.state["variables"] if f"${v}" not in comments
            ]
            if missing_references:
                self.add_error(
                    node,
                    "VC01",
                    "Messages including variables should have a comment "
                    "explaining what will replace the variable. "
                    "Missing references: "
                    + ", ".join([f"${m}" for m in missing_references]),
                )

        # Reset current comment and variable references after reading the
        # message.
        self.state["comment"] = ""
        self.state["variables"] = []

    def visit_Term(self, node):
        """Visit a term; terms are exempt from the brand-name check."""
        # There must be at least one message or term between group comments.
        self.state["can_have_group_comment"] = True
        self.last_message_id = None

        super().generic_visit(node)

        # Reset current comment and variable references after reading the term.
        self.state["comment"] = ""
        self.state["variables"] = []

    def visit_MessageReference(self, node):
        # We don't recurse into message references, the identifiers are either
        # checked elsewhere or are attributes and come from DOM.
        pass

    def visit_Identifier(self, node):
        """Check identifier characters (ID01) and minimum length (ID02)."""
        if (
            self.path not in self.exclusions["ID01"]["files"]
            and node.name not in self.exclusions["ID01"]["messages"]
            and not self.identifier_re.fullmatch(node.name)
        ):
            self.add_error(
                node,
                "ID01",
                "Identifiers may only contain lowercase characters and -",
            )
        if (
            len(node.name) < self.minimum_id_length
            and self.path not in self.exclusions["ID02"]["files"]
            and node.name not in self.exclusions["ID02"]["messages"]
        ):
            self.add_error(
                node,
                "ID02",
                f"Identifiers must be at least {self.minimum_id_length} characters long",
            )

    def visit_TextElement(self, node):
        """Check the visible text of a TextElement (TE01-TE05, CO01)."""
        parser = TextElementHTMLParser()
        parser.feed(node.value)
        for text in parser.extracted_text:
            # To check for apostrophes, first remove pairs of straight quotes
            # used as delimiters. The replacement must be a raw string so that
            # \1 is a backreference to the quoted content and not the control
            # character \x01, and the check runs on the extracted text so that
            # quotes inside HTML attributes are ignored.
            cleaned_str = self.single_quote_re.sub(r"\1", text)
            if self.apostrophe_re.search(cleaned_str):
                self.add_error(
                    node,
                    "TE01",
                    "Strings with apostrophes should use foo\u2019s instead of foo's.",
                )
            if self.incorrect_apostrophe_re.search(text):
                self.add_error(
                    node,
                    "TE02",
                    "Strings with apostrophes should use foo\u2019s instead of foo\u2018s.",
                )
            if self.single_quote_re.search(text):
                self.add_error(
                    node,
                    "TE03",
                    "Single-quoted strings should use Unicode \u2018foo\u2019 instead of 'foo'.",
                )
            if self.double_quote_re.search(text):
                self.add_error(
                    node,
                    "TE04",
                    'Double-quoted strings should use Unicode \u201cfoo\u201d instead of "foo".',
                )
            if self.ellipsis_re.search(text):
                self.add_error(
                    node,
                    "TE05",
                    "Strings with an ellipsis should use the Unicode \u2026 character"
                    " instead of three periods",
                )

            # If part of a message, check for brand names
            if (
                self.last_message_id is not None
                and self.path not in self.exclusions["CO01"]["files"]
                and self.last_message_id not in self.exclusions["CO01"]["messages"]
            ):
                found_brands = [brand for brand in self.brand_names if brand in text]
                if found_brands:
                    self.add_error(
                        node,
                        "CO01",
                        "Strings should use the corresponding terms instead of"
                        f" hard-coded brand names ({', '.join(found_brands)})",
                    )

    def visit_ResourceComment(self, node):
        """Check placement and spacing of a resource comment (RC01-RC03)."""
        # This node is a comment with: "###"
        if not self.state["node_can_be_resource_comment"]:
            self.add_error(
                node,
                "RC01",
                "Resource comments (###) should be placed at the top of the file, just "
                "after the license header. There should only be one resource comment "
                "per file.",
            )
            return

        lines_after = get_newlines_count_after(node.span, self.contents)
        lines_before = get_newlines_count_before(node.span, self.contents)

        if node.span.end == len(self.contents) - 1:
            # This file only contains a resource comment.
            return

        if lines_after != 2:
            self.add_error(
                node,
                "RC02",
                "Resource comments (###) should be followed by one empty line.",
            )
            return

        if lines_before != 2:
            self.add_error(
                node,
                "RC03",
                "Resource comments (###) should have one empty line above them.",
            )
            return

    def visit_SelectExpression(self, node):
        """Visit variant values and record the selector variable, if any."""
        # We only want to visit the variant values, the identifiers in selectors
        # and keys are allowed to be free form.
        for variant in node.variants:
            super().generic_visit(variant.value)

        # Store the variable used for the SelectExpression, excluding functions
        # like PLATFORM()
        if (
            isinstance(node.selector, ast.VariableReference)
            and node.selector.id.name not in self.state["variables"]
        ):
            self.state["variables"].append(node.selector.id.name)

    def visit_Comment(self, node):
        # This node is a comment with: "#"

        # Store the comment
        self.state["comment"] = node.content

    def visit_GroupComment(self, node):
        """Check ordering and spacing of a group comment (GC01-GC04)."""
        # This node is a comment with: "##"

        # Store the group comment
        self.state["group_comment"] = node.content

        if not self.state["can_have_group_comment"]:
            self.add_error(
                node,
                "GC04",
                "Group comments (##) must be followed by at least one message "
                "or term. Make sure that a single group comment with multiple "
                "paragraphs is not separated by whitespace, as it will be "
                "interpreted as two different comments.",
            )
            return

        self.state["can_have_group_comment"] = False

        lines_after = get_newlines_count_after(node.span, self.contents)
        lines_before = get_newlines_count_before(node.span, self.contents)

        if node.span.end == len(self.contents) - 1:
            # The group comment is the last thing in the file.

            if node.content == "":
                # Empty comments are allowed at the end of the file.
                return

            self.add_error(
                node,
                "GC01",
                "Group comments (##) should not be at the end of the file, they should "
                "always be above a message. Only an empty group comment is allowed at "
                "the end of a file.",
            )
            return

        if lines_after != 2:
            self.add_error(
                node,
                "GC02",
                "Group comments (##) should be followed by one empty line.",
            )
            return

        if lines_before != 2:
            self.add_error(
                node,
                "GC03",
                "Group comments (##) should have an empty line before them.",
            )
            return

    def visit_VariableReference(self, node):
        # Identifiers are allowed to be free form, but need to store them
        # for comment checks.

        if node.id.name not in self.state["variables"]:
            self.state["variables"].append(node.id.name)

    def add_error(self, node, rule, msg):
        """Append a lint result for ``node`` to ``self.results``."""
        (col, line) = self.span_to_line_and_col(node.span)
        res = {
            "path": self.path,
            "lineno": line,
            "column": col,
            "rule": rule,
            "message": msg,
        }
        self.results.append(result.from_config(self.config, **res))

    def span_to_line_and_col(self, span):
        """Convert the start offset of ``span`` to a ``(column, line)`` tuple.

        Note the order of the returned tuple: column first, then line.
        """
        offsets = self.offsets_and_lines
        i = bisect.bisect_left(offsets, (span.start, 0))
        if i > 0:
            col = span.start - offsets[i - 1][0]
        else:
            col = 1 + span.start

        if i < len(offsets):
            line = offsets[i][1]
        elif offsets:
            # The span starts after the last newline, i.e. on a final line
            # that is not newline-terminated.
            line = offsets[-1][1] + 1
        else:
            # The file contains no newline at all.
            line = 1
        return (col, line)


def get_offsets_and_lines(contents):
    """Return a list consisting of tuples of (offset, line).

    The Fluent AST contains spans of start and end offsets in the file.
    This function returns a list of offsets and line numbers so that errors
    can be reported using line and column.
    """
    return [
        (m.start(), line)
        for line, m in enumerate(re.finditer(r"\n", contents), start=1)
    ]


def get_newlines_count_after(span, contents):
    """Count the consecutive newline characters starting at ``span.end``."""
    count = 0
    for i in range(span.end, len(contents)):
        assert contents[i] != "\r", "This linter does not handle \\r characters."
        if contents[i] != "\n":
            break
        count += 1

    return count


def get_newlines_count_before(span, contents):
    """Count the consecutive newline characters just before ``span.start``."""
    count = 0
    # Walk backwards down to and including index 0, so that a newline at the
    # very start of the file is counted as well.
    for i in range(span.start - 1, -1, -1):
        assert contents[i] != "\r", "This linter does not handle \\r characters."
        if contents[i] != "\n":
            break
        count += 1

    return count


def get_exclusions(root):
    """Load ``exclusions.yml`` and make its file entries absolute.

    Returns the first YAML document of the file, with each rule's "files"
    list replaced by a set of paths joined onto ``root``.
    """
    with open(
        mozpath.join(root, "tools", "lint", "fluent-lint", "exclusions.yml"),
        encoding="utf-8",
    ) as f:
        exclusions = list(yaml.safe_load_all(f))[0]
        for error_type in exclusions:
            exclusions[error_type]["files"] = {
                mozpath.join(root, x) for x in exclusions[error_type]["files"]
            }
        return exclusions


def get_branding_list(root, brand_files):
    """Return the distinct brand names defined in the given brand files.

    For every term in each existing file, the first text element found is
    taken as a brand name. Files that do not exist are skipped. The result
    is sorted so that reported messages are stable between runs.
    """

    class MessageExtractor(visitor.Visitor):
        def __init__(self):
            super().__init__()
            self.brands = []
            self.last_message_id = None

        def visit_Term(self, node):
            self.last_message_id = node.id.name
            self.generic_visit(node)

        def visit_TextElement(self, node):
            if self.last_message_id:
                self.brands += [node.value]
                # Only the first text element of each term is recorded.
                self.last_message_id = None
            self.generic_visit(node)

    extractor = MessageExtractor()

    # Tolerate a configuration without any brand files.
    for brand_path in brand_files or ():
        brand_file = mozpath.join(root, brand_path)
        if os.path.exists(brand_file):
            with open(brand_file, encoding="utf-8") as f:
                messages = parse(f.read())
                extractor.visit(messages)

    return sorted(set(extractor.brands))


def lint(paths, config, fix=None, **lintargs):
    """Lint every Fluent file selected by ``paths`` and return the results.

    ``lintargs["root"]`` is used to locate the exclusions file and the brand
    files listed under the "brand-files" config key. ``fix`` is accepted but
    not used by this linter.
    """
    root = lintargs["root"]
    files = list(expand_exclusions(paths, config, root))
    exclusions = get_exclusions(root)
    brand_files = config.get("brand-files")
    brand_names = get_branding_list(root, brand_files)
    results = []
    for path in files:
        # Use a context manager so the file handle is closed promptly.
        with open(path, "r", encoding="utf-8") as f:
            contents = f.read()
        linter = Linter(
            path,
            config,
            exclusions,
            contents,
            get_offsets_and_lines(contents),
            brand_names,
        )
        linter.visit(parse(contents))
        results.extend(linter.results)
    return results