diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/python/fluent.migrate/fluent/migrate | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1 (tag: upstream/124.0.1).
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/python/fluent.migrate/fluent/migrate')
14 files changed, 2189 insertions, 0 deletions
diff --git a/third_party/python/fluent.migrate/fluent/migrate/__init__.py b/third_party/python/fluent.migrate/fluent/migrate/__init__.py new file mode 100644 index 0000000000..158277a0a2 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/__init__.py @@ -0,0 +1,8 @@ +from .transforms import ( # noqa: F401 + CONCAT, + COPY, + COPY_PATTERN, + PLURALS, + REPLACE, + REPLACE_IN_TEXT, +) diff --git a/third_party/python/fluent.migrate/fluent/migrate/_context.py b/third_party/python/fluent.migrate/fluent/migrate/_context.py new file mode 100644 index 0000000000..34a23cde67 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/_context.py @@ -0,0 +1,351 @@ +from __future__ import annotations +from typing import Dict, Optional, Set, Tuple, cast + +import os +import codecs +from functools import partial +import logging +from itertools import zip_longest + +from compare_locales.parser import getParser +from compare_locales.plurals import get_plural +import fluent.syntax.ast as FTL +from fluent.syntax.parser import FluentParser +from fluent.syntax.serializer import FluentSerializer + +from .changesets import Changes +from .errors import UnreadableReferenceError +from .evaluator import Evaluator +from .merge import merge_resource +from .transforms import Source + + +class InternalContext: + """Internal context for merging translation resources. + + For the public interface, see `context.MigrationContext`. + """ + + dependencies: Dict[Tuple[str, str], Set[Tuple[str, Source]]] = {} + localization_dir: str + reference_dir: str + + def __init__(self, lang, enforce_translated=False): + self.fluent_parser = FluentParser(with_spans=False) + self.fluent_serializer = FluentSerializer() + + # An iterable of plural category names relevant to the context's + # language. E.g. ('one', 'other') for English. 
+ self.plural_categories = get_plural(lang) + if self.plural_categories is None: + logger = logging.getLogger("migrate") + logger.warning( + f'Plural rule for "{lang}" is not defined in "compare-locales"' + ) + self.plural_categories = ("one", "other") + + self.enforce_translated = enforce_translated + # Parsed input resources stored by resource path. + self.reference_resources = {} + self.localization_resources = {} + self.target_resources = {} + + # An iterable of `FTL.Message` objects some of whose nodes can be the + # transform operations. + self.transforms = {} + + # The evaluator instance is an AST transformer capable of walking an + # AST hierarchy and evaluating nodes which are migration Transforms. + self.evaluator = Evaluator(self) + + def read_ftl_resource(self, path: str): + """Read an FTL resource and parse it into an AST.""" + f = codecs.open(path, "r", "utf8") + try: + contents = f.read() + except UnicodeDecodeError as err: + logger = logging.getLogger("migrate") + logger.warning(f"Unable to read file {path}: {err}") + raise err + finally: + f.close() + + ast = self.fluent_parser.parse(contents) + + annots = [ + annot + for entry in ast.body + if isinstance(entry, FTL.Junk) + for annot in entry.annotations + ] + + if len(annots): + logger = logging.getLogger("migrate") + for annot in annots: + msg = annot.message + logger.warning(f"Syntax error in {path}: {msg}") + + return ast + + def read_legacy_resource(self, path: str): + """Read a legacy resource and parse it into a dict.""" + parser = getParser(path) + parser.readFile(path) + # Transform the parsed result which is an iterator into a dict. + return { + entity.key: entity.val + for entity in parser + if entity.localized or self.enforce_translated + } + + def read_reference_ftl(self, path: str): + """Read and parse a reference FTL file. + + A missing resource file is a fatal error and will raise an + UnreadableReferenceError. 
+ """ + fullpath = os.path.join(self.reference_dir, path) + try: + return self.read_ftl_resource(fullpath) + except OSError: + error_message = f"Missing reference file: {fullpath}" + logging.getLogger("migrate").error(error_message) + raise UnreadableReferenceError(error_message) + except UnicodeDecodeError as err: + error_message = f"Error reading file {fullpath}: {err}" + logging.getLogger("migrate").error(error_message) + raise UnreadableReferenceError(error_message) + + def read_localization_ftl(self, path: str): + """Read and parse an existing localization FTL file. + + Create a new FTL.Resource if the file doesn't exist or can't be + decoded. + """ + fullpath = os.path.join(self.localization_dir, path) + try: + return self.read_ftl_resource(fullpath) + except OSError: + logger = logging.getLogger("migrate") + logger.info( + "Localization file {} does not exist and " + "it will be created".format(path) + ) + return FTL.Resource() + except UnicodeDecodeError: + logger = logging.getLogger("migrate") + logger.warning( + "Localization file {} has broken encoding. " + "It will be re-created and some translations " + "may be lost".format(path) + ) + return FTL.Resource() + + def maybe_add_localization(self, path: str): + """Add a localization resource to migrate translations from. + + Uses a compare-locales parser to create a dict of (key, string value) + tuples. + For Fluent sources, we store the AST. + """ + try: + fullpath = os.path.join(self.localization_dir, path) + if not fullpath.endswith(".ftl"): + collection = self.read_legacy_resource(fullpath) + else: + collection = self.read_ftl_resource(fullpath) + except OSError: + logger = logging.getLogger("migrate") + logger.warning(f"Missing localization file: {path}") + else: + self.localization_resources[path] = collection + + def get_legacy_source(self, path: str, key: str): + """Get an entity value from a localized legacy source. + + Used by the `Source` transform. 
+ """ + resource = self.localization_resources[path] + return resource.get(key, None) + + def get_fluent_source_pattern(self, path: str, key: str): + """Get a pattern from a localized Fluent source. + + If the key contains a `.`, does an attribute lookup. + Used by the `COPY_PATTERN` transform. + """ + resource = self.localization_resources[path] + msg_key, _, attr_key = key.partition(".") + found = None + for entry in resource.body: + if isinstance(entry, (FTL.Message, FTL.Term)): + if entry.id.name == msg_key: + found = entry + break + if found is None: + return None + if not attr_key: + return found.value + for attribute in found.attributes: + if attribute.id.name == attr_key: + return attribute.value + return None + + def messages_equal(self, res1, res2): + """Compare messages and terms of two FTL resources. + + Uses FTL.BaseNode.equals to compare all messages/terms + in two FTL resources. + If the order or number of messages differ, the result is also False. + """ + + def message_id(message): + "Return the message's identifer name for sorting purposes." + return message.id.name + + messages1 = sorted( + ( + entry + for entry in res1.body + if isinstance(entry, FTL.Message) or isinstance(entry, FTL.Term) + ), + key=message_id, + ) + messages2 = sorted( + ( + entry + for entry in res2.body + if isinstance(entry, FTL.Message) or isinstance(entry, FTL.Term) + ), + key=message_id, + ) + for msg1, msg2 in zip_longest(messages1, messages2): + if msg1 is None or msg2 is None: + return False + if not msg1.equals(msg2): + return False + return True + + def merge_changeset( + self, + changeset: Optional[Changes] = None, + known_translations: Optional[Changes] = None, + ): + """Return a generator of FTL ASTs for the changeset. + + The input data must be configured earlier using the `add_*` methods. + if given, `changeset` must be a set of (path, key) tuples describing + which legacy translations are to be merged. 
If `changeset` is None, + all legacy translations will be allowed to be migrated in a single + changeset. + + We use the `in_changeset` method to determine if a message should be + migrated for the given changeset. + + Given `changeset`, return a dict whose keys are resource paths and + values are `FTL.Resource` instances. The values will also be used to + update this context's existing localization resources. + """ + + if changeset is None: + # Merge all known legacy translations. Used in tests. + changeset = { + (path, key) + for path, strings in self.localization_resources.items() + if not path.endswith(".ftl") + for key in strings.keys() + } + + if known_translations is None: + known_translations = changeset + + for path, reference in self.reference_resources.items(): + current = self.target_resources[path] + transforms = self.transforms.get(path, []) + in_changeset = partial( + self.in_changeset, changeset, known_translations, path + ) + + # Merge legacy translations with the existing ones using the + # reference as a template. + snapshot = merge_resource( + self, reference, current, transforms, in_changeset + ) + + # Skip this path if the messages in the merged snapshot are + # identical to those in the current state of the localization file. + # This may happen when: + # + # - none of the transforms is in the changset, or + # - all messages which would be migrated by the context's + # transforms already exist in the current state. + if self.messages_equal(current, snapshot): + continue + + # Store the merged snapshot on the context so that the next merge + # already takes it into account as the existing localization. + self.target_resources[path] = snapshot + + # The result for this path is a complete `FTL.Resource`. + yield path, snapshot + + def in_changeset( + self, changeset: Changes, known_translations: Changes, path: str, ident + ) -> bool: + """Check if a message should be migrated in this changeset. + + The message is identified by path and ident. 
+ + + A message will be migrated only if all of its dependencies + are present in the currently processed changeset. + + If a transform defined for this message points to a missing + legacy translation, this message will not be merged. The + missing legacy dependency won't be present in the changeset. + + This also means that partially translated messages (e.g. + constructed from two legacy strings out of which only one is + avaiable) will never be migrated. + """ + message_deps = self.dependencies.get((path, ident), None) + + # Don't merge if we don't have a transform for this message. + if message_deps is None: + return False + + # As a special case, if a transform exists but has no + # dependecies, it's a hardcoded `FTL.Node` which doesn't + # migrate any existing translation but rather creates a new + # one. Merge it. + if len(message_deps) == 0: + return True + + # Make sure all the dependencies are present in the current + # changeset. Partial migrations are not currently supported. + # See https://bugzilla.mozilla.org/show_bug.cgi?id=1321271 + # We only return True if our current changeset touches + # the transform, and we have all of the dependencies. + active_deps = cast(bool, message_deps & changeset) + available_deps = message_deps & known_translations + return active_deps and message_deps == available_deps + + def serialize_changeset( + self, changeset: Changes, known_translations: Optional[Changes] = None + ): + """Return a dict of serialized FTLs for the changeset. + + Given `changeset`, return a dict whose keys are resource paths and + values are serialized FTL snapshots. 
+ """ + + return { + path: self.fluent_serializer.serialize(snapshot) + for path, snapshot in self.merge_changeset(changeset, known_translations) + } + + def evaluate(self, node): + return self.evaluator.visit(node) + + +logging.basicConfig() diff --git a/third_party/python/fluent.migrate/fluent/migrate/blame.py b/third_party/python/fluent.migrate/fluent/migrate/blame.py new file mode 100644 index 0000000000..7ea505edaf --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/blame.py @@ -0,0 +1,77 @@ +from __future__ import annotations +from typing import Dict, Iterable, Tuple, TypedDict, cast + +import argparse +import json +from os.path import join + +from compare_locales.parser import Junk, getParser +from compare_locales.parser.fluent import FluentEntity + +from .repo_client import RepoClient + +BlameData = Dict[str, Dict[str, Tuple[int, float]]] +"File path -> message key -> [userid, timestamp]" + + +class BlameResult(TypedDict): + authors: list[str] + blame: BlameData + + +class Blame: + def __init__(self, client: RepoClient): + self.client = client + self.users: list[str] = [] + self.blame: BlameData = {} + + def attribution(self, file_paths: Iterable[str]) -> BlameResult: + for file in file_paths: + blame = self.client.blame(file) + self.handleFile(file, blame) + return {"authors": self.users, "blame": self.blame} + + def handleFile(self, path: str, file_blame: list[Tuple[str, int]]): + try: + parser = getParser(path) + except UserWarning: + return + + self.blame[path] = {} + + self.readFile(parser, path) + entities = parser.parse() + for e in entities: + if isinstance(e, Junk): + continue + if e.val_span: + key_vals: list[tuple[str, str]] = [(e.key, e.val_span)] + else: + key_vals = [] + if isinstance(e, FluentEntity): + key_vals += [ + (f"{e.key}.{attr.key}", cast(str, attr.val_span)) + for attr in e.attributes + ] + for key, (val_start, val_end) in key_vals: + entity_lines = file_blame[ + (e.ctx.linecol(val_start)[0] - 1) : 
e.ctx.linecol(val_end)[0] + ] + user, timestamp = max(entity_lines, key=lambda x: x[1]) + if user not in self.users: + self.users.append(user) + userid = self.users.index(user) + self.blame[path][key] = (userid, timestamp) + + def readFile(self, parser, path: str): + parser.readFile(join(self.client.root, path)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("repo_path") + parser.add_argument("file_path", nargs="+") + args = parser.parse_args() + blame = Blame(RepoClient(args.repo_path)) + attrib = blame.attribution(args.file_path) + print(json.dumps(attrib, indent=4, separators=(",", ": "))) diff --git a/third_party/python/fluent.migrate/fluent/migrate/changesets.py b/third_party/python/fluent.migrate/fluent/migrate/changesets.py new file mode 100644 index 0000000000..e687175550 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/changesets.py @@ -0,0 +1,66 @@ +from __future__ import annotations +from typing import Set, Tuple, TypedDict + +import time + +from .blame import BlameResult + +Changes = Set[Tuple[str, str]] + + +class Changeset(TypedDict): + author: str + first_commit: float + changes: Changes + + +def by_first_commit(item: Changeset): + """Order two changesets by their first commit date.""" + return item["first_commit"] + + +def convert_blame_to_changesets(blame_json: BlameResult) -> list[Changeset]: + """Convert a blame dict into a list of changesets. + + The blame information in `blame_json` should be a dict of the following + structure: + + { + 'authors': [ + 'A.N. Author <author@example.com>', + ], + 'blame': { + 'path/one': { + 'key1': [0, 1346095921.0], + }, + } + } + + It will be transformed into a list of changesets which can be fed into + `InternalContext.serialize_changeset`: + + [ + { + 'author': 'A.N. 
Author <author@example.com>', + 'first_commit': 1346095921.0, + 'changes': { + ('path/one', 'key1'), + } + }, + ] + + """ + now = time.time() + changesets: list[Changeset] = [ + {"author": author, "first_commit": now, "changes": set()} + for author in blame_json["authors"] + ] + + for path, keys_info in blame_json["blame"].items(): + for key, (author_index, timestamp) in keys_info.items(): + changeset = changesets[author_index] + changeset["changes"].add((path, key)) + if timestamp < changeset["first_commit"]: + changeset["first_commit"] = timestamp + + return sorted(changesets, key=by_first_commit) diff --git a/third_party/python/fluent.migrate/fluent/migrate/context.py b/third_party/python/fluent.migrate/fluent/migrate/context.py new file mode 100644 index 0000000000..8d32ab2c0a --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/context.py @@ -0,0 +1,160 @@ +from __future__ import annotations +from typing import List, Set, Tuple, cast + +import logging + +import fluent.syntax.ast as FTL +from fluent.migrate.util import fold + +from .transforms import Source +from .util import get_message, skeleton +from .errors import ( + EmptyLocalizationError, + UnreadableReferenceError, +) +from ._context import InternalContext + + +__all__ = [ + "EmptyLocalizationError", + "UnreadableReferenceError", + "MigrationContext", +] + + +class MigrationContext(InternalContext): + """Stateful context for merging translation resources. + + `MigrationContext` must be configured with the target locale and the + directory locations of the input data. + + The transformation takes four types of input data: + + - The en-US FTL reference files which will be used as templates for + message order, comments and sections. If the reference_dir is None, + the migration will create Messages and Terms in the order given by + the transforms. + + - The current FTL files for the given locale. 
+ + - A list of `FTL.Message` or `FTL.Term` objects some of whose nodes + are special helper or transform nodes: + + helpers: VARIABLE_REFERENCE, MESSAGE_REFERENCE, TERM_REFERENCE + transforms: COPY, REPLACE_IN_TEXT, REPLACE, PLURALS, CONCAT + fluent value helper: COPY_PATTERN + + The legacy (DTD, properties) translation files are deduced by the + dependencies in the transforms. The translations from these files will be + read from the localization_dir and transformed into FTL and merged + into the existing FTL files for the given language. + """ + + def __init__( + self, + locale: str, + reference_dir: str, + localization_dir: str, + enforce_translated=False, + ): + super().__init__( + locale, + enforce_translated=enforce_translated, + ) + self.locale = locale + # Paths to directories with input data, relative to CWD. + self.reference_dir = reference_dir + self.localization_dir = localization_dir + + self.dependencies = {} + """ + A dict whose keys are `(path, key)` tuples corresponding to target + FTL translations, and values are sets of `(path, key)` tuples + corresponding to localized entities which will be migrated. + """ + + def add_transforms( + self, target: str, reference: str, transforms: List[FTL.Message | FTL.Term] + ): + """Define transforms for target using reference as template. + + `target` is a path of the destination FTL file relative to the + localization directory. `reference` is a path to the template FTL + file relative to the reference directory. + + Each transform is an extended FTL node with `Transform` nodes as some + values. Transforms are stored in their lazy AST form until + `merge_changeset` is called, at which point they are evaluated to real + FTL nodes with migrated translations. + + Each transform is scanned for `Source` nodes which will be used to + build the list of dependencies for the transformed message. 
+ + For transforms that merely copy legacy messages or Fluent patterns, + using `fluent.migrate.helpers.transforms_from` is recommended. + """ + + def get_sources(acc, cur): + if isinstance(cur, Source): + acc.add((cur.path, cur.key)) + return acc + + if self.reference_dir is None: + # Add skeletons to resource body for each transform + # if there's no reference. + reference_ast = self.reference_resources.get(target) + if reference_ast is None: + reference_ast = FTL.Resource() + reference_ast.body.extend(skeleton(transform) for transform in transforms) + else: + reference_ast = self.read_reference_ftl(reference) + self.reference_resources[target] = reference_ast + + for node in transforms: + ident = cast(str, node.id.name) + # Scan `node` for `Source` nodes and collect the information they + # store into a set of dependencies. + dependencies = cast(Set[Tuple[str, Source]], fold(get_sources, node, set())) + # Set these sources as dependencies for the current transform. + self.dependencies[(target, ident)] = dependencies + + # The target Fluent message should exist in the reference file. If + # it doesn't, it's probably a typo. + # Of course, only if we're having a reference. + if self.reference_dir is None: + continue + if get_message(reference_ast.body, ident) is None: + logger = logging.getLogger("migrate") + logger.warning( + '{} "{}" was not found in {}'.format( + type(node).__name__, ident, reference + ) + ) + + # Keep track of localization resource paths which were defined as + # sources in the transforms. + expected_paths = set() + + # Read all legacy translation files defined in Source transforms. This + # may fail but a single missing legacy resource doesn't mean that the + # migration can't succeed. + for dependencies in self.dependencies.values(): + for path in {path for path, _ in dependencies}: + expected_paths.add(path) + self.maybe_add_localization(path) + + # However, if all legacy resources are missing, bail out early. 
There + # are no translations to migrate. We'd also get errors in hg annotate. + if len(expected_paths) > 0 and len(self.localization_resources) == 0: + error_message = "No localization files were found" + logging.getLogger("migrate").error(error_message) + raise EmptyLocalizationError(error_message) + + # Add the current transforms to any other transforms added earlier for + # this path. + path_transforms = self.transforms.setdefault(target, []) + path_transforms += transforms + + if target not in self.target_resources: + target_ast = self.read_localization_ftl(target) + self.target_resources[target] = target_ast diff --git a/third_party/python/fluent.migrate/fluent/migrate/errors.py b/third_party/python/fluent.migrate/fluent/migrate/errors.py new file mode 100644 index 0000000000..dcc3025377 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/errors.py @@ -0,0 +1,22 @@ +class SkipTransform(RuntimeError): + pass + + +class MigrationError(ValueError): + pass + + +class EmptyLocalizationError(MigrationError): + pass + + +class NotSupportedError(MigrationError): + pass + + +class UnreadableReferenceError(MigrationError): + pass + + +class InvalidTransformError(MigrationError): + pass diff --git a/third_party/python/fluent.migrate/fluent/migrate/evaluator.py b/third_party/python/fluent.migrate/fluent/migrate/evaluator.py new file mode 100644 index 0000000000..90c626f933 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/evaluator.py @@ -0,0 +1,28 @@ +from fluent.syntax import ast as FTL +from fluent.syntax.visitor import Transformer + +from .transforms import Transform + + +class Evaluator(Transformer): + """An AST transformer for evaluating migration Transforms. + + An AST transformer (i.e. a visitor capable of modifying the AST) which + walks an AST hierarchy and evaluates nodes which are migration Transforms. 
+ """ + + def __init__(self, ctx): + self.ctx = ctx + + def visit(self, node): + if not isinstance(node, FTL.BaseNode): + return node + + if isinstance(node, Transform): + # Some transforms don't expect other transforms as children. + # Evaluate the children first. + transform = self.generic_visit(node) + # Then, evaluate this transform. + return transform(self.ctx) + + return self.generic_visit(node) diff --git a/third_party/python/fluent.migrate/fluent/migrate/helpers.py b/third_party/python/fluent.migrate/fluent/migrate/helpers.py new file mode 100644 index 0000000000..2a221d5de6 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/helpers.py @@ -0,0 +1,148 @@ +"""Fluent AST helpers. + +The functions defined in this module offer a shorthand for defining common AST +nodes. + +They take a string argument and immediately return a corresponding AST node. +(As opposed to Transforms which are AST nodes on their own and only return the +migrated AST nodes when they are evaluated by a MigrationContext.) """ + +from __future__ import annotations +from typing import List + +from fluent.syntax import FluentParser, ast as FTL +from fluent.syntax.visitor import Transformer +from .transforms import Transform, CONCAT, COPY, COPY_PATTERN +from .errors import NotSupportedError, InvalidTransformError + + +def VARIABLE_REFERENCE(name): + """Create an ExternalArgument expression.""" + + return FTL.VariableReference(id=FTL.Identifier(name)) + + +def MESSAGE_REFERENCE(name): + """Create a MessageReference expression. + + If the passed name contains a `.`, we're generating + a message reference with an attribute. + """ + if "." 
in name: + name, attribute = name.split(".") + attribute = FTL.Identifier(attribute) + else: + attribute = None + + return FTL.MessageReference( + id=FTL.Identifier(name), + attribute=attribute, + ) + + +def TERM_REFERENCE(name): + """Create a TermReference expression.""" + + return FTL.TermReference(id=FTL.Identifier(name)) + + +class IntoTranforms(Transformer): + IMPLICIT_TRANSFORMS = ("CONCAT",) + FORBIDDEN_TRANSFORMS = ("PLURALS", "REPLACE", "REPLACE_IN_TEXT") + + def __init__(self, substitutions): + self.substitutions = substitutions + + def visit_Junk(self, node): + anno = node.annotations[0] + raise InvalidTransformError( + "Transform contains parse error: {}, at {}".format( + anno.message, anno.span.start + ) + ) + + def visit_FunctionReference(self, node): + name = node.id.name + if name in self.IMPLICIT_TRANSFORMS: + raise NotSupportedError( + "{} may not be used with transforms_from(). It runs " + "implicitly on all Patterns anyways.".format(name) + ) + if name in self.FORBIDDEN_TRANSFORMS: + raise NotSupportedError( + "{} may not be used with transforms_from(). It requires " + "additional logic in Python code.".format(name) + ) + if name in ("COPY", "COPY_PATTERN"): + args = (self.into_argument(arg) for arg in node.arguments.positional) + kwargs = { + arg.name.name: self.into_argument(arg.value) + for arg in node.arguments.named + } + if name == "COPY": + return COPY(*args, **kwargs) + return COPY_PATTERN(*args, **kwargs) + return self.generic_visit(node) + + def visit_Placeable(self, node): + """If the expression is a Transform, replace this Placeable + with the Transform it's holding. + Transforms evaluate to Patterns, which are flattened as + elements of Patterns in Transform.pattern_of, but only + one level deep. + """ + node = self.generic_visit(node) + if isinstance(node.expression, Transform): + return node.expression + return node + + def visit_Pattern(self, node): + """Replace the Pattern with CONCAT which is more accepting of its + elements. 
CONCAT takes PatternElements, Expressions and other + Patterns (e.g. returned from evaluating transforms). + """ + node = self.generic_visit(node) + return CONCAT(*node.elements) + + def into_argument(self, node): + """Convert AST node into an argument to migration transforms.""" + if isinstance(node, FTL.StringLiteral): + # Special cases for booleans which don't exist in Fluent. + if node.value == "True": + return True + if node.value == "False": + return False + return node.value + if isinstance(node, FTL.MessageReference): + try: + return self.substitutions[node.id.name] + except KeyError: + raise InvalidTransformError( + "Unknown substitution in COPY: {}".format(node.id.name) + ) + else: + raise InvalidTransformError( + "Invalid argument passed to COPY: {}".format(type(node).__name__) + ) + + +def transforms_from(ftl, **substitutions) -> List[FTL.Message | FTL.Term]: + """Parse FTL code into a list of Message nodes with Transforms. + + The FTL may use a fabricated COPY function inside of placeables which + will be converted into actual COPY migration transform. 
+ + new-key = Hardcoded text { COPY("filepath.dtd", "string.key") } + + For convenience, COPY may also refer to transforms_from's keyword + arguments via the MessageReference syntax: + + transforms_from(\""" + new-key = Hardcoded text { COPY(file_dtd, "string.key") } + \""", file_dtd="very/long/path/to/a/file.dtd") + + """ + + parser = FluentParser(with_spans=False) + resource = parser.parse(ftl) + return IntoTranforms(substitutions).visit(resource).body diff --git a/third_party/python/fluent.migrate/fluent/migrate/merge.py b/third_party/python/fluent.migrate/fluent/migrate/merge.py new file mode 100644 index 0000000000..921054a054 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/merge.py @@ -0,0 +1,51 @@ +import fluent.syntax.ast as FTL + +from .errors import SkipTransform +from .util import get_message, get_transform + + +def merge_resource(ctx, reference, current, transforms, in_changeset): + """Transform legacy translations into FTL. + + Use the `reference` FTL AST as a template. For each en-US string in the + reference, first check for an existing translation in the current FTL + `localization` and use it if it's present; then if the string has + a transform defined in the migration specification and if it's in the + currently processed changeset, evaluate the transform. + """ + + def merge_body(body): + return [entry for entry in map(merge_entry, body) if entry is not None] + + def merge_entry(entry): + # All standalone comments will be merged. + if isinstance(entry, FTL.BaseComment): + return entry + + # Ignore Junk + if isinstance(entry, FTL.Junk): + return None + + ident = entry.id.name + + # If the message is present in the existing localization, we add it to + # the resulting resource. This ensures consecutive merges don't remove + # translations but rather create supersets of them. 
+ existing = get_message(current.body, ident) + if existing is not None: + return existing + + transform = get_transform(transforms, ident) + + # Make sure this message is supposed to be migrated as part of the + # current changeset. + if transform is not None and in_changeset(ident): + if transform.comment is None: + transform.comment = entry.comment + try: + return ctx.evaluate(transform) + except SkipTransform: + return None + + body = merge_body(reference.body) + return FTL.Resource(body) diff --git a/third_party/python/fluent.migrate/fluent/migrate/repo_client.py b/third_party/python/fluent.migrate/fluent/migrate/repo_client.py new file mode 100644 index 0000000000..4236bc4286 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/repo_client.py @@ -0,0 +1,106 @@ +from __future__ import annotations +from typing import Tuple + +import json +from subprocess import run + +from os.path import isdir, join + +import hglib + + +def git(root: str, *args: str) -> str: + """ + Wrapper for calling command-line git in the `root` directory. + Raises an exception on any error, including a non-0 return code. + Returns the command's stdout as a string. + """ + git = ["git"] + git.extend(args) + proc = run(git, capture_output=True, cwd=root, encoding="utf-8") + if proc.returncode != 0: + raise Exception(proc.stderr or f"git command failed: {args}") + return proc.stdout + + +class RepoClient: + def __init__(self, root: str): + self.root = root + if isdir(join(root, ".hg")): + self.hgclient = hglib.open(root, "utf-8") + elif isdir(join(root, ".git")): + self.hgclient = None + stdout = git(self.root, "rev-parse", "--is-inside-work-tree") + if stdout != "true\n": + raise Exception("git rev-parse failed") + else: + raise Exception(f"Unsupported repository: {root}") + + def close(self): + if self.hgclient: + self.hgclient.close() + + def blame(self, file: str) -> list[Tuple[str, int]]: + "Return a list of (author, time) tuples for each line in `file`." 
+ if self.hgclient: + args = hglib.util.cmdbuilder( + b"annotate", + file.encode("latin-1"), + template="json", + date=True, + user=True, + cwd=self.root, + ) + blame_json = self.hgclient.rawcommand(args) + return [ + (line["user"], int(line["date"][0])) + for line in json.loads(blame_json)[0]["lines"] + ] + else: + lines: list[Tuple[str, int]] = [] + user = "" + time = 0 + stdout = git(self.root, "blame", "--porcelain", file) + for line in stdout.splitlines(): + if line.startswith("author "): + user = line[7:] + elif line.startswith("author-mail "): + user += line[11:] # includes leading space + elif line.startswith("author-time "): + time = int(line[12:]) + elif line.startswith("\t"): + lines.append((user, time)) + return lines + + def commit(self, message: str, author: str): + "Add and commit all work tree files" + if self.hgclient: + self.hgclient.commit(message, user=author.encode("utf-8"), addremove=True) + else: + git(self.root, "add", ".") + git(self.root, "commit", f"--author={author}", f"--message={message}") + + def head(self) -> str: + "Identifier for the most recent commit" + if self.hgclient: + return self.hgclient.tip().node.decode("utf-8") + else: + return git(self.root, "rev-parse", "HEAD").strip() + + def log(self, from_commit: str, to_commit: str) -> list[str]: + if self.hgclient: + return [ + rev.desc.decode("utf-8") + for rev in self.hgclient.log(f"{to_commit} % {from_commit}") + ] + else: + return ( + git( + self.root, + "log", + "--pretty=format:%s", + f"{from_commit}..{to_commit}", + ) + .strip() + .splitlines() + ) diff --git a/third_party/python/fluent.migrate/fluent/migrate/tool.py b/third_party/python/fluent.migrate/fluent/migrate/tool.py new file mode 100644 index 0000000000..c5b33ef803 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/tool.py @@ -0,0 +1,184 @@ +from __future__ import annotations +from types import ModuleType +from typing import Iterable, cast + +import argparse +from contextlib import contextmanager 
+import importlib +import logging +import os +import sys + +from fluent.migrate.blame import Blame +from fluent.migrate.changesets import Changes, convert_blame_to_changesets +from fluent.migrate.context import MigrationContext +from fluent.migrate.errors import MigrationError +from fluent.migrate.repo_client import RepoClient + + +@contextmanager +def dont_write_bytecode(): + _dont_write_bytecode = sys.dont_write_bytecode + sys.dont_write_bytecode = True + yield + sys.dont_write_bytecode = _dont_write_bytecode + + +class Migrator: + def __init__( + self, locale: str, reference_dir: str, localization_dir: str, dry_run: bool + ): + self.locale = locale + self.reference_dir = reference_dir + self.localization_dir = localization_dir + self.dry_run = dry_run + self._client = None + + @property + def client(self): + if self._client is None: + self._client = RepoClient(self.localization_dir) + return self._client + + def close(self): + # close hglib.client, if we cached one. + if self._client is not None: + self._client.close() + + def run(self, migration: ModuleType): + print("\nRunning migration {} for {}".format(migration.__name__, self.locale)) + + # For each migration create a new context. + ctx = MigrationContext(self.locale, self.reference_dir, self.localization_dir) + + try: + # Add the migration spec. + migration.migrate(ctx) + except MigrationError as e: + print( + " Skipping migration {} for {}:\n {}".format( + migration.__name__, self.locale, e + ) + ) + return + + # Keep track of how many changesets we're committing. + index = 0 + description_template = cast(str, migration.migrate.__doc__) + + # Annotate localization files used as sources by this migration + # to preserve attribution of translations. 
+ files = ctx.localization_resources.keys() + blame = Blame(self.client).attribution(files) + changesets = convert_blame_to_changesets(blame) + known_legacy_translations = set() + + for changeset in changesets: + snapshot = self.snapshot( + ctx, changeset["changes"], known_legacy_translations + ) + if not snapshot: + continue + self.serialize_changeset(snapshot) + index += 1 + self.commit_changeset(description_template, changeset["author"], index) + + def snapshot( + self, + ctx: MigrationContext, + changes_in_changeset: Changes, + known_legacy_translations: Changes, + ): + """Run the migration for the changeset, with the set of + this and all prior legacy translations. + """ + known_legacy_translations.update(changes_in_changeset) + return ctx.serialize_changeset(changes_in_changeset, known_legacy_translations) + + def serialize_changeset(self, snapshot): + """Write serialized FTL files to disk.""" + for path, content in snapshot.items(): + fullpath = os.path.join(self.localization_dir, path) + print(f" Writing to {fullpath}") + if not self.dry_run: + fulldir = os.path.dirname(fullpath) + if not os.path.isdir(fulldir): + os.makedirs(fulldir) + with open(fullpath, "wb") as f: + f.write(content.encode("utf8")) + f.close() + + def commit_changeset(self, description_template: str, author: str, index: int): + message = description_template.format(index=index, author=author) + + print(f" Committing changeset: {message}") + if self.dry_run: + return + try: + self.client.commit(message, author) + except Exception as err: + print(f" WARNING: commit failed ({err})") + + +def main( + locale, + reference_dir: str, + localization_dir: str, + migrations: Iterable[ModuleType], + dry_run: bool, +): + """Run migrations and commit files with the result.""" + migrator = Migrator(locale, reference_dir, localization_dir, dry_run) + + for migration in migrations: + migrator.run(migration) + + migrator.close() + + +def cli(): + parser = argparse.ArgumentParser(description="Migrate 
translations to FTL.") + parser.add_argument( + "migrations", + metavar="MIGRATION", + type=str, + nargs="+", + help="migrations to run (Python modules)", + ) + parser.add_argument( + "--locale", "--lang", type=str, help="target locale code (--lang is deprecated)" + ) + parser.add_argument( + "--reference-dir", type=str, help="directory with reference FTL files" + ) + parser.add_argument( + "--localization-dir", type=str, help="directory for localization files" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="do not write to disk nor commit any changes", + ) + parser.set_defaults(dry_run=False) + + logger = logging.getLogger("migrate") + logger.setLevel(logging.INFO) + + args = parser.parse_args() + + # Don't byte-compile migrations. + # They're not our code, and infrequently run + with dont_write_bytecode(): + migrations = map(importlib.import_module, args.migrations) + + main( + locale=args.locale, + reference_dir=args.reference_dir, + localization_dir=args.localization_dir, + migrations=migrations, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + cli() diff --git a/third_party/python/fluent.migrate/fluent/migrate/transforms.py b/third_party/python/fluent.migrate/fluent/migrate/transforms.py new file mode 100644 index 0000000000..f45ad1531c --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/transforms.py @@ -0,0 +1,558 @@ +"""Migration Transforms. + +Transforms are AST nodes which describe how legacy translations should be +migrated. They are created inert and only return the migrated AST nodes when +they are evaluated by a MigrationContext. + +All Transforms evaluate to Fluent Patterns. This makes them suitable for +defining migrations of values of message, attributes and variants. The special +CONCAT Transform is capable of joining multiple Patterns returned by evaluating +other Transforms into a single Pattern. It can also concatenate Pattern +elements: TextElements and Placeables. 
+ +The COPY, REPLACE and PLURALS Transforms inherit from Source which is a special +AST Node defining the location (the file path and the id) of the legacy +translation. During the migration, the current MigrationContext scans the +migration spec for Source nodes and extracts the information about all legacy +translations being migrated. For instance, + + COPY('file.dtd', 'hello') + +is equivalent to: + + FTL.Pattern([ + Source('file.dtd', 'hello') + ]) + +Sometimes it's useful to work with text rather than (path, key) source +definitions. This is the case when the migrated translation requires some +hardcoded text, e.g. <a> and </a> when multiple translations become a single +one with a DOM overlay. In such cases it's best to use FTL.TextElements: + + FTL.Message( + id=FTL.Identifier('update-failed'), + value=CONCAT( + COPY('aboutDialog.dtd', 'update.failed.start'), + FTL.TextElement('<a>'), + COPY('aboutDialog.dtd', 'update.failed.linkText'), + FTL.TextElement('</a>'), + COPY('aboutDialog.dtd', 'update.failed.end'), + ) + ) + +The REPLACE_IN_TEXT Transform also takes TextElements as input, making it +possible to pass it as the foreach function of the PLURALS Transform. In the +example below, each slice of the plural string is converted into a +TextElement by PLURALS and then run through the REPLACE_IN_TEXT transform. 
+ + FTL.Message( + FTL.Identifier('delete-all'), + value=PLURALS( + 'aboutDownloads.dtd', + 'deleteAll', + VARIABLE_REFERENCE('num'), + lambda text: REPLACE_IN_TEXT( + text, + { + '#1': VARIABLE_REFERENCE('num') + } + ) + ) + ) +""" + +import re + +from fluent.syntax import ast as FTL +from fluent.syntax.visitor import Transformer +from .errors import NotSupportedError + + +def chain_elements(elements): + """Flatten a list of FTL nodes into an iterator over PatternElements.""" + for element in elements: + if isinstance(element, FTL.Pattern): + # PY3 yield from element.elements + yield from element.elements + elif isinstance(element, FTL.PatternElement): + yield element + elif isinstance(element, FTL.Expression): + yield FTL.Placeable(element) + else: + raise RuntimeError("Expected Pattern, PatternElement or Expression") + + +re_leading_ws = re.compile( + r"\A(?:(?P<whitespace> +)(?P<text>.*?)|(?P<block_text>\n.*?))\Z", + re.S, +) +re_trailing_ws = re.compile( + r"\A(?:(?P<text>.*?)(?P<whitespace> +)|(?P<block_text>.*\n))\Z", re.S +) + + +def extract_whitespace(regex, element): + """Extract leading or trailing whitespace from a TextElement. + + Return a tuple of (Placeable, TextElement) in which the Placeable + encodes the extracted whitespace as a StringLiteral and the + TextElement has the same amount of whitespace removed. The + Placeable with the extracted whitespace is always returned first. + If the element starts or ends with a newline, add an empty + StringLiteral. + """ + match = re.search(regex, element.value) + if match: + # If white-space is None, we're a newline. Add an + # empty { "" } + whitespace = match.group("whitespace") or "" + placeable = FTL.Placeable(FTL.StringLiteral(whitespace)) + if whitespace == element.value: + return placeable, None + else: + # Either text or block_text matched the rest. 
+ text = match.group("text") or match.group("block_text") + return placeable, FTL.TextElement(text) + else: + return None, element + + +class Transform(FTL.BaseNode): + def __call__(self, ctx): + raise NotImplementedError + + @staticmethod + def pattern_of(*elements): + normalized = [] + + # Normalize text content: convert text content to TextElements, join + # adjacent text and prune empty. Text content is either existing + # TextElements or whitespace-only StringLiterals. This may result in + # leading and trailing whitespace being put back into TextElements if + # the new Pattern is built from existing Patterns (CONCAT(COPY...)). + # The leading and trailing whitespace of the new Pattern will be + # extracted later into new StringLiterals. + for element in chain_elements(elements): + if isinstance(element, FTL.TextElement): + text_content = element.value + elif ( + isinstance(element, FTL.Placeable) + and isinstance(element.expression, FTL.StringLiteral) + and re.match(r"^ *$", element.expression.value) + ): + text_content = element.expression.value + else: + # The element does not contain text content which should be + # normalized. It may be a number, a reference, or + # a StringLiteral which should be preserved in the Pattern. + normalized.append(element) + continue + + previous = normalized[-1] if len(normalized) else None + if isinstance(previous, FTL.TextElement): + # Join adjacent TextElements. + previous.value += text_content + elif len(text_content) > 0: + # Normalize non-empty text to a TextElement. + normalized.append(FTL.TextElement(text_content)) + else: + # Prune empty text. + pass + + # Store empty values explicitly as {""}. + if len(normalized) == 0: + empty = FTL.Placeable(FTL.StringLiteral("")) + return FTL.Pattern([empty]) + + # Extract explicit leading whitespace into a StringLiteral. 
+ if isinstance(normalized[0], FTL.TextElement): + ws, text = extract_whitespace(re_leading_ws, normalized[0]) + normalized[:1] = [ws, text] + + # Extract explicit trailing whitespace into a StringLiteral. + if isinstance(normalized[-1], FTL.TextElement): + ws, text = extract_whitespace(re_trailing_ws, normalized[-1]) + normalized[-1:] = [text, ws] + + return FTL.Pattern([element for element in normalized if element is not None]) + + +class Source(Transform): + """Base class for Transforms that get translations from source files. + + The contract is that the first argument is the source path, and the + second is a key representing legacy string IDs, or Fluent id.attr. + """ + + def __init__(self, path, key): + self.path = path + self.key = key + + +class FluentSource(Source): + """Declare a Fluent source translation to be copied over. + + When evaluated, it clones the Pattern of the parsed source. + """ + + def __init__(self, path, key): + if not path.endswith(".ftl"): + raise NotSupportedError( + "Please use COPY to migrate from legacy files " "({})".format(path) + ) + if key[0] == "-" and "." in key: + raise NotSupportedError( + "Cannot migrate from Term Attributes, as they are" + "locale-dependent ({})".format(path) + ) + super().__init__(path, key) + + def __call__(self, ctx): + pattern = ctx.get_fluent_source_pattern(self.path, self.key) + return pattern.clone() + + +class COPY_PATTERN(FluentSource): + """Create a Pattern with the translation value from the given source. + + The given key can be a Message ID, Message ID.attribute_name, or + Term ID. Accessing Term attributes is not supported, as they're internal + to the localization. + """ + + pass + + +class TransformPattern(FluentSource, Transformer): + """Base class for modifying a Fluent pattern as part of a migration. + + Implement visit_* methods of the Transformer pattern to do the + actual modifications. 
+ """ + + def __call__(self, ctx): + pattern = super().__call__(ctx) + return self.visit(pattern) + + def visit_Pattern(self, node): + # Make sure we're creating valid Patterns after restructuring + # transforms. + node = self.generic_visit(node) + pattern = Transform.pattern_of(*node.elements) + return pattern + + def visit_Placeable(self, node): + # Ensure we have a Placeable with an expression still. + # Transforms could have replaced the expression with + # a Pattern or PatternElement, in which case we + # just pass that through. + # Patterns then get flattened by visit_Pattern. + node = self.generic_visit(node) + if isinstance(node.expression, (FTL.Pattern, FTL.PatternElement)): + return node.expression + return node + + +class LegacySource(Source): + """Declare the source translation to be migrated with other transforms. + + When evaluated, `Source` returns a TextElement with the content from the + source translation. Escaped characters are unescaped by the + compare-locales parser according to the file format: + + - in properties files: \\uXXXX, + - in DTD files: known named, decimal, and hexadecimal HTML entities. + + Consult the following files for the list of known named HTML entities: + + https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py + https://github.com/python/cpython/blob/3.6/Lib/html/entities.py + + By default, leading and trailing whitespace on each line as well as + leading and trailing empty lines will be stripped from the source + translation's content. Set `trim=False` to disable this behavior. 
+ """ + + def __init__(self, path, key, trim=None): + if path.endswith(".ftl"): + raise NotSupportedError( + "Please use COPY_PATTERN to migrate from Fluent files " + "({})".format(path) + ) + + super().__init__(path, key) + self.trim = trim + + def get_text(self, ctx): + return ctx.get_legacy_source(self.path, self.key) + + @staticmethod + def trim_text(text): + # strip leading white-space from each line + text = re.sub("^[ \t]+", "", text, flags=re.M) + # strip trailing white-space from each line + text = re.sub("[ \t]+$", "", text, flags=re.M) + # strip leading and trailing empty lines + text = text.strip("\r\n") + return text + + def __call__(self, ctx): + text = self.get_text(ctx) + if self.trim is not False: + text = self.trim_text(text) + return FTL.TextElement(text) + + +class COPY(LegacySource): + """Create a Pattern with the translation value from the given source.""" + + def __call__(self, ctx): + element = super().__call__(ctx) + return Transform.pattern_of(element) + + +PRINTF = re.compile( + r"%(?P<good>%|" + r"(?:(?P<number>[1-9][0-9]*)\$)?" + r"(?P<width>\*|[0-9]+)?" + r"(?P<prec>\.(?:\*|[0-9]+)?)?" + r"(?P<spec>[duxXosScpfg]))" +) + + +def number(): + i = 1 + while True: + yield i + i += 1 + + +def normalize_printf(text): + """Normalize printf arguments so that they're all numbered. + Gecko forbids mixing unnumbered and numbered ones, so + we just need to convert unnumbered to numbered ones. + Also remove ones that have zero width, as they're intended + to be removed from the output by the localizer. 
+ """ + next_number = number() + + def normalized(match): + if match.group("good") == "%": + return "%" + hidden = match.group("width") == "0" + if match.group("number"): + return "" if hidden else match.group() + num = next(next_number) + return "" if hidden else "%{}${}".format(num, match.group("spec")) + + return PRINTF.sub(normalized, text) + + +class REPLACE_IN_TEXT(Transform): + """Create a Pattern from a TextElement and replace legacy placeables. + + The original placeables are defined as keys on the `replacements` dict. + For each key the value must be defined as a FTL Pattern, Placeable, + TextElement or Expression to be interpolated. + """ + + def __init__(self, element, replacements, normalize_printf=False): + self.element = element + self.replacements = replacements + self.normalize_printf = normalize_printf + + def __call__(self, ctx): + # For each specified replacement, find all indices of the original + # placeable in the source translation. If missing, the list of indices + # will be empty. + value = self.element.value + if self.normalize_printf: + value = normalize_printf(value) + key_indices = { + key: [m.start() for m in re.finditer(re.escape(key), value)] + for key in self.replacements.keys() + } + + # Build a dict of indices to replacement keys. + keys_indexed = {} + for key, indices in key_indices.items(): + for index in indices: + keys_indexed[index] = key + + # Order the replacements by the position of the original placeable in + # the translation. + replacements = ( + (key, ctx.evaluate(self.replacements[key])) + for index, key in sorted(keys_indexed.items(), key=lambda x: x[0]) + ) + + # A list of PatternElements built from the legacy translation and the + # FTL replacements. It may contain empty or adjacent TextElements. + elements = [] + tail = value + + # Convert original placeables and text into FTL Nodes. 
For each + # original placeable the translation will be partitioned around it and + # the text before it will be converted into an `FTL.TextElement` and + # the placeable will be replaced with its replacement. + for key, node in replacements: + before, key, tail = tail.partition(key) + elements.append(FTL.TextElement(before)) + elements.append(node) + + # Don't forget about the tail after the loop ends. + elements.append(FTL.TextElement(tail)) + return Transform.pattern_of(*elements) + + +class REPLACE(LegacySource): + """Create a Pattern with interpolations from given source. + + Interpolations in the translation value from the given source will be + replaced with FTL placeables using the `REPLACE_IN_TEXT` transform. + """ + + def __init__(self, path, key, replacements, **kwargs): + # We default normalize_printf to False except for .properties files. + # We still allow the caller to override the default value. + normalize_printf = False + if "normalize_printf" in kwargs: + normalize_printf = kwargs["normalize_printf"] + del kwargs["normalize_printf"] + elif path.endswith(".properties"): + normalize_printf = True + + super().__init__(path, key, **kwargs) + self.replacements = replacements + self.normalize_printf = normalize_printf + + def __call__(self, ctx): + element = super().__call__(ctx) + return REPLACE_IN_TEXT( + element, self.replacements, normalize_printf=self.normalize_printf + )(ctx) + + +class PLURALS(LegacySource): + """Create a Pattern with plurals from given source. + + Build an `FTL.SelectExpression` with the supplied `selector` and variants + extracted from the source. The original translation should be a + semicolon-separated list of plural forms. Each form will be converted + into a TextElement and run through the `foreach` function, which should + return an `FTL.Node` or a `Transform`. By default, the `foreach` function + creates a valid Pattern from the TextElement passed into it. 
+ """ + + DEFAULT_ORDER = ("zero", "one", "two", "few", "many", "other") + + def __init__(self, path, key, selector, foreach=Transform.pattern_of, **kwargs): + super().__init__(path, key, **kwargs) + self.selector = selector + self.foreach = foreach + + def __call__(self, ctx): + element = super().__call__(ctx) + selector = ctx.evaluate(self.selector) + keys = ctx.plural_categories + forms = [FTL.TextElement(part.strip()) for part in element.value.split(";")] + + # The default CLDR form should be the last we have in DEFAULT_ORDER, + # usually `other`, but in some cases `many`. If we don't have a variant + # for that, we'll append one, using the, in CLDR order, last existing + # variant in the legacy translation. That may or may not be the last + # variant. + default_key = [key for key in reversed(self.DEFAULT_ORDER) if key in keys][0] + + # Match keys to legacy forms in the order they are defined in Gecko's + # PluralForm.jsm. Filter out empty forms. + pairs = [(key, var) for key, var in zip(keys, forms) if var.value] + + # A special case for legacy translations which don't define any + # plural forms. + if len(pairs) == 0: + return Transform.pattern_of() + + # A special case for languages with one plural category or one legacy + # variant. We don't need to insert a SelectExpression for them. + if len(pairs) == 1: + _, only_form = pairs[0] + only_variant = ctx.evaluate(self.foreach(only_form)) + return Transform.pattern_of(only_variant) + + # Make sure the default key is defined. If it's missing, use the last + # form (in CLDR order) found in the legacy translation. + pairs.sort(key=lambda pair: self.DEFAULT_ORDER.index(pair[0])) + last_key, last_form = pairs[-1] + if last_key != default_key: + pairs.append((default_key, last_form)) + + def createVariant(key, form): + # Run the legacy plural form through `foreach` which returns an + # `FTL.Node` describing the transformation required for each + # variant. Then evaluate it to a migrated FTL node. 
+ value = ctx.evaluate(self.foreach(form)) + return FTL.Variant( + key=FTL.Identifier(key), value=value, default=key == default_key + ) + + select = FTL.SelectExpression( + selector=selector, + variants=[createVariant(key, form) for key, form in pairs], + ) + + return Transform.pattern_of(select) + + +class CONCAT(Transform): + """Create a new Pattern from Patterns, PatternElements and Expressions. + + When called with at least two elements, `CONCAT` disables the trimming + behavior of the elements which are subclasses of `LegacySource` by + setting `trim=False`, unless `trim` has already been set explicitly. The + following two `CONCAT` calls are equivalent: + + CONCAT( + FTL.TextElement("Hello"), + COPY("file.properties", "hello") + ) + + CONCAT( + FTL.TextElement("Hello"), + COPY("file.properties", "hello", trim=False) + ) + + Set `trim=True` explicitly to force trimming: + + CONCAT( + FTL.TextElement("Hello "), + COPY("file.properties", "hello", trim=True) + ) + + When called with a single element and when the element is a subclass of + `LegacySource`, the trimming behavior is not changed. The following two + transforms are equivalent: + + CONCAT(COPY("file.properties", "hello")) + + COPY("file.properties", "hello") + """ + + def __init__(self, *elements, **kwargs): + # We want to support both passing elements as *elements in the + # migration specs and as elements=[]. The latter is used by + # FTL.BaseNode.traverse when it recreates the traversed node using its + # attributes as kwargs. + self.elements = list(kwargs.get("elements", elements)) + + # We want to make CONCAT(COPY()) equivalent to COPY() so that it's + # always safe (no-op) to wrap transforms in a CONCAT. This is used by + # the implementation of transforms_from. + if len(self.elements) > 1: + for elem in self.elements: + # Only change trim if it hasn't been set explicitly. 
+ if isinstance(elem, LegacySource) and elem.trim is None: + elem.trim = False + + def __call__(self, ctx): + return Transform.pattern_of(*self.elements) diff --git a/third_party/python/fluent.migrate/fluent/migrate/util.py b/third_party/python/fluent.migrate/fluent/migrate/util.py new file mode 100644 index 0000000000..43d9e62c19 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/util.py @@ -0,0 +1,107 @@ +import textwrap + +import fluent.syntax.ast as FTL +from fluent.syntax.parser import FluentParser, FluentParserStream + + +fluent_parser = FluentParser(with_spans=False) + + +def parse(Parser, string): + if Parser is FluentParser: + return fluent_parser.parse(string) + + # Parsing a legacy resource. + + # Parse the string into the internal Context. + parser = Parser() + # compare-locales expects ASCII strings. + parser.readContents(string.encode("utf8")) + # Transform the parsed result which is an iterator into a dict. + return {ent.key: ent for ent in parser} + + +def ftl_resource_to_ast(code): + return fluent_parser.parse(ftl(code)) + + +def ftl_resource_to_json(code): + return fluent_parser.parse(ftl(code)).to_json() + + +def ftl_pattern_to_json(code): + ps = FluentParserStream(ftl(code)) + return fluent_parser.maybe_get_pattern(ps).to_json() + + +def to_json(merged_iter): + return {path: resource.to_json() for path, resource in merged_iter} + + +LOCALIZABLE_ENTRIES = (FTL.Message, FTL.Term) + + +def get_message(body, ident): + """Get message called `ident` from the `body` iterable.""" + for entity in body: + if isinstance(entity, LOCALIZABLE_ENTRIES) and entity.id.name == ident: + return entity + + +def get_transform(body, ident): + """Get entity called `ident` from the `body` iterable.""" + for transform in body: + if transform.id.name == ident: + return transform + + +def skeleton(node): + """Create a skeleton copy of the given node. + + For localizable entries, the value is None and the attributes are {}. 
+ That's not a valid Fluent entry, so it requires further manipulation to + set values and/or attributes. + """ + if isinstance(node, LOCALIZABLE_ENTRIES): + return type(node)(id=node.id.clone(), value=None) + return node.clone() + + +def ftl(code): + """Nicer indentation for FTL code. + + The code returned by this function is meant to be compared against the + output of the FTL Serializer. The input code will end with a newline to + match the output of the serializer. + """ + + # The code might be triple-quoted. + code = code.lstrip("\n") + + return textwrap.dedent(code) + + +def fold(fun, node, init): + """Reduce `node` to a single value using `fun`. + + Apply `fun` against an accumulator and each subnode of `node` (in postorder + traversal) to reduce it to a single value. + """ + + def fold_(vals, acc): + if not vals: + return acc + + head = list(vals)[0] + tail = list(vals)[1:] + + if isinstance(head, FTL.BaseNode): + acc = fold(fun, head, acc) + if isinstance(head, list): + acc = fold_(head, acc) + if isinstance(head, dict): + acc = fold_(head.values(), acc) + + return fold_(tail, fun(acc, head)) + + return fold_(vars(node).values(), init) diff --git a/third_party/python/fluent.migrate/fluent/migrate/validator.py b/third_party/python/fluent.migrate/fluent/migrate/validator.py new file mode 100644 index 0000000000..4e05865434 --- /dev/null +++ b/third_party/python/fluent.migrate/fluent/migrate/validator.py @@ -0,0 +1,323 @@ +import argparse +import ast +from itertools import zip_longest + +from fluent.migrate import transforms +from fluent.migrate.errors import MigrationError +from fluent.migrate.helpers import transforms_from +from fluent.syntax import ast as FTL +from fluent.syntax.visitor import Visitor +from compare_locales import mozpath + + +class MigrateNotFoundException(Exception): + pass + + +class BadContextAPIException(Exception): + pass + + +def process_assign(node, context): + if isinstance(node.value, ast.Constant): + val = node.value.value + elif 
isinstance(node.value, ast.Name): + val = context.get(node.value.id) + elif isinstance(node.value, ast.Call): + val = node.value + else: + val = None + if val is None: + return + for target in node.targets: + if isinstance(target, ast.Name): + context[target.id] = val + + +class Validator: + """Validate a migration recipe + + Extract information from the migration recipe about which files to + migrate from, and which files to migrate to. + Also check for errors in the recipe, or bad API usage. + """ + + @classmethod + def validate(cls, path, code=None): + if code is None: + with open(path) as fh: + code = fh.read() + validator = cls(code, path) + return validator.inspect() + + def __init__(self, code, path): + self.ast = ast.parse(code, path) + + def inspect(self): + migrate_func = None + global_assigns = {} + for top_level in ast.iter_child_nodes(self.ast): + if isinstance(top_level, ast.FunctionDef) and top_level.name == "migrate": + if migrate_func: + raise MigrateNotFoundException("Duplicate definition of migrate") + migrate_func = top_level + details = self.inspect_migrate(migrate_func, global_assigns) + if isinstance(top_level, ast.Assign): + process_assign(top_level, global_assigns) + if isinstance(top_level, (ast.Import, ast.ImportFrom)): + if "module" in top_level._fields: + module = top_level.module + else: + module = None + for alias in top_level.names: + asname = alias.asname or alias.name + dotted = alias.name + if module: + dotted = f"{module}.{dotted}" + global_assigns[asname] = dotted + if not migrate_func: + raise MigrateNotFoundException("migrate function not found") + return details + + def inspect_migrate(self, migrate_func, global_assigns): + if len(migrate_func.args.args) != 1 or any( + getattr(migrate_func.args, arg_field) + for arg_field in migrate_func.args._fields + if arg_field != "args" + ): + raise MigrateNotFoundException("migrate takes only one positional argument") + arg = migrate_func.args.args[0] + if isinstance(arg, ast.Name): + 
ctx_var = arg.id # python 2 + else: + ctx_var = arg.arg # python 3 + visitor = MigrateAnalyzer(ctx_var, global_assigns) + visitor.visit(migrate_func) + return { + "references": visitor.references, + "issues": visitor.issues, + } + + +def full_name(node, global_assigns): + leafs = [] + while isinstance(node, ast.Attribute): + leafs.append(node.attr) + node = node.value + if isinstance(node, ast.Name): + leafs.append(global_assigns.get(node.id, node.id)) + return ".".join(reversed(leafs)) + + +PATH_TYPES = (str,) + (ast.Call,) + + +class MigrateAnalyzer(ast.NodeVisitor): + def __init__(self, ctx_var, global_assigns): + super().__init__() + self.ctx_var = ctx_var + self.global_assigns = global_assigns + self.depth = 0 + self.issues = [] + self.references = set() + + def generic_visit(self, node): + self.depth += 1 + super().generic_visit(node) + self.depth -= 1 + + def visit_Assign(self, node): + if self.depth == 1: + process_assign(node, self.global_assigns) + self.generic_visit(node) + + def visit_Attribute(self, node): + if isinstance(node.value, ast.Name) and node.value.id == self.ctx_var: + if node.attr not in ( + "add_transforms", + "locale", + ): + raise BadContextAPIException( + "Unexpected attribute access on {}.{}".format( + self.ctx_var, node.attr + ) + ) + self.generic_visit(node) + + def visit_Call(self, node): + if ( + isinstance(node.func, ast.Attribute) + and isinstance(node.func.value, ast.Name) + and node.func.value.id == self.ctx_var + ): + return self.call_ctx(node) + dotted = full_name(node.func, self.global_assigns) + if dotted == "fluent.migrate.helpers.transforms_from": + return self.call_helpers_transforms_from(node) + if dotted.startswith("fluent.migrate."): + return self.call_transform(node, dotted) + self.generic_visit(node) + + def call_ctx(self, node): + if node.func.attr == "add_transforms": + return self.call_add_transforms(node) + raise BadContextAPIException( + "Unexpected call on {}.{}".format(self.ctx_var, node.func.attr) + ) + + 
def call_add_transforms(self, node): + args_msg = ( + "Expected arguments to {}.add_transforms: " + "target_ftl_path, reference_ftl_path, list_of_transforms" + ).format(self.ctx_var) + ref_msg = ( + "Expected second argument to {}.add_transforms: " + "reference should be string or variable with string value" + ).format(self.ctx_var) + # Just check call signature here, check actual types below + if not self.check_arguments(node, (ast.AST, ast.AST, ast.AST)): + self.issues.append( + { + "msg": args_msg, + "line": node.lineno, + } + ) + return + in_reference = node.args[1] + if isinstance(in_reference, ast.Name): + in_reference = self.global_assigns.get(in_reference.id) + if isinstance(in_reference, ast.Constant): + in_reference = in_reference.value + if not isinstance(in_reference, str): + self.issues.append( + { + "msg": ref_msg, + "line": node.args[1].lineno, + } + ) + return + self.references.add(in_reference) + # Checked node.args[1]. + # There's not a lot we can say about our target path, + # ignoring that. + # For our transforms, we want more checks. 
+ self.generic_visit(node.args[2]) + + def call_transform(self, node, dotted): + module, called = dotted.rsplit(".", 1) + if module not in ("fluent.migrate", "fluent.migrate.transforms"): + return + transform = getattr(transforms, called) + if not issubclass(transform, transforms.Source): + return + bad_args = f"{called} takes path and key as first two params" + if not self.check_arguments( + node, + ( + (ast.Constant, ast.Name), + (ast.Constant, ast.Name), + ), + allow_more=True, + check_kwargs=False, + ): + self.issues.append({"msg": bad_args, "line": node.lineno}) + return + path = node.args[0] + if isinstance(path, ast.Constant): + path = path.value + if isinstance(path, ast.Name): + path = self.global_assigns.get(path.id) + if not isinstance(path, PATH_TYPES): + self.issues.append({"msg": bad_args, "line": node.lineno}) + + def call_helpers_transforms_from(self, node): + args_msg = "Expected arguments to transforms_from: " "str, **substitions" + if not self.check_arguments(node, (ast.Constant,), check_kwargs=False): + self.issues.append( + { + "msg": args_msg, + "line": node.lineno, + } + ) + return + kwargs = {} + found_bad_keywords = False + for keyword in node.keywords: + v = keyword.value + if isinstance(v, ast.Constant): + v = v.value + if isinstance(v, ast.Name): + v = self.global_assigns.get(v.id) + if isinstance(v, ast.Call): + v = "determined at runtime" + if not isinstance(v, PATH_TYPES): + msg = "Bad keyword arg {} to transforms_from".format(keyword.arg) + self.issues.append( + { + "msg": msg, + "line": node.lineno, + } + ) + found_bad_keywords = True + else: + kwargs[keyword.arg] = v + if found_bad_keywords: + return + try: + transforms = transforms_from(node.args[0].value, **kwargs) + except MigrationError as e: + self.issues.append( + { + "msg": str(e), + "line": node.lineno, + } + ) + return + ti = TransformsInspector() + ti.visit(transforms) + self.issues.extend( + { + "msg": issue, + "line": node.lineno, + } + for issue in set(ti.issues) + ) + 
+ def check_arguments(self, node, argspec, check_kwargs=True, allow_more=False): + if check_kwargs and ( + node.keywords or (hasattr(node, "kwargs") and node.kwargs) + ): + return False + if hasattr(node, "starargs") and node.starargs: + return False + for arg, NODE_TYPE in zip_longest(node.args, argspec): + if NODE_TYPE is None: + return True if allow_more else False + if not (isinstance(arg, NODE_TYPE)): + return False + return True + + +class TransformsInspector(Visitor): + def __init__(self): + super().__init__() + self.issues = [] + + def generic_visit(self, node): + if isinstance(node, transforms.Source): + src = node.path + # Source needs paths to be normalized + # https://bugzilla.mozilla.org/show_bug.cgi?id=1568199 + if src != mozpath.normpath(src): + self.issues.append(f'Source "{src}" needs to be a normalized path') + super().generic_visit(node) + + +def cli(): + parser = argparse.ArgumentParser() + parser.add_argument("migration") + args = parser.parse_args() + issues = Validator.validate(args.migration)["issues"] + for issue in issues: + print(issue["msg"], "at line", issue["line"]) + return 1 if issues else 0 |