summaryrefslogtreecommitdiffstats
path: root/third_party/python/fluent.migrate/fluent/migrate/transforms.py
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/python/fluent.migrate/fluent/migrate/transforms.py')
-rw-r--r--third_party/python/fluent.migrate/fluent/migrate/transforms.py576
1 files changed, 576 insertions, 0 deletions
diff --git a/third_party/python/fluent.migrate/fluent/migrate/transforms.py b/third_party/python/fluent.migrate/fluent/migrate/transforms.py
new file mode 100644
index 0000000000..11b722125f
--- /dev/null
+++ b/third_party/python/fluent.migrate/fluent/migrate/transforms.py
@@ -0,0 +1,576 @@
+"""Migration Transforms.
+
+Transforms are AST nodes which describe how legacy translations should be
+migrated. They are created inert and only return the migrated AST nodes when
+they are evaluated by a MigrationContext.
+
+All Transforms evaluate to Fluent Patterns. This makes them suitable for
+defining migrations of values of message, attributes and variants. The special
+CONCAT Transform is capable of joining multiple Patterns returned by evaluating
+other Transforms into a single Pattern. It can also concatenate Pattern
+elements: TextElements and Placeables.
+
+The COPY, REPLACE and PLURALS Transforms inherit from Source which is a special
+AST Node defining the location (the file path and the id) of the legacy
+translation. During the migration, the current MigrationContext scans the
+migration spec for Source nodes and extracts the information about all legacy
+translations being migrated. For instance,
+
+ COPY('file.dtd', 'hello')
+
+is equivalent to:
+
+ FTL.Pattern([
+ Source('file.dtd', 'hello')
+ ])
+
+Sometimes it's useful to work with text rather than (path, key) source
+definitions. This is the case when the migrated translation requires some
+hardcoded text, e.g. <a> and </a> when multiple translations become a single
+one with a DOM overlay. In such cases it's best to use FTL.TextElements:
+
+ FTL.Message(
+ id=FTL.Identifier('update-failed'),
+ value=CONCAT(
+ COPY('aboutDialog.dtd', 'update.failed.start'),
+ FTL.TextElement('<a>'),
+ COPY('aboutDialog.dtd', 'update.failed.linkText'),
+ FTL.TextElement('</a>'),
+ COPY('aboutDialog.dtd', 'update.failed.end'),
+ )
+ )
+
+The REPLACE_IN_TEXT Transform also takes TextElements as input, making it
+possible to pass it as the foreach function of the PLURALS Transform. In the
+example below, each slice of the plural string is converted into a
+TextElement by PLURALS and then run through the REPLACE_IN_TEXT transform.
+
+ FTL.Message(
+ FTL.Identifier('delete-all'),
+ value=PLURALS(
+ 'aboutDownloads.dtd',
+ 'deleteAll',
+ VARIABLE_REFERENCE('num'),
+ lambda text: REPLACE_IN_TEXT(
+ text,
+ {
+ '#1': VARIABLE_REFERENCE('num')
+ }
+ )
+ )
+ )
+"""
+
+import re
+
+from fluent.syntax import ast as FTL
+from fluent.syntax.visitor import Transformer
+from .errors import NotSupportedError
+
+
+def chain_elements(elements):
+ '''Flatten a list of FTL nodes into an iterator over PatternElements.'''
+ for element in elements:
+ if isinstance(element, FTL.Pattern):
+ # PY3 yield from element.elements
+ yield from element.elements
+ elif isinstance(element, FTL.PatternElement):
+ yield element
+ elif isinstance(element, FTL.Expression):
+ yield FTL.Placeable(element)
+ else:
+ raise RuntimeError(
+ 'Expected Pattern, PatternElement or Expression')
+
+
+re_leading_ws = re.compile(
+ r'\A(?:(?P<whitespace> +)(?P<text>.*?)|(?P<block_text>\n.*?))\Z',
+ re.S,
+)
+re_trailing_ws = re.compile(
+ r'\A(?:(?P<text>.*?)(?P<whitespace> +)|(?P<block_text>.*\n))\Z',
+ re.S
+)
+
+
+def extract_whitespace(regex, element):
+ '''Extract leading or trailing whitespace from a TextElement.
+
+ Return a tuple of (Placeable, TextElement) in which the Placeable
+ encodes the extracted whitespace as a StringLiteral and the
+ TextElement has the same amount of whitespace removed. The
+ Placeable with the extracted whitespace is always returned first.
+ If the element starts or ends with a newline, add an empty
+ StringLiteral.
+ '''
+ match = re.search(regex, element.value)
+ if match:
+ # If white-space is None, we're a newline. Add an
+ # empty { "" }
+ whitespace = match.group('whitespace') or ''
+ placeable = FTL.Placeable(FTL.StringLiteral(whitespace))
+ if whitespace == element.value:
+ return placeable, None
+ else:
+ # Either text or block_text matched the rest.
+ text = match.group('text') or match.group('block_text')
+ return placeable, FTL.TextElement(text)
+ else:
+ return None, element
+
+
+class Transform(FTL.BaseNode):
+ def __call__(self, ctx):
+ raise NotImplementedError
+
+ @staticmethod
+ def pattern_of(*elements):
+ normalized = []
+
+ # Normalize text content: convert text content to TextElements, join
+ # adjacent text and prune empty. Text content is either existing
+ # TextElements or whitespace-only StringLiterals. This may result in
+ # leading and trailing whitespace being put back into TextElements if
+ # the new Pattern is built from existing Patterns (CONCAT(COPY...)).
+ # The leading and trailing whitespace of the new Pattern will be
+ # extracted later into new StringLiterals.
+ for element in chain_elements(elements):
+ if isinstance(element, FTL.TextElement):
+ text_content = element.value
+ elif isinstance(element, FTL.Placeable) \
+ and isinstance(element.expression, FTL.StringLiteral) \
+ and re.match(r'^ *$', element.expression.value):
+ text_content = element.expression.value
+ else:
+ # The element does not contain text content which should be
+ # normalized. It may be a number, a reference, or
+ # a StringLiteral which should be preserved in the Pattern.
+ normalized.append(element)
+ continue
+
+ previous = normalized[-1] if len(normalized) else None
+ if isinstance(previous, FTL.TextElement):
+ # Join adjacent TextElements.
+ previous.value += text_content
+ elif len(text_content) > 0:
+ # Normalize non-empty text to a TextElement.
+ normalized.append(FTL.TextElement(text_content))
+ else:
+ # Prune empty text.
+ pass
+
+ # Store empty values explicitly as {""}.
+ if len(normalized) == 0:
+ empty = FTL.Placeable(FTL.StringLiteral(''))
+ return FTL.Pattern([empty])
+
+ # Extract explicit leading whitespace into a StringLiteral.
+ if isinstance(normalized[0], FTL.TextElement):
+ ws, text = extract_whitespace(re_leading_ws, normalized[0])
+ normalized[:1] = [ws, text]
+
+ # Extract explicit trailing whitespace into a StringLiteral.
+ if isinstance(normalized[-1], FTL.TextElement):
+ ws, text = extract_whitespace(re_trailing_ws, normalized[-1])
+ normalized[-1:] = [text, ws]
+
+ return FTL.Pattern([
+ element
+ for element in normalized
+ if element is not None
+ ])
+
+
+class Source(Transform):
+ """Base class for Transforms that get translations from source files.
+
+ The contract is that the first argument is the source path, and the
+ second is a key representing legacy string IDs, or Fluent id.attr.
+ """
+ def __init__(self, path, key):
+ self.path = path
+ self.key = key
+
+
+class FluentSource(Source):
+ """Declare a Fluent source translation to be copied over.
+
+ When evaluated, it clones the Pattern of the parsed source.
+ """
+ def __init__(self, path, key):
+ if not path.endswith('.ftl'):
+ raise NotSupportedError(
+ 'Please use COPY to migrate from legacy files '
+ '({})'.format(path)
+ )
+ if key[0] == '-' and '.' in key:
+ raise NotSupportedError(
+ 'Cannot migrate from Term Attributes, as they are'
+ 'locale-dependent ({})'.format(path)
+ )
+ super().__init__(path, key)
+
+ def __call__(self, ctx):
+ pattern = ctx.get_fluent_source_pattern(self.path, self.key)
+ return pattern.clone()
+
+
+class COPY_PATTERN(FluentSource):
+ """Create a Pattern with the translation value from the given source.
+
+ The given key can be a Message ID, Message ID.attribute_name, or
+ Term ID. Accessing Term attributes is not supported, as they're internal
+ to the localization.
+ """
+ pass
+
+
+class TransformPattern(FluentSource, Transformer):
+ """Base class for modifying a Fluent pattern as part of a migration.
+
+ Implement visit_* methods of the Transformer pattern to do the
+ actual modifications.
+ """
+ def __call__(self, ctx):
+ pattern = super().__call__(ctx)
+ return self.visit(pattern)
+
+ def visit_Pattern(self, node):
+ # Make sure we're creating valid Patterns after restructuring
+ # transforms.
+ node = self.generic_visit(node)
+ pattern = Transform.pattern_of(*node.elements)
+ return pattern
+
+ def visit_Placeable(self, node):
+ # Ensure we have a Placeable with an expression still.
+ # Transforms could have replaced the expression with
+ # a Pattern or PatternElement, in which case we
+ # just pass that through.
+ # Patterns then get flattened by visit_Pattern.
+ node = self.generic_visit(node)
+ if isinstance(node.expression, (FTL.Pattern, FTL.PatternElement)):
+ return node.expression
+ return node
+
+
+class LegacySource(Source):
+ """Declare the source translation to be migrated with other transforms.
+
+ When evaluated, `Source` returns a TextElement with the content from the
+ source translation. Escaped characters are unescaped by the
+ compare-locales parser according to the file format:
+
+ - in properties files: \\uXXXX,
+ - in DTD files: known named, decimal, and hexadecimal HTML entities.
+
+ Consult the following files for the list of known named HTML entities:
+
+ https://github.com/python/cpython/blob/2.7/Lib/htmlentitydefs.py
+ https://github.com/python/cpython/blob/3.6/Lib/html/entities.py
+
+ By default, leading and trailing whitespace on each line as well as
+ leading and trailing empty lines will be stripped from the source
+ translation's content. Set `trim=False` to disable this behavior.
+ """
+
+ def __init__(self, path, key, trim=None):
+ if path.endswith('.ftl'):
+ raise NotSupportedError(
+ 'Please use COPY_PATTERN to migrate from Fluent files '
+ '({})'.format(path))
+
+ super().__init__(path, key)
+ self.trim = trim
+
+ def get_text(self, ctx):
+ return ctx.get_legacy_source(self.path, self.key)
+
+ @staticmethod
+ def trim_text(text):
+ # strip leading white-space from each line
+ text = re.sub('^[ \t]+', '', text, flags=re.M)
+ # strip trailing white-space from each line
+ text = re.sub('[ \t]+$', '', text, flags=re.M)
+ # strip leading and trailing empty lines
+ text = text.strip('\r\n')
+ return text
+
+ def __call__(self, ctx):
+ text = self.get_text(ctx)
+ if self.trim is not False:
+ text = self.trim_text(text)
+ return FTL.TextElement(text)
+
+
+class COPY(LegacySource):
+ """Create a Pattern with the translation value from the given source."""
+
+ def __call__(self, ctx):
+ element = super().__call__(ctx)
+ return Transform.pattern_of(element)
+
+
+PRINTF = re.compile(
+ r'%(?P<good>%|'
+ r'(?:(?P<number>[1-9][0-9]*)\$)?'
+ r'(?P<width>\*|[0-9]+)?'
+ r'(?P<prec>\.(?:\*|[0-9]+)?)?'
+ r'(?P<spec>[duxXosScpfg]))'
+)
+
+
+def number():
+ i = 1
+ while True:
+ yield i
+ i += 1
+
+
+def normalize_printf(text):
+ """Normalize printf arguments so that they're all numbered.
+ Gecko forbids mixing unnumbered and numbered ones, so
+ we just need to convert unnumbered to numbered ones.
+ Also remove ones that have zero width, as they're intended
+ to be removed from the output by the localizer.
+ """
+ next_number = number()
+
+ def normalized(match):
+ if match.group('good') == '%':
+ return '%'
+ hidden = match.group('width') == '0'
+ if match.group('number'):
+ return '' if hidden else match.group()
+ num = next(next_number)
+ return '' if hidden else '%{}${}'.format(num, match.group('spec'))
+
+ return PRINTF.sub(normalized, text)
+
+
+class REPLACE_IN_TEXT(Transform):
+ """Create a Pattern from a TextElement and replace legacy placeables.
+
+ The original placeables are defined as keys on the `replacements` dict.
+ For each key the value must be defined as a FTL Pattern, Placeable,
+ TextElement or Expression to be interpolated.
+ """
+
+ def __init__(self, element, replacements, normalize_printf=False):
+ self.element = element
+ self.replacements = replacements
+ self.normalize_printf = normalize_printf
+
+ def __call__(self, ctx):
+ # For each specified replacement, find all indices of the original
+ # placeable in the source translation. If missing, the list of indices
+ # will be empty.
+ value = self.element.value
+ if self.normalize_printf:
+ value = normalize_printf(value)
+ key_indices = {
+ key: [m.start() for m in re.finditer(re.escape(key), value)]
+ for key in self.replacements.keys()
+ }
+
+ # Build a dict of indices to replacement keys.
+ keys_indexed = {}
+ for key, indices in key_indices.items():
+ for index in indices:
+ keys_indexed[index] = key
+
+ # Order the replacements by the position of the original placeable in
+ # the translation.
+ replacements = (
+ (key, ctx.evaluate(self.replacements[key]))
+ for index, key
+ in sorted(keys_indexed.items(), key=lambda x: x[0])
+ )
+
+ # A list of PatternElements built from the legacy translation and the
+ # FTL replacements. It may contain empty or adjacent TextElements.
+ elements = []
+ tail = value
+
+ # Convert original placeables and text into FTL Nodes. For each
+ # original placeable the translation will be partitioned around it and
+ # the text before it will be converted into an `FTL.TextElement` and
+ # the placeable will be replaced with its replacement.
+ for key, node in replacements:
+ before, key, tail = tail.partition(key)
+ elements.append(FTL.TextElement(before))
+ elements.append(node)
+
+ # Don't forget about the tail after the loop ends.
+ elements.append(FTL.TextElement(tail))
+ return Transform.pattern_of(*elements)
+
+
+class REPLACE(LegacySource):
+ """Create a Pattern with interpolations from given source.
+
+ Interpolations in the translation value from the given source will be
+ replaced with FTL placeables using the `REPLACE_IN_TEXT` transform.
+ """
+
+ def __init__(
+ self, path, key, replacements, **kwargs
+ ):
+ # We default normalize_printf to False except for .properties files.
+ # We still allow the caller to override the default value.
+ normalize_printf = False
+ if 'normalize_printf' in kwargs:
+ normalize_printf = kwargs['normalize_printf']
+ del kwargs['normalize_printf']
+ elif path.endswith('.properties'):
+ normalize_printf = True
+
+ super().__init__(path, key, **kwargs)
+ self.replacements = replacements
+ self.normalize_printf = normalize_printf
+
+ def __call__(self, ctx):
+ element = super().__call__(ctx)
+ return REPLACE_IN_TEXT(
+ element, self.replacements,
+ normalize_printf=self.normalize_printf
+ )(ctx)
+
+
+class PLURALS(LegacySource):
+ """Create a Pattern with plurals from given source.
+
+ Build an `FTL.SelectExpression` with the supplied `selector` and variants
+ extracted from the source. The original translation should be a
+ semicolon-separated list of plural forms. Each form will be converted
+ into a TextElement and run through the `foreach` function, which should
+ return an `FTL.Node` or a `Transform`. By default, the `foreach` function
+ creates a valid Pattern from the TextElement passed into it.
+ """
+ DEFAULT_ORDER = ('zero', 'one', 'two', 'few', 'many', 'other')
+
+ def __init__(self, path, key, selector, foreach=Transform.pattern_of,
+ **kwargs):
+ super().__init__(path, key, **kwargs)
+ self.selector = selector
+ self.foreach = foreach
+
+ def __call__(self, ctx):
+ element = super().__call__(ctx)
+ selector = ctx.evaluate(self.selector)
+ keys = ctx.plural_categories
+ forms = [
+ FTL.TextElement(part.strip())
+ for part in element.value.split(';')
+ ]
+
+ # The default CLDR form should be the last we have in DEFAULT_ORDER,
+ # usually `other`, but in some cases `many`. If we don't have a variant
+ # for that, we'll append one, using the, in CLDR order, last existing
+ # variant in the legacy translation. That may or may not be the last
+ # variant.
+ default_key = [
+ key for key in reversed(self.DEFAULT_ORDER) if key in keys
+ ][0]
+
+ # Match keys to legacy forms in the order they are defined in Gecko's
+ # PluralForm.jsm. Filter out empty forms.
+ pairs = [
+ (key, var)
+ for key, var in zip(keys, forms)
+ if var.value
+ ]
+
+ # A special case for legacy translations which don't define any
+ # plural forms.
+ if len(pairs) == 0:
+ return Transform.pattern_of()
+
+ # A special case for languages with one plural category or one legacy
+ # variant. We don't need to insert a SelectExpression for them.
+ if len(pairs) == 1:
+ _, only_form = pairs[0]
+ only_variant = ctx.evaluate(self.foreach(only_form))
+ return Transform.pattern_of(only_variant)
+
+ # Make sure the default key is defined. If it's missing, use the last
+ # form (in CLDR order) found in the legacy translation.
+ pairs.sort(key=lambda pair: self.DEFAULT_ORDER.index(pair[0]))
+ last_key, last_form = pairs[-1]
+ if last_key != default_key:
+ pairs.append((default_key, last_form))
+
+ def createVariant(key, form):
+ # Run the legacy plural form through `foreach` which returns an
+ # `FTL.Node` describing the transformation required for each
+ # variant. Then evaluate it to a migrated FTL node.
+ value = ctx.evaluate(self.foreach(form))
+ return FTL.Variant(
+ key=FTL.Identifier(key),
+ value=value,
+ default=key == default_key
+ )
+
+ select = FTL.SelectExpression(
+ selector=selector,
+ variants=[
+ createVariant(key, form)
+ for key, form in pairs
+ ]
+ )
+
+ return Transform.pattern_of(select)
+
+
+class CONCAT(Transform):
+ """Create a new Pattern from Patterns, PatternElements and Expressions.
+
+ When called with at least two elements, `CONCAT` disables the trimming
+ behavior of the elements which are subclasses of `LegacySource` by
+ setting `trim=False`, unless `trim` has already been set explicitly. The
+ following two `CONCAT` calls are equivalent:
+
+ CONCAT(
+ FTL.TextElement("Hello"),
+ COPY("file.properties", "hello")
+ )
+
+ CONCAT(
+ FTL.TextElement("Hello"),
+ COPY("file.properties", "hello", trim=False)
+ )
+
+ Set `trim=True` explicitly to force trimming:
+
+ CONCAT(
+ FTL.TextElement("Hello "),
+ COPY("file.properties", "hello", trim=True)
+ )
+
+ When called with a single element and when the element is a subclass of
+ `LegacySource`, the trimming behavior is not changed. The following two
+ transforms are equivalent:
+
+ CONCAT(COPY("file.properties", "hello"))
+
+ COPY("file.properties", "hello")
+ """
+
+ def __init__(self, *elements, **kwargs):
+ # We want to support both passing elements as *elements in the
+ # migration specs and as elements=[]. The latter is used by
+ # FTL.BaseNode.traverse when it recreates the traversed node using its
+ # attributes as kwargs.
+ self.elements = list(kwargs.get('elements', elements))
+
+ # We want to make CONCAT(COPY()) equivalent to COPY() so that it's
+ # always safe (no-op) to wrap transforms in a CONCAT. This is used by
+ # the implementation of transforms_from.
+ if len(self.elements) > 1:
+ for elem in self.elements:
+ # Only change trim if it hasn't been set explicitly.
+ if isinstance(elem, LegacySource) and elem.trim is None:
+ elem.trim = False
+
+ def __call__(self, ctx):
+ return Transform.pattern_of(*self.elements)