diff options
Diffstat (limited to 'intl/icu/source/python/icutools/databuilder/filtration.py')
-rw-r--r-- | intl/icu/source/python/icutools/databuilder/filtration.py | 427 |
1 files changed, 427 insertions, 0 deletions
diff --git a/intl/icu/source/python/icutools/databuilder/filtration.py b/intl/icu/source/python/icutools/databuilder/filtration.py new file mode 100644 index 0000000000..e9339a0895 --- /dev/null +++ b/intl/icu/source/python/icutools/databuilder/filtration.py @@ -0,0 +1,427 @@ +# Copyright (C) 2018 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html + +# Python 2/3 Compatibility (ICU-20299) +# TODO(ICU-20301): Remove this. +from __future__ import print_function + +from abc import abstractmethod +from collections import defaultdict +import re +import sys + +from . import * +from . import utils +from .request_types import * + + +# Note: for this to be a proper abstract class, it should extend abc.ABC. +# There is no nice way to do this that works in both Python 2 and 3. +# TODO(ICU-20301): Make this inherit from abc.ABC. +class Filter(object): + @staticmethod + def create_from_json(json_data, io): + assert io != None + if "filterType" in json_data: + filter_type = json_data["filterType"] + else: + filter_type = "file-stem" + + if filter_type == "file-stem": + return FileStemFilter(json_data) + elif filter_type == "language": + return LanguageFilter(json_data) + elif filter_type == "regex": + return RegexFilter(json_data) + elif filter_type == "exclude": + return ExclusionFilter() + elif filter_type == "union": + return UnionFilter(json_data, io) + elif filter_type == "locale": + return LocaleFilter(json_data, io) + else: + print("Error: Unknown filterType option: %s" % filter_type, file=sys.stderr) + return None + + def filter(self, request): + if not request.apply_file_filter(self): + return [] + for file in request.all_input_files(): + assert self.match(file) + return [request] + + @staticmethod + def _file_to_file_stem(file): + start = file.filename.rfind("/") + limit = file.filename.rfind(".") + return file.filename[start+1:limit] + + @staticmethod + def _file_to_subdir(file): + limit = file.filename.rfind("/") + if limit == -1: + return None + return file.filename[:limit] + + @abstractmethod + def match(self, file): + pass + + +class InclusionFilter(Filter): + def match(self, file): + return True + + +class ExclusionFilter(Filter): + def match(self, file): + return False + + +class IncludeExcludeFilter(Filter): + def __init__(self, json_data): + if "whitelist" in json_data: + self.is_includelist = True + self.includelist = json_data["whitelist"] + elif "includelist" in json_data: + self.is_includelist = True + self.includelist = json_data["includelist"] + elif "blacklist" in json_data: + self.is_includelist = False + self.excludelist = json_data["blacklist"] + elif "excludelist" in json_data: + self.is_includelist = False + self.excludelist = json_data["excludelist"] + else: + raise AssertionError("Need either includelist or excludelist: %s" % str(json_data)) + + def match(self, file): + file_stem = self._file_to_file_stem(file) + return self._should_include(file_stem) + + @abstractmethod + def _should_include(self, file_stem): + pass + + +class FileStemFilter(IncludeExcludeFilter): + def _should_include(self, file_stem): + if self.is_includelist: + return file_stem in self.includelist + else: + return file_stem not in self.excludelist + + +class LanguageFilter(IncludeExcludeFilter): + def _should_include(self, file_stem): + language = file_stem.split("_")[0] + if language == "root": + # Always include root.txt + return True + if self.is_includelist: + return language in self.includelist + else: + return language not in self.excludelist + + +class RegexFilter(IncludeExcludeFilter): + def __init__(self, *args): + # TODO(ICU-20301): Change this to: super().__init__(*args) + super(RegexFilter, self).__init__(*args) + if self.is_includelist: + self.includelist = [re.compile(pat) for pat in self.includelist] + else: + self.excludelist = [re.compile(pat) for pat in self.excludelist] + + def _should_include(self, file_stem): + if self.is_includelist: + for pattern in self.includelist: + if pattern.match(file_stem): + return True + return False + else: + for pattern in self.excludelist: + if pattern.match(file_stem): + return False + return True + + +class UnionFilter(Filter): + def __init__(self, json_data, io): + # Collect the sub-filters. + self.sub_filters = [] + for filter_json in json_data["unionOf"]: + self.sub_filters.append(Filter.create_from_json(filter_json, io)) + + def match(self, file): + """Match iff any of the sub-filters match.""" + for filter in self.sub_filters: + if filter.match(file): + return True + return False + + +LANGUAGE_SCRIPT_REGEX = re.compile(r"^([a-z]{2,3})_[A-Z][a-z]{3}$") +LANGUAGE_ONLY_REGEX = re.compile(r"^[a-z]{2,3}$") + +class LocaleFilter(Filter): + def __init__(self, json_data, io): + if "whitelist" in json_data: + self.locales_requested = list(json_data["whitelist"]) + elif "includelist" in json_data: + self.locales_requested = list(json_data["includelist"]) + else: + raise AssertionError("You must have an includelist in a locale filter") + self.include_children = json_data.get("includeChildren", True) + self.include_scripts = json_data.get("includeScripts", False) + + # Load the dependency graph from disk + self.dependency_data_by_tree = { + tree: io.read_locale_deps(tree) + for tree in utils.ALL_TREES + } + + def match(self, file): + tree = self._file_to_subdir(file) + assert tree is not None + locale = self._file_to_file_stem(file) + + # A locale is *required* if it is *requested* or an ancestor of a + # *requested* locale. + if locale in self._locales_required(tree): + return True + + # Resolve include_scripts and include_children. + return self._match_recursive(locale, tree) + + def _match_recursive(self, locale, tree): + # Base case: return True if we reached a *requested* locale, + # or False if we ascend out of the locale tree. + if locale is None: + return False + if locale in self.locales_requested: + return True + + # Check for alternative scripts. + # This causes sr_Latn to check sr instead of going directly to root. + if self.include_scripts: + match = LANGUAGE_SCRIPT_REGEX.match(locale) + if match and self._match_recursive(match.group(1), tree): + return True + + # Check if we are a descendant of a *requested* locale. + if self.include_children: + parent = self._get_parent_locale(locale, tree) + if self._match_recursive(parent, tree): + return True + + # No matches. + return False + + def _get_parent_locale(self, locale, tree): + """Gets the parent locale in the given tree, according to dependency data.""" + dependency_data = self.dependency_data_by_tree[tree] + if "parents" in dependency_data and locale in dependency_data["parents"]: + return dependency_data["parents"][locale] + if "aliases" in dependency_data and locale in dependency_data["aliases"]: + return dependency_data["aliases"][locale] + if LANGUAGE_ONLY_REGEX.match(locale): + return "root" + i = locale.rfind("_") + if i < 0: + assert locale == "root", "Invalid locale: %s/%s" % (tree, locale) + return None + return locale[:i] + + def _locales_required(self, tree): + """Returns a generator of all required locales in the given tree.""" + for locale in self.locales_requested: + while locale is not None: + yield locale + locale = self._get_parent_locale(locale, tree) + + +def apply_filters(requests, config, io): + """Runs the filters and returns a new list of requests.""" + requests = _apply_file_filters(requests, config, io) + requests = _apply_resource_filters(requests, config, io) + return requests + + +def _apply_file_filters(old_requests, config, io): + """Filters out entire files.""" + filters = _preprocess_file_filters(old_requests, config, io) + new_requests = [] + for request in old_requests: + category = request.category + if category in filters: + new_requests += filters[category].filter(request) + else: + new_requests.append(request) + return new_requests + + +def _preprocess_file_filters(requests, config, io): + all_categories = set( + request.category + for request in requests + ) + all_categories.remove(None) + all_categories = list(sorted(all_categories)) + json_data = config.filters_json_data + filters = {} + default_filter_json = "exclude" if config.strategy == "additive" else "include" + for category in all_categories: + filter_json = default_filter_json + # Special default for category "brkitr_lstm" and "brkitr_adaboost" as "exclude" for now. + if "brkitr_lstm" == category or "brkitr_adaboost" == category: + filter_json = "exclude" + # Figure out the correct filter to create for now. + if "featureFilters" in json_data and category in json_data["featureFilters"]: + filter_json = json_data["featureFilters"][category] + if filter_json == "include" and "localeFilter" in json_data and category.endswith("_tree"): + filter_json = json_data["localeFilter"] + # Resolve the filter JSON into a filter object + if filter_json == "exclude": + filters[category] = ExclusionFilter() + elif filter_json == "include": + pass # no-op + else: + filters[category] = Filter.create_from_json(filter_json, io) + if "featureFilters" in json_data: + for category in json_data["featureFilters"]: + if category not in all_categories: + print("Warning: category %s is not known" % category, file=sys.stderr) + return filters + + +class ResourceFilterInfo(object): + def __init__(self, category, strategy): + self.category = category + self.strategy = strategy + self.filter_tmp_dir = "filters/%s" % category + self.input_files = None + self.filter_files = None + self.rules_by_file = None + + def apply_to_requests(self, all_requests): + # Call this method only once per list of requests. + assert self.input_files is None + for request in all_requests: + if request.category != self.category: + continue + if not isinstance(request, AbstractExecutionRequest): + continue + if request.tool != IcuTool("genrb"): + continue + if not request.input_files: + continue + self._set_files(request.input_files) + request.dep_targets += [self.filter_files[:]] + arg_str = "--filterDir {TMP_DIR}/%s" % self.filter_tmp_dir + request.args = "%s %s" % (arg_str, request.args) + + # Make sure we found the target request + if self.input_files is None: + print("WARNING: Category not found: %s" % self.category, file=sys.stderr) + self.input_files = [] + self.filter_files = [] + self.rules_by_file = [] + + def _set_files(self, files): + # Note: The input files to genrb for a certain category should always + # be the same. For example, there are often two genrb calls: one for + # --writePoolBundle, and the other for --usePoolBundle. They are both + # expected to have the same list of input files. + if self.input_files is not None: + assert self.input_files == files + return + self.input_files = list(files) + self.filter_files = [ + TmpFile("%s/%s" % (self.filter_tmp_dir, basename)) + for basename in ( + file.filename[file.filename.rfind("/")+1:] + for file in files + ) + ] + if self.strategy == "additive": + self.rules_by_file = [ + [r"-/", r"+/%%ALIAS", r"+/%%Parent"] + for _ in range(len(files)) + ] + else: + self.rules_by_file = [ + [r"+/"] + for _ in range(len(files)) + ] + + def add_rules(self, file_filter, rules): + for file, rule_list in zip(self.input_files, self.rules_by_file): + if file_filter.match(file): + rule_list += rules + + def make_requests(self): + # Map from rule list to filter files with that rule list + unique_rules = defaultdict(list) + for filter_file, rules in zip(self.filter_files, self.rules_by_file): + unique_rules[tuple(rules)].append(filter_file) + + new_requests = [] + i = 0 + for rules, filter_files in unique_rules.items(): + base_filter_file = filter_files[0] + new_requests += [ + PrintFileRequest( + name = "%s_print_%d" % (self.category, i), + output_file = base_filter_file, + content = self._generate_resource_filter_txt(rules) + ) + ] + i += 1 + for filter_file in filter_files[1:]: + new_requests += [ + CopyRequest( + name = "%s_copy_%d" % (self.category, i), + input_file = base_filter_file, + output_file = filter_file + ) + ] + i += 1 + return new_requests + + @staticmethod + def _generate_resource_filter_txt(rules): + result = "# Caution: This file is automatically generated\n\n" + result += "\n".join(rules) + return result + + +def _apply_resource_filters(all_requests, config, io): + """Creates filters for looking within resource bundle files.""" + json_data = config.filters_json_data + if "resourceFilters" not in json_data: + return all_requests + + collected = {} + for entry in json_data["resourceFilters"]: + if "files" in entry: + file_filter = Filter.create_from_json(entry["files"], io) + else: + file_filter = InclusionFilter() + for category in entry["categories"]: + # not defaultdict because we need to pass arguments to the constructor + if category not in collected: + filter_info = ResourceFilterInfo(category, config.strategy) + filter_info.apply_to_requests(all_requests) + collected[category] = filter_info + else: + filter_info = collected[category] + filter_info.add_rules(file_filter, entry["rules"]) + + # Add the filter generation requests to the beginning so that by default + # they are made before genrb gets run (order is required by windirect) + new_requests = [] + for filter_info in collected.values(): + new_requests += filter_info.make_requests() + new_requests += all_requests + return new_requests |