Diffstat (limited to 'testing/web-platform/tests/tools/manifest/manifest.py')
-rw-r--r-- | testing/web-platform/tests/tools/manifest/manifest.py | 449
1 file changed, 449 insertions, 0 deletions
diff --git a/testing/web-platform/tests/tools/manifest/manifest.py b/testing/web-platform/tests/tools/manifest/manifest.py
new file mode 100644
index 0000000000..4b7792ec00
--- /dev/null
+++ b/testing/web-platform/tests/tools/manifest/manifest.py
@@ -0,0 +1,449 @@
+import os
+import sys
+from atomicwrites import atomic_write
+from copy import deepcopy
+from multiprocessing import Pool, cpu_count
+
+from . import jsonlib
+from . import vcs
+from .item import (ConformanceCheckerTest,
+                   CrashTest,
+                   ManifestItem,
+                   ManualTest,
+                   PrintRefTest,
+                   RefTest,
+                   SupportFile,
+                   TestharnessTest,
+                   VisualTest,
+                   WebDriverSpecTest)
+from .log import get_logger
+from .sourcefile import SourceFile
+from .typedata import TypeData
+
+MYPY = False
+if MYPY:
+    # MYPY is set to True when run under Mypy.
+    from logging import Logger
+    from typing import Any
+    from typing import Container
+    from typing import Dict
+    from typing import IO
+    from typing import Iterator
+    from typing import Iterable
+    from typing import Optional
+    from typing import Set
+    from typing import Text
+    from typing import Tuple
+    from typing import Type
+    from typing import Union
+
+
+CURRENT_VERSION = 8  # type: int
+
+
+class ManifestError(Exception):
+    pass
+
+
+class ManifestVersionMismatch(ManifestError):
+    pass
+
+
+class InvalidCacheError(Exception):
+    pass
+
+
+item_classes = {"testharness": TestharnessTest,
+                "reftest": RefTest,
+                "print-reftest": PrintRefTest,
+                "crashtest": CrashTest,
+                "manual": ManualTest,
+                "wdspec": WebDriverSpecTest,
+                "conformancechecker": ConformanceCheckerTest,
+                "visual": VisualTest,
+                "support": SupportFile}  # type: Dict[Text, Type[ManifestItem]]
+
+
+def compute_manifest_items(source_file):
+    # type: (SourceFile) -> Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]
+    rel_path_parts = source_file.rel_path_parts
+    new_type, manifest_items = source_file.manifest_items()
+    file_hash = source_file.hash
+    return rel_path_parts, new_type, set(manifest_items), file_hash
+
+
+if MYPY:
+    ManifestDataType = Dict[Any, TypeData]
+else:
+    ManifestDataType = dict
+
+
+class ManifestData(ManifestDataType):
+    def __init__(self, manifest):
+        # type: (Manifest) -> None
+        """Dictionary subclass containing a TypeData instance for each test type,
+        keyed by type name"""
+        self.initialized = False  # type: bool
+        for key, value in item_classes.items():
+            self[key] = TypeData(manifest, value)
+        self.initialized = True
+        self.json_obj = None  # type: None
+
+    def __setitem__(self, key, value):
+        # type: (Text, TypeData) -> None
+        if self.initialized:
+            raise AttributeError
+        dict.__setitem__(self, key, value)
+
+    def paths(self):
+        # type: () -> Set[Text]
+        """Get a set of all paths containing test items
+        without actually constructing all the items"""
+        rv = set()  # type: Set[Text]
+        for item_data in self.values():
+            for item in item_data:
+                rv.add(os.path.sep.join(item))
+        return rv
+
+    def type_by_path(self):
+        # type: () -> Dict[Tuple[Text, ...], Text]
+        rv = {}
+        for item_type, item_data in self.items():
+            for item in item_data:
+                rv[item] = item_type
+        return rv
+
+
+class Manifest:
+    def __init__(self, tests_root, url_base="/"):
+        # type: (Text, Text) -> None
+        assert url_base is not None
+        self._data = ManifestData(self)  # type: ManifestData
+        self.tests_root = tests_root  # type: Text
+        self.url_base = url_base  # type: Text
+
+    def __iter__(self):
+        # type: () -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
+        return self.itertypes()
+
+    def itertypes(self, *types):
+        # type: (*Text) -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
+        for item_type in (types or sorted(self._data.keys())):
+            for path in self._data[item_type]:
+                rel_path = os.sep.join(path)
+                tests = self._data[item_type][path]
+                yield item_type, rel_path, tests
+
+    def iterpath(self, path):
+        # type: (Text) -> Iterable[ManifestItem]
+        tpath = tuple(path.split(os.path.sep))
+
+        for type_tests in self._data.values():
+            i = type_tests.get(tpath, set())
+            assert i is not None
+            yield from i
+
+    def iterdir(self, dir_name):
+        # type: (Text) -> Iterable[ManifestItem]
+        tpath = tuple(dir_name.split(os.path.sep))
+        tpath_len = len(tpath)
+
+        for type_tests in self._data.values():
+            for path, tests in type_tests.items():
+                if path[:tpath_len] == tpath:
+                    yield from tests
+
+    def update(self, tree, parallel=True):
+        # type: (Iterable[Tuple[Text, Optional[Text], bool]], bool) -> bool
+        """Update the manifest given an iterable of items that make up the updated manifest.
+
+        The iterable must generate tuples of the form (path, file_hash, updated),
+        where updated is True for paths that are to be updated and False for
+        paths that are not. This unusual API is designed as an optimisation,
+        meaning that SourceFile items need not be constructed when we are not
+        updating a path, but the absence of an item from the iterator may
+        still be used to remove defunct entries from the manifest."""
+
+        logger = get_logger()
+
+        changed = False
+
+        # Create local variable references to these dicts so we avoid the
+        # attribute access in the hot loop below
+        data = self._data
+
+        types = data.type_by_path()
+        remaining_manifest_paths = set(types)
+
+        to_update = []
+
+        for path, file_hash, updated in tree:
+            path_parts = tuple(path.split(os.path.sep))
+            is_new = path_parts not in remaining_manifest_paths
+
+            if not updated and is_new:
+                # This is kind of a bandaid; if we ended up here the cache
+                # was invalid but we've been using it anyway. That's obviously
+                # bad; we should fix the underlying issue that we sometimes
+                # use an invalid cache. But at least this fixes the immediate
+                # problem
+                raise InvalidCacheError
+
+            if not updated:
+                remaining_manifest_paths.remove(path_parts)
+            else:
+                assert self.tests_root is not None
+                source_file = SourceFile(self.tests_root,
+                                         path,
+                                         self.url_base,
+                                         file_hash)
+
+                hash_changed = False  # type: bool
+
+                if not is_new:
+                    if file_hash is None:
+                        file_hash = source_file.hash
+                    remaining_manifest_paths.remove(path_parts)
+                    old_type = types[path_parts]
+                    old_hash = data[old_type].hashes[path_parts]
+                    if old_hash != file_hash:
+                        hash_changed = True
+                        del data[old_type][path_parts]
+
+                if is_new or hash_changed:
+                    to_update.append(source_file)
+
+        if to_update:
+            logger.debug("Computing manifest update for %s items" % len(to_update))
+            changed = True
+
+        # 25 items was derived experimentally (2020-01) to be approximately the
+        # point at which it is quicker to create a Pool and parallelize update.
+        pool = None
+        if parallel and len(to_update) > 25 and cpu_count() > 1:
+            # On Python 3 on Windows, using >= MAXIMUM_WAIT_OBJECTS processes
+            # causes a crash in the multiprocessing module. Whilst this enum
+            # can technically have any value, it is usually 64. For safety,
+            # restrict manifest regeneration to 48 processes on Windows.
+            #
+            # See https://bugs.python.org/issue26903 and https://bugs.python.org/issue40263
+            processes = cpu_count()
+            if sys.platform == "win32" and processes > 48:
+                processes = 48
+            pool = Pool(processes)
+
+            # chunksize set > 1 when more than 10000 tests, because
+            # chunking is a net-gain once we get to very large numbers
+            # of items (again, experimentally, 2020-01)
+            chunksize = max(1, len(to_update) // 10000)
+            logger.debug("Doing a multiprocessed update. CPU count: %s, "
+                         "processes: %s, chunksize: %s" % (cpu_count(), processes, chunksize))
+            results = pool.imap_unordered(compute_manifest_items,
+                                          to_update,
+                                          chunksize=chunksize
+                                          )  # type: Iterator[Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]]
+        else:
+            results = map(compute_manifest_items, to_update)
+
+        for result in results:
+            rel_path_parts, new_type, manifest_items, file_hash = result
+            data[new_type][rel_path_parts] = manifest_items
+            data[new_type].hashes[rel_path_parts] = file_hash
+
+        # Make sure to terminate the Pool, to avoid hangs on Python 3.
+        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
+        if pool is not None:
+            pool.terminate()
+
+        if remaining_manifest_paths:
+            changed = True
+            for rel_path_parts in remaining_manifest_paths:
+                for test_data in data.values():
+                    if rel_path_parts in test_data:
+                        del test_data[rel_path_parts]
+
+        return changed
+
+    def to_json(self, caller_owns_obj=True):
+        # type: (bool) -> Dict[Text, Any]
+        """Dump a manifest into an object which can be serialized as JSON
+
+        If caller_owns_obj is False, then the return value remains
+        owned by the manifest; it is _vitally important_ that no
+        operation (not even a read) is performed on the manifest, as
+        otherwise objects within the object graph rooted at the return
+        value can be mutated. This makes the mode very dangerous, and
+        it should only be used with extreme care.
+
+        """
+        out_items = {
+            test_type: type_paths.to_json()
+            for test_type, type_paths in self._data.items() if type_paths
+        }
+
+        if caller_owns_obj:
+            out_items = deepcopy(out_items)
+
+        rv = {"url_base": self.url_base,
+              "items": out_items,
+              "version": CURRENT_VERSION}  # type: Dict[Text, Any]
+        return rv
+
+    @classmethod
+    def from_json(cls, tests_root, obj, types=None, callee_owns_obj=False):
+        # type: (Text, Dict[Text, Any], Optional[Container[Text]], bool) -> Manifest
+        """Load a manifest from a JSON object
+
+        This loads a manifest for a given local tests_root path from an
+        object obj, potentially partially loading it to only load the
+        types given by types.
+
+        If callee_owns_obj is True, then ownership of obj transfers
+        to this function when called, and the caller must never mutate
+        the obj or anything referred to in the object graph rooted at
+        obj.
+ + """ + version = obj.get("version") + if version != CURRENT_VERSION: + raise ManifestVersionMismatch + + self = cls(tests_root, url_base=obj.get("url_base", "/")) + if not hasattr(obj, "items"): + raise ManifestError + + for test_type, type_paths in obj["items"].items(): + if test_type not in item_classes: + raise ManifestError + + if types and test_type not in types: + continue + + if not callee_owns_obj: + type_paths = deepcopy(type_paths) + + self._data[test_type].set_json(type_paths) + + return self + + +def load(tests_root, manifest, types=None): + # type: (Text, Union[IO[bytes], Text], Optional[Container[Text]]) -> Optional[Manifest] + logger = get_logger() + + logger.warning("Prefer load_and_update instead") + return _load(logger, tests_root, manifest, types) + + +__load_cache = {} # type: Dict[Text, Manifest] + + +def _load(logger, # type: Logger + tests_root, # type: Text + manifest, # type: Union[IO[bytes], Text] + types=None, # type: Optional[Container[Text]] + allow_cached=True # type: bool + ): + # type: (...) -> Optional[Manifest] + manifest_path = (manifest if isinstance(manifest, str) + else manifest.name) + if allow_cached and manifest_path in __load_cache: + return __load_cache[manifest_path] + + if isinstance(manifest, str): + if os.path.exists(manifest): + logger.debug("Opening manifest at %s" % manifest) + else: + logger.debug("Creating new manifest at %s" % manifest) + try: + with open(manifest, encoding="utf-8") as f: + rv = Manifest.from_json(tests_root, + jsonlib.load(f), + types=types, + callee_owns_obj=True) + except OSError: + return None + except ValueError: + logger.warning("%r may be corrupted", manifest) + return None + else: + rv = Manifest.from_json(tests_root, + jsonlib.load(manifest), + types=types, + callee_owns_obj=True) + + if allow_cached: + __load_cache[manifest_path] = rv + return rv + + +def load_and_update(tests_root, # type: Text + manifest_path, # type: Text + url_base, # type: Text + update=True, # type: bool + rebuild=False, # type: bool + metadata_path=None, # type: Optional[Text] + cache_root=None, # type: Optional[Text] + working_copy=True, # type: bool + types=None, # type: Optional[Container[Text]] + write_manifest=True, # type: bool + allow_cached=True, # type: bool + parallel=True # type: bool + ): + # type: (...) 
+
+
+def load_and_update(tests_root,  # type: Text
+                    manifest_path,  # type: Text
+                    url_base,  # type: Text
+                    update=True,  # type: bool
+                    rebuild=False,  # type: bool
+                    metadata_path=None,  # type: Optional[Text]
+                    cache_root=None,  # type: Optional[Text]
+                    working_copy=True,  # type: bool
+                    types=None,  # type: Optional[Container[Text]]
+                    write_manifest=True,  # type: bool
+                    allow_cached=True,  # type: bool
+                    parallel=True  # type: bool
+                    ):
+    # type: (...) -> Manifest
+
+    logger = get_logger()
+
+    manifest = None
+    if not rebuild:
+        try:
+            manifest = _load(logger,
+                             tests_root,
+                             manifest_path,
+                             types=types,
+                             allow_cached=allow_cached)
+        except ManifestVersionMismatch:
+            logger.info("Manifest version changed, rebuilding")
+        except ManifestError:
+            logger.warning("Failed to load manifest, rebuilding")
+
+        if manifest is not None and manifest.url_base != url_base:
+            logger.info("Manifest url base did not match, rebuilding")
+            manifest = None
+
+    if manifest is None:
+        manifest = Manifest(tests_root, url_base)
+        rebuild = True
+        update = True
+
+    if rebuild or update:
+        logger.info("Updating manifest")
+        for retry in range(2):
+            try:
+                tree = vcs.get_tree(tests_root, manifest, manifest_path, cache_root,
+                                    working_copy, rebuild)
+                changed = manifest.update(tree, parallel)
+                break
+            except InvalidCacheError:
+                logger.warning("Manifest cache was invalid, doing a complete rebuild")
+                rebuild = True
+        else:
+            # If we didn't break there was an error
+            raise
+        if write_manifest and changed:
+            write(manifest, manifest_path)
+        tree.dump_caches()
+
+    return manifest
+
+
+def write(manifest, manifest_path):
+    # type: (Manifest, Text) -> None
+    dir_name = os.path.dirname(manifest_path)
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+    with atomic_write(manifest_path, overwrite=True) as f:
+        # Use ',' instead of the default ', ' separator to prevent trailing
+        # spaces: https://docs.python.org/2/library/json.html#json.dump
+        jsonlib.dump_dist(manifest.to_json(caller_owns_obj=True), f)
+        f.write("\n")
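
A note on the Manifest.update() contract in the diff above: the tree iterable yields (path, file_hash, updated) triples rather than SourceFile objects, so unchanged paths stay cheap to process. The sketch below drives update() by hand; in real use the triples come from vcs.get_tree(), so iter_tree here is a hypothetical stand-in, and the paths are illustrative.

    import os

    from tools.manifest.manifest import Manifest  # import path depends on checkout layout

    # Hypothetical replacement for vcs.get_tree(): walk the checkout and
    # mark every file as updated. Passing None as the hash makes update()
    # compute it from the file contents.
    def iter_tree(tests_root):
        for dirpath, dirnames, filenames in os.walk(tests_root):
            dirnames[:] = [d for d in dirnames if d != ".git"]
            for fn in filenames:
                rel = os.path.relpath(os.path.join(dirpath, fn), tests_root)
                yield rel, None, True  # updated=True: (re)compute this path

    manifest = Manifest("/path/to/wpt", url_base="/")
    changed = manifest.update(iter_tree("/path/to/wpt"), parallel=False)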
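The ownership flags on to_json() and from_json() trade a deepcopy for aliasing guarantees. A minimal round-trip sketch, assuming manifest is a populated Manifest instance as built above:

    # Default: the returned object is deep-copied, so mutating or
    # serializing it cannot corrupt the manifest's internal state.
    json_obj = manifest.to_json(caller_owns_obj=True)

    # Hand json_obj over to the new manifest (callee_owns_obj=True) to
    # skip the deepcopy; json_obj must not be touched afterwards.
    # types= limits loading to the named test types.
    restored = Manifest.from_json("/path/to/wpt",
                                  json_obj,
                                  types=["testharness"],
                                  callee_owns_obj=True)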
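load_and_update() is the intended entry point (the bare load() logs a warning pointing here): it loads an existing MANIFEST.json when the version and url_base match, rebuilds it otherwise, and optionally writes the result back. A usage sketch with illustrative paths:

    from tools.manifest.manifest import load_and_update

    manifest = load_and_update("/path/to/wpt",
                               "/path/to/wpt/MANIFEST.json",
                               "/",
                               parallel=True)

    # itertypes() yields (item_type, rel_path, tests) tuples, optionally
    # restricted to the given test types.
    for item_type, rel_path, tests in manifest.itertypes("testharness"):
        print(rel_path, len(tests))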