Diffstat (limited to 'testing/web-platform/tests/tools/manifest/manifest.py')
-rw-r--r-- | testing/web-platform/tests/tools/manifest/manifest.py | 449
1 file changed, 449 insertions, 0 deletions
diff --git a/testing/web-platform/tests/tools/manifest/manifest.py b/testing/web-platform/tests/tools/manifest/manifest.py
new file mode 100644
index 0000000000..4b7792ec00
--- /dev/null
+++ b/testing/web-platform/tests/tools/manifest/manifest.py
@@ -0,0 +1,449 @@
+import os
+import sys
+from atomicwrites import atomic_write
+from copy import deepcopy
+from multiprocessing import Pool, cpu_count
+
+from . import jsonlib
+from . import vcs
+from .item import (ConformanceCheckerTest,
+                   CrashTest,
+                   ManifestItem,
+                   ManualTest,
+                   PrintRefTest,
+                   RefTest,
+                   SupportFile,
+                   TestharnessTest,
+                   VisualTest,
+                   WebDriverSpecTest)
+from .log import get_logger
+from .sourcefile import SourceFile
+from .typedata import TypeData
+
+MYPY = False
+if MYPY:
+    # MYPY is set to True when run under Mypy.
+    from logging import Logger
+    from typing import Any
+    from typing import Container
+    from typing import Dict
+    from typing import IO
+    from typing import Iterator
+    from typing import Iterable
+    from typing import Optional
+    from typing import Set
+    from typing import Text
+    from typing import Tuple
+    from typing import Type
+    from typing import Union
+
+
+CURRENT_VERSION = 8  # type: int
+
+
+class ManifestError(Exception):
+    pass
+
+
+class ManifestVersionMismatch(ManifestError):
+    pass
+
+
+class InvalidCacheError(Exception):
+    pass
+
+
+item_classes = {"testharness": TestharnessTest,
+                "reftest": RefTest,
+                "print-reftest": PrintRefTest,
+                "crashtest": CrashTest,
+                "manual": ManualTest,
+                "wdspec": WebDriverSpecTest,
+                "conformancechecker": ConformanceCheckerTest,
+                "visual": VisualTest,
+                "support": SupportFile}  # type: Dict[Text, Type[ManifestItem]]
+
+
+def compute_manifest_items(source_file):
+    # type: (SourceFile) -> Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]
+    rel_path_parts = source_file.rel_path_parts
+    new_type, manifest_items = source_file.manifest_items()
+    file_hash = source_file.hash
+    return rel_path_parts, new_type, set(manifest_items), file_hash
+
+
+if MYPY:
+    ManifestDataType = Dict[Any, TypeData]
+else:
+    ManifestDataType = dict
+
+
+class ManifestData(ManifestDataType):
+    def __init__(self, manifest):
+        # type: (Manifest) -> None
+        """Dictionary subclass containing a TypeData instance for each test type,
+        keyed by type name"""
+        self.initialized = False  # type: bool
+        for key, value in item_classes.items():
+            self[key] = TypeData(manifest, value)
+        self.initialized = True
+        self.json_obj = None  # type: None
+
+    def __setitem__(self, key, value):
+        # type: (Text, TypeData) -> None
+        if self.initialized:
+            raise AttributeError
+        dict.__setitem__(self, key, value)
+
+    def paths(self):
+        # type: () -> Set[Text]
+        """Get a set of all paths containing test items
+        without actually constructing all the items"""
+        rv = set()  # type: Set[Text]
+        for item_data in self.values():
+            for item in item_data:
+                rv.add(os.path.sep.join(item))
+        return rv
+
+    def type_by_path(self):
+        # type: () -> Dict[Tuple[Text, ...], Text]
+        rv = {}
+        for item_type, item_data in self.items():
+            for item in item_data:
+                rv[item] = item_type
+        return rv
+
+
+class Manifest:
+    def __init__(self, tests_root, url_base="/"):
+        # type: (Text, Text) -> None
+        assert url_base is not None
+        self._data = ManifestData(self)  # type: ManifestData
+        self.tests_root = tests_root  # type: Text
+        self.url_base = url_base  # type: Text
+
+    def __iter__(self):
+        # type: () -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
+        return self.itertypes()
+
+    def itertypes(self, *types):
+        # type: (*Text) -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
+        for item_type in (types or sorted(self._data.keys())):
+            for path in self._data[item_type]:
+                rel_path = os.sep.join(path)
+                tests = self._data[item_type][path]
+                yield item_type, rel_path, tests
+
+    def iterpath(self, path):
+        # type: (Text) -> Iterable[ManifestItem]
+        tpath = tuple(path.split(os.path.sep))
+
+        for type_tests in self._data.values():
+            i = type_tests.get(tpath, set())
+            assert i is not None
+            yield from i
+
+    def iterdir(self, dir_name):
+        # type: (Text) -> Iterable[ManifestItem]
+        tpath = tuple(dir_name.split(os.path.sep))
+        tpath_len = len(tpath)
+
+        for type_tests in self._data.values():
+            for path, tests in type_tests.items():
+                if path[:tpath_len] == tpath:
+                    yield from tests
+
+    def update(self, tree, parallel=True):
+        # type: (Iterable[Tuple[Text, Optional[Text], bool]], bool) -> bool
+        """Update the manifest given an iterable of items that make up the updated manifest.
+
+        The iterable must generate tuples of the form (path, file_hash, updated),
+        where updated is True for paths that are to be updated and False for
+        paths that are not. This unusual API is designed as an optimisation,
+        meaning that SourceFile items need not be constructed when we are not
+        updating a path, but the absence of an item from the iterator may
+        still be used to remove defunct entries from the manifest."""
+
+        logger = get_logger()
+
+        changed = False
+
+        # Create local variable references to these dicts so we avoid the
+        # attribute access in the hot loop below
+        data = self._data
+
+        types = data.type_by_path()
+        remaining_manifest_paths = set(types)
+
+        to_update = []
+
+        for path, file_hash, updated in tree:
+            path_parts = tuple(path.split(os.path.sep))
+            is_new = path_parts not in remaining_manifest_paths
+
+            if not updated and is_new:
+                # This is kind of a bandaid; if we ended up here the cache
+                # was invalid but we've been using it anyway. That's obviously
+                # bad; we should fix the underlying issue that we sometimes
+                # use an invalid cache. But at least this fixes the immediate
+                # problem
+                raise InvalidCacheError
+
+            if not updated:
+                remaining_manifest_paths.remove(path_parts)
+            else:
+                assert self.tests_root is not None
+                source_file = SourceFile(self.tests_root,
+                                         path,
+                                         self.url_base,
+                                         file_hash)
+
+                hash_changed = False  # type: bool
+
+                if not is_new:
+                    if file_hash is None:
+                        file_hash = source_file.hash
+                    remaining_manifest_paths.remove(path_parts)
+                    old_type = types[path_parts]
+                    old_hash = data[old_type].hashes[path_parts]
+                    if old_hash != file_hash:
+                        hash_changed = True
+                        del data[old_type][path_parts]
+
+                if is_new or hash_changed:
+                    to_update.append(source_file)
+
+        if to_update:
+            logger.debug("Computing manifest update for %s items" % len(to_update))
+            changed = True
+
+        # 25 items was derived experimentally (2020-01) to be approximately the
+        # point at which it is quicker to create a Pool and parallelize update.
+        pool = None
+        if parallel and len(to_update) > 25 and cpu_count() > 1:
+            # On Python 3 on Windows, using >= MAXIMUM_WAIT_OBJECTS processes
+            # causes a crash in the multiprocessing module. Whilst this enum
+            # can technically have any value, it is usually 64. For safety,
+            # restrict manifest regeneration to 48 processes on Windows.
+            #
+            # See https://bugs.python.org/issue26903 and https://bugs.python.org/issue40263
+            processes = cpu_count()
+            if sys.platform == "win32" and processes > 48:
+                processes = 48
+            pool = Pool(processes)
+
+            # chunksize set > 1 when more than 10000 tests, because
+            # chunking is a net-gain once we get to very large numbers
+            # of items (again, experimentally, 2020-01)
+            chunksize = max(1, len(to_update) // 10000)
+            logger.debug("Doing a multiprocessed update. CPU count: %s, "
+                         "processes: %s, chunksize: %s" % (cpu_count(), processes, chunksize))
+            results = pool.imap_unordered(compute_manifest_items,
+                                          to_update,
+                                          chunksize=chunksize
+                                          )  # type: Iterator[Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]]
+        else:
+            results = map(compute_manifest_items, to_update)
+
+        for result in results:
+            rel_path_parts, new_type, manifest_items, file_hash = result
+            data[new_type][rel_path_parts] = manifest_items
+            data[new_type].hashes[rel_path_parts] = file_hash
+
+        # Make sure to terminate the Pool, to avoid hangs on Python 3.
+        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
+        if pool is not None:
+            pool.terminate()
+
+        if remaining_manifest_paths:
+            changed = True
+            for rel_path_parts in remaining_manifest_paths:
+                for test_data in data.values():
+                    if rel_path_parts in test_data:
+                        del test_data[rel_path_parts]
+
+        return changed
+
+    def to_json(self, caller_owns_obj=True):
+        # type: (bool) -> Dict[Text, Any]
+        """Dump a manifest into an object which can be serialized as JSON
+
+        If caller_owns_obj is False, then the return value remains
+        owned by the manifest; it is _vitally important_ that no
+        operation (not even a read) is performed on the manifest, as
+        otherwise objects within the object graph rooted at the return
+        value can be mutated. This makes the mode very dangerous, and
+        it should only be used with extreme care.
+
+        """
+        out_items = {
+            test_type: type_paths.to_json()
+            for test_type, type_paths in self._data.items() if type_paths
+        }
+
+        if caller_owns_obj:
+            out_items = deepcopy(out_items)
+
+        rv = {"url_base": self.url_base,
+              "items": out_items,
+              "version": CURRENT_VERSION}  # type: Dict[Text, Any]
+        return rv
+
+    @classmethod
+    def from_json(cls, tests_root, obj, types=None, callee_owns_obj=False):
+        # type: (Text, Dict[Text, Any], Optional[Container[Text]], bool) -> Manifest
+        """Load a manifest from a JSON object
+
+        This loads a manifest for a given local tests_root path from an
+        object obj, potentially partially loading it to only load the
+        types given by types.
+
+        If callee_owns_obj is True, then ownership of obj transfers
+        to this function when called, and the caller must never mutate
+        the obj or anything referred to in the object graph rooted at
+        obj.
+ + """ + version = obj.get("version") + if version != CURRENT_VERSION: + raise ManifestVersionMismatch + + self = cls(tests_root, url_base=obj.get("url_base", "/")) + if not hasattr(obj, "items"): + raise ManifestError + + for test_type, type_paths in obj["items"].items(): + if test_type not in item_classes: + raise ManifestError + + if types and test_type not in types: + continue + + if not callee_owns_obj: + type_paths = deepcopy(type_paths) + + self._data[test_type].set_json(type_paths) + + return self + + +def load(tests_root, manifest, types=None): + # type: (Text, Union[IO[bytes], Text], Optional[Container[Text]]) -> Optional[Manifest] + logger = get_logger() + + logger.warning("Prefer load_and_update instead") + return _load(logger, tests_root, manifest, types) + + +__load_cache = {} # type: Dict[Text, Manifest] + + +def _load(logger, # type: Logger + tests_root, # type: Text + manifest, # type: Union[IO[bytes], Text] + types=None, # type: Optional[Container[Text]] + allow_cached=True # type: bool + ): + # type: (...) -> Optional[Manifest] + manifest_path = (manifest if isinstance(manifest, str) + else manifest.name) + if allow_cached and manifest_path in __load_cache: + return __load_cache[manifest_path] + + if isinstance(manifest, str): + if os.path.exists(manifest): + logger.debug("Opening manifest at %s" % manifest) + else: + logger.debug("Creating new manifest at %s" % manifest) + try: + with open(manifest, encoding="utf-8") as f: + rv = Manifest.from_json(tests_root, + jsonlib.load(f), + types=types, + callee_owns_obj=True) + except OSError: + return None + except ValueError: + logger.warning("%r may be corrupted", manifest) + return None + else: + rv = Manifest.from_json(tests_root, + jsonlib.load(manifest), + types=types, + callee_owns_obj=True) + + if allow_cached: + __load_cache[manifest_path] = rv + return rv + + +def load_and_update(tests_root, # type: Text + manifest_path, # type: Text + url_base, # type: Text + update=True, # type: bool + rebuild=False, # type: bool + metadata_path=None, # type: Optional[Text] + cache_root=None, # type: Optional[Text] + working_copy=True, # type: bool + types=None, # type: Optional[Container[Text]] + write_manifest=True, # type: bool + allow_cached=True, # type: bool + parallel=True # type: bool + ): + # type: (...) 
+
+
+def load_and_update(tests_root,  # type: Text
+                    manifest_path,  # type: Text
+                    url_base,  # type: Text
+                    update=True,  # type: bool
+                    rebuild=False,  # type: bool
+                    metadata_path=None,  # type: Optional[Text]
+                    cache_root=None,  # type: Optional[Text]
+                    working_copy=True,  # type: bool
+                    types=None,  # type: Optional[Container[Text]]
+                    write_manifest=True,  # type: bool
+                    allow_cached=True,  # type: bool
+                    parallel=True  # type: bool
+                    ):
+    # type: (...) -> Manifest
+
+    logger = get_logger()
+
+    manifest = None
+    if not rebuild:
+        try:
+            manifest = _load(logger,
+                             tests_root,
+                             manifest_path,
+                             types=types,
+                             allow_cached=allow_cached)
+        except ManifestVersionMismatch:
+            logger.info("Manifest version changed, rebuilding")
+        except ManifestError:
+            logger.warning("Failed to load manifest, rebuilding")
+
+        if manifest is not None and manifest.url_base != url_base:
+            logger.info("Manifest url base did not match, rebuilding")
+            manifest = None
+
+    if manifest is None:
+        manifest = Manifest(tests_root, url_base)
+        rebuild = True
+        update = True
+
+    if rebuild or update:
+        logger.info("Updating manifest")
+        for retry in range(2):
+            try:
+                tree = vcs.get_tree(tests_root, manifest, manifest_path, cache_root,
+                                    working_copy, rebuild)
+                changed = manifest.update(tree, parallel)
+                break
+            except InvalidCacheError:
+                logger.warning("Manifest cache was invalid, doing a complete rebuild")
+                rebuild = True
+        else:
+            # If we didn't break there was an error
+            raise
+        if write_manifest and changed:
+            write(manifest, manifest_path)
+        tree.dump_caches()
+
+    return manifest
+
+
+def write(manifest, manifest_path):
+    # type: (Manifest, Text) -> None
+    dir_name = os.path.dirname(manifest_path)
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+    with atomic_write(manifest_path, overwrite=True) as f:
+        # Use ',' instead of the default ', ' separator to prevent trailing
+        # spaces: https://docs.python.org/2/library/json.html#json.dump
+        jsonlib.dump_dist(manifest.to_json(caller_owns_obj=True), f)
+        f.write("\n")
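
A note on the Manifest.update() contract in the diff above: the tree iterable yields (path, file_hash, updated) triples rather than SourceFile objects, so unchanged paths stay cheap to process. The sketch below drives update() by hand; in real use the triples come from vcs.get_tree(), so iter_tree here is a hypothetical stand-in, and the paths are illustrative.

    import os

    from tools.manifest.manifest import Manifest  # import path depends on checkout layout

    # Hypothetical replacement for vcs.get_tree(): walk the checkout and
    # mark every file as updated. Passing None as the hash makes update()
    # compute it from the file contents.
    def iter_tree(tests_root):
        for dirpath, dirnames, filenames in os.walk(tests_root):
            dirnames[:] = [d for d in dirnames if d != ".git"]
            for fn in filenames:
                rel = os.path.relpath(os.path.join(dirpath, fn), tests_root)
                yield rel, None, True  # updated=True: (re)compute this path

    manifest = Manifest("/path/to/wpt", url_base="/")
    changed = manifest.update(iter_tree("/path/to/wpt"), parallel=False)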
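The ownership flags on to_json() and from_json() trade a deepcopy for aliasing guarantees. A minimal round-trip sketch, assuming manifest is a populated Manifest instance as built above:

    # Default: the returned object is deep-copied, so mutating or
    # serializing it cannot corrupt the manifest's internal state.
    json_obj = manifest.to_json(caller_owns_obj=True)

    # Hand json_obj over to the new manifest (callee_owns_obj=True) to
    # skip the deepcopy; json_obj must not be touched afterwards.
    # types= limits loading to the named test types.
    restored = Manifest.from_json("/path/to/wpt",
                                  json_obj,
                                  types=["testharness"],
                                  callee_owns_obj=True)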
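load_and_update() is the intended entry point (the bare load() logs a warning pointing here): it loads an existing MANIFEST.json when the version and url_base match, rebuilds it otherwise, and optionally writes the result back. A usage sketch with illustrative paths:

    from tools.manifest.manifest import load_and_update

    manifest = load_and_update("/path/to/wpt",
                               "/path/to/wpt/MANIFEST.json",
                               "/",
                               parallel=True)

    # itertypes() yields (item_type, rel_path, tests) tuples, optionally
    # restricted to the given test types.
    for item_type, rel_path, tests in manifest.itertypes("testharness"):
        print(rel_path, len(tests))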