testing/web-platform/tests/tools/manifest/manifest.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449

import os
import sys
from atomicwrites import atomic_write
from copy import deepcopy
from multiprocessing import Pool, cpu_count

from . import jsonlib
from . import vcs
from .item import (ConformanceCheckerTest,
                   CrashTest,
                   ManifestItem,
                   ManualTest,
                   PrintRefTest,
                   RefTest,
                   SupportFile,
                   TestharnessTest,
                   VisualTest,
                   WebDriverSpecTest)
from .log import get_logger
from .sourcefile import SourceFile
from .typedata import TypeData

MYPY = False
if MYPY:
    # MYPY is set to True when run under Mypy.
    from logging import Logger
    from typing import Any
    from typing import Container
    from typing import Dict
    from typing import IO
    from typing import Iterator
    from typing import Iterable
    from typing import Optional
    from typing import Set
    from typing import Text
    from typing import Tuple
    from typing import Type
    from typing import Union


# Manifest format version. Manifest.to_json writes it under the "version" key
# and Manifest.from_json raises ManifestVersionMismatch for any other value.
CURRENT_VERSION = 8  # type: int


class ManifestError(Exception):
    """Raised when a JSON object cannot be loaded as a manifest.

    Also serves as the base class of ManifestVersionMismatch.
    """


class ManifestVersionMismatch(ManifestError):
    """Raised by Manifest.from_json when the JSON's "version" is not CURRENT_VERSION."""


class InvalidCacheError(Exception):
    """Raised by Manifest.update when the tree reports a path as not updated
    although the manifest holds no entry for that path."""


# Registry mapping each manifest test-type name to the ManifestItem subclass
# used for items of that type. ManifestData creates one TypeData per entry,
# and Manifest.from_json rejects any type name that is not a key here.
item_classes = {
    "testharness": TestharnessTest,
    "reftest": RefTest,
    "print-reftest": PrintRefTest,
    "crashtest": CrashTest,
    "manual": ManualTest,
    "wdspec": WebDriverSpecTest,
    "conformancechecker": ConformanceCheckerTest,
    "visual": VisualTest,
    "support": SupportFile,
}  # type: Dict[Text, Type[ManifestItem]]


def compute_manifest_items(source_file):
    # type: (SourceFile) -> Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]
    """Collect everything Manifest.update needs to store for one source file.

    Kept as a module-level function so it can be handed to
    ``Pool.imap_unordered`` as well as to the builtin ``map``.

    Returns a ``(rel_path_parts, item_type, items, file_hash)`` tuple, where
    ``items`` is the set built from what ``source_file.manifest_items()``
    produced.
    """
    path_parts = source_file.rel_path_parts
    item_type, items = source_file.manifest_items()
    digest = source_file.hash
    return (path_parts, item_type, set(items), digest)


# Base class for ManifestData: a precisely parameterised Dict when running
# under Mypy, and the plain builtin dict at runtime (where MYPY is False).
if MYPY:
    ManifestDataType = Dict[Any, TypeData]
else:
    ManifestDataType = dict


class ManifestData(ManifestDataType):
    """Dictionary subclass containing a TypeData instance for each test type,
    keyed by type name.

    One entry is created per key of ``item_classes`` during construction;
    afterwards item assignment is refused with AttributeError.
    """

    def __init__(self, manifest):
        # type: (Manifest) -> None
        # The flag must be False while the per-type entries are inserted,
        # because __setitem__ rejects writes once it is True.
        self.initialized = False  # type: bool
        for type_name, item_cls in item_classes.items():
            self[type_name] = TypeData(manifest, item_cls)
        self.initialized = True
        self.json_obj = None  # type: None

    def __setitem__(self, key, value):
        # type: (Text, TypeData) -> None
        if not self.initialized:
            dict.__setitem__(self, key, value)
            return
        raise AttributeError

    def paths(self):
        # type: () -> Set[Text]
        """Return the set of all paths containing test items, without
        actually constructing the items.

        Each path is the key tuple joined with ``os.path.sep``.
        """
        sep = os.path.sep
        return {sep.join(parts)
                for type_data in self.values()
                for parts in type_data}

    def type_by_path(self):
        # type: () -> Dict[Tuple[Text, ...], Text]
        """Return a dict mapping every path key to the type name it is stored under."""
        return {parts: type_name
                for type_name, type_data in self.items()
                for parts in type_data}


class Manifest:
    """In-memory test manifest: test items grouped by type, then by path.

    ``_data`` is a ManifestData mapping each test type name to the container
    holding that type's items. Those containers are keyed by path tuples,
    i.e. the file's relative path split on ``os.path.sep``.
    """

    def __init__(self, tests_root, url_base="/"):
        # type: (Text, Text) -> None
        assert url_base is not None
        self._data = ManifestData(self)  # type: ManifestData
        self.tests_root = tests_root  # type: Text
        self.url_base = url_base  # type: Text

    def __iter__(self):
        # type: () -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
        """Iterate over every type; equivalent to ``itertypes()`` with no arguments."""
        return self.itertypes()

    def itertypes(self, *types):
        # type: (*Text) -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
        """Yield ``(item_type, rel_path, tests)`` for each path of the given types.

        With no arguments all types are visited, in sorted order of type name.
        ``rel_path`` is the path key joined with ``os.sep``.
        """
        for item_type in (types or sorted(self._data.keys())):
            for path in self._data[item_type]:
                rel_path = os.sep.join(path)
                tests = self._data[item_type][path]
                yield item_type, rel_path, tests

    def iterpath(self, path):
        # type: (Text) -> Iterable[ManifestItem]
        """Yield the items of every type stored under exactly ``path``."""
        tpath = tuple(path.split(os.path.sep))

        for type_tests in self._data.values():
            i = type_tests.get(tpath, set())
            assert i is not None
            yield from i

    def iterdir(self, dir_name):
        # type: (Text) -> Iterable[ManifestItem]
        """Yield the items of every type whose path starts with ``dir_name``.

        The comparison is made on whole path components, not on string prefixes.
        """
        tpath = tuple(dir_name.split(os.path.sep))
        tpath_len = len(tpath)

        for type_tests in self._data.values():
            for path, tests in type_tests.items():
                if path[:tpath_len] == tpath:
                    yield from tests

    def update(self, tree, parallel=True):
        # type: (Iterable[Tuple[Text, Optional[Text], bool]], bool) -> bool
        """Update the manifest given an iterable describing the current tree.

        The iterable must generate ``(path, file_hash, updated)`` tuples.
        When ``updated`` is true a SourceFile is constructed for ``path`` and
        its items are recomputed if the path is new to the manifest or its
        hash differs from the stored one (``file_hash`` may be None, in which
        case the hash is taken from the SourceFile). When ``updated`` is false
        the existing entry is kept as it is. This unusual API is designed as
        an optimisation meaning that SourceFile objects need not be
        constructed for paths we are not updating, while the absence of a
        path from the iterable is used to remove defunct entries from the
        manifest.

        :param tree: iterable of ``(path, file_hash, updated)`` tuples.
        :param parallel: allow a multiprocessing Pool to be used when there
                         are enough items to make it worthwhile.
        :returns: True if any entry was recomputed or removed.
        :raises InvalidCacheError: if a path is reported as not updated but
                                   has no entry in the manifest.
        """

        logger = get_logger()

        changed = False

        # Create local variable references to these dicts so we avoid the
        # attribute access in the hot loop below
        data = self._data

        types = data.type_by_path()
        remaining_manifest_paths = set(types)

        to_update = []

        for path, file_hash, updated in tree:
            path_parts = tuple(path.split(os.path.sep))
            is_new = path_parts not in remaining_manifest_paths

            if not updated and is_new:
                # This is kind of a bandaid; if we ended up here the cache
                # was invalid but we've been using it anyway. That's obviously
                # bad; we should fix the underlying issue that we sometimes
                # use an invalid cache. But at least this fixes the immediate
                # problem
                raise InvalidCacheError

            if not updated:
                remaining_manifest_paths.remove(path_parts)
            else:
                assert self.tests_root is not None
                source_file = SourceFile(self.tests_root,
                                         path,
                                         self.url_base,
                                         file_hash)

                hash_changed = False  # type: bool

                if not is_new:
                    if file_hash is None:
                        file_hash = source_file.hash
                    remaining_manifest_paths.remove(path_parts)
                    old_type = types[path_parts]
                    old_hash = data[old_type].hashes[path_parts]
                    if old_hash != file_hash:
                        hash_changed = True
                        # Drop the stale entry now; the recomputed items may
                        # end up under a different type.
                        del data[old_type][path_parts]

                if is_new or hash_changed:
                    to_update.append(source_file)

        if to_update:
            logger.debug("Computing manifest update for %s items", len(to_update))
            changed = True

        # 25 items was derived experimentally (2020-01) to be approximately the
        # point at which it is quicker to create a Pool and parallelize update.
        pool = None
        try:
            if parallel and len(to_update) > 25 and cpu_count() > 1:
                # On Python 3 on Windows, using >= MAXIMUM_WAIT_OBJECTS processes
                # causes a crash in the multiprocessing module. Whilst this enum
                # can technically have any value, it is usually 64. For safety,
                # restrict manifest regeneration to 48 processes on Windows.
                #
                # See https://bugs.python.org/issue26903 and https://bugs.python.org/issue40263
                processes = cpu_count()
                if sys.platform == "win32" and processes > 48:
                    processes = 48
                pool = Pool(processes)

                # chunksize set > 1 when more than 10000 tests, because
                # chunking is a net-gain once we get to very large numbers
                # of items (again, experimentally, 2020-01)
                chunksize = max(1, len(to_update) // 10000)
                logger.debug("Doing a multiprocessed update. CPU count: %s, "
                             "processes: %s, chunksize: %s",
                             cpu_count(), processes, chunksize)
                results = pool.imap_unordered(compute_manifest_items,
                                              to_update,
                                              chunksize=chunksize
                                              )  # type: Iterator[Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]]
            else:
                results = map(compute_manifest_items, to_update)

            for result in results:
                rel_path_parts, new_type, manifest_items, file_hash = result
                data[new_type][rel_path_parts] = manifest_items
                data[new_type].hashes[rel_path_parts] = file_hash
        finally:
            # Make sure to terminate the Pool, to avoid hangs on Python 3.
            # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
            # Doing it in ``finally`` also covers the case where computing
            # or storing a result raises part-way through.
            if pool is not None:
                pool.terminate()

        if remaining_manifest_paths:
            # Whatever the tree did not mention no longer exists.
            changed = True
            for rel_path_parts in remaining_manifest_paths:
                for test_data in data.values():
                    if rel_path_parts in test_data:
                        del test_data[rel_path_parts]

        return changed

    def to_json(self, caller_owns_obj=True):
        # type: (bool) -> Dict[Text, Any]
        """Dump a manifest into a object which can be serialized as JSON

        If caller_owns_obj is False, then the return value remains
        owned by the manifest; it is _vitally important_ that _no_
        (even read) operation is done on the manifest, as otherwise
        objects within the object graph rooted at the return value can
        be mutated. This essentially makes this mode very dangerous
        and only to be used under extreme care.

        Types with no entries are left out of the "items" mapping.
        """
        out_items = {
            test_type: type_paths.to_json()
            for test_type, type_paths in self._data.items() if type_paths
        }

        if caller_owns_obj:
            out_items = deepcopy(out_items)

        rv = {"url_base": self.url_base,
              "items": out_items,
              "version": CURRENT_VERSION}  # type: Dict[Text, Any]
        return rv

    @classmethod
    def from_json(cls, tests_root, obj, types=None, callee_owns_obj=False):
        # type: (Text, Dict[Text, Any], Optional[Container[Text]], bool) -> Manifest
        """Load a manifest from a JSON object

        This loads a manifest for a given local test_root path from an
        object obj, potentially partially loading it to only load the
        types given by types.

        If callee_owns_obj is True, then ownership of obj transfers
        to this function when called, and the caller must never mutate
        the obj or anything referred to in the object graph rooted at
        obj.

        :raises ManifestVersionMismatch: if obj's "version" is not
                                         CURRENT_VERSION.
        :raises ManifestError: if obj has no mapping under "items", or if
                               "items" names an unknown test type.
        """
        version = obj.get("version")
        if version != CURRENT_VERSION:
            raise ManifestVersionMismatch

        self = cls(tests_root, url_base=obj.get("url_base", "/"))

        # Validate the "items" entry itself: a missing or non-mapping value
        # must surface as ManifestError (which callers handle by rebuilding)
        # rather than as a stray KeyError/AttributeError.
        items = obj.get("items")
        if not hasattr(items, "items"):
            raise ManifestError

        for test_type, type_paths in items.items():
            if test_type not in item_classes:
                raise ManifestError

            if types and test_type not in types:
                continue

            if not callee_owns_obj:
                type_paths = deepcopy(type_paths)

            self._data[test_type].set_json(type_paths)

        return self


def load(tests_root, manifest, types=None):
    # type: (Text, Union[IO[bytes], Text], Optional[Container[Text]]) -> Optional[Manifest]
    """Load a manifest without updating it.

    Logs a warning recommending load_and_update on every call, then
    delegates to _load with its default caching behaviour.
    """
    log = get_logger()
    log.warning("Prefer load_and_update instead")
    return _load(log, tests_root, manifest, types=types)


# Manifests already returned by _load, keyed by manifest path only: the
# tests_root and types arguments of _load are not part of the key.
__load_cache = {}  # type: Dict[Text, Manifest]


def _load(logger,  # type: Logger
          tests_root,  # type: Text
          manifest,  # type: Union[IO[bytes], Text]
          types=None,  # type: Optional[Container[Text]]
          allow_cached=True  # type: bool
          ):
    # type: (...) -> Optional[Manifest]
    """Load a manifest from a path or from an already-open file object.

    ``manifest`` is either a path string or a file object exposing ``name``.
    With ``allow_cached`` a manifest previously loaded for the same path is
    returned as-is, and a freshly loaded one is remembered for later calls.

    Returns None when ``manifest`` is a path that cannot be opened or whose
    content raises ValueError while being loaded. Errors raised for a file
    object, and ManifestError in either case, propagate to the caller.
    """
    if isinstance(manifest, str):
        manifest_path = manifest
    else:
        manifest_path = manifest.name

    if allow_cached and manifest_path in __load_cache:
        return __load_cache[manifest_path]

    if not isinstance(manifest, str):
        loaded = Manifest.from_json(tests_root,
                                    jsonlib.load(manifest),
                                    types=types,
                                    callee_owns_obj=True)
    else:
        # The existence check only selects the log message; a missing file
        # is handled by the OSError branch below.
        template = ("Opening manifest at %s" if os.path.exists(manifest)
                    else "Creating new manifest at %s")
        logger.debug(template % manifest)
        try:
            with open(manifest, encoding="utf-8") as f:
                loaded = Manifest.from_json(tests_root,
                                            jsonlib.load(f),
                                            types=types,
                                            callee_owns_obj=True)
        except OSError:
            return None
        except ValueError:
            logger.warning("%r may be corrupted", manifest)
            return None

    if allow_cached:
        __load_cache[manifest_path] = loaded
    return loaded


def load_and_update(tests_root,  # type: Text
                    manifest_path,  # type: Text
                    url_base,  # type: Text
                    update=True,  # type: bool
                    rebuild=False,  # type: bool
                    metadata_path=None,  # type: Optional[Text]
                    cache_root=None,  # type: Optional[Text]
                    working_copy=True,  # type: bool
                    types=None,  # type: Optional[Container[Text]]
                    write_manifest=True,  # type: bool
                    allow_cached=True,  # type: bool
                    parallel=True  # type: bool
                    ):
    # type: (...) -> Manifest
    """Load the manifest at ``manifest_path`` and bring it up to date.

    The existing manifest is loaded unless ``rebuild`` is set. If it cannot
    be loaded, has a different format version, or was generated for a
    different ``url_base``, a fresh Manifest is created and a full rebuild
    is forced. The manifest is then updated from the tree returned by
    ``vcs.get_tree`` when ``update`` or ``rebuild`` applies.

    :param update: update the loaded manifest from the tree.
    :param rebuild: ignore any existing manifest and start from scratch.
    :param metadata_path: accepted but not used by this function.
    :param cache_root: passed through to ``vcs.get_tree``.
    :param working_copy: passed through to ``vcs.get_tree``.
    :param types: passed to the loader to restrict which test types are
                  loaded.
    :param write_manifest: write the manifest back to ``manifest_path``
                           when the update changed it.
    :param allow_cached: passed to the loader to allow use of its cache.
    :param parallel: passed through to ``Manifest.update``.
    :returns: the loaded (and possibly updated) manifest.
    :raises InvalidCacheError: if the update still fails with an invalid
                               cache after retrying with a complete rebuild.
    """

    logger = get_logger()

    manifest = None
    if not rebuild:
        try:
            manifest = _load(logger,
                             tests_root,
                             manifest_path,
                             types=types,
                             allow_cached=allow_cached)
        except ManifestVersionMismatch:
            logger.info("Manifest version changed, rebuilding")
        except ManifestError:
            logger.warning("Failed to load manifest, rebuilding")

        if manifest is not None and manifest.url_base != url_base:
            logger.info("Manifest url base did not match, rebuilding")
            manifest = None

    if manifest is None:
        manifest = Manifest(tests_root, url_base)
        rebuild = True
        update = True

    if rebuild or update:
        logger.info("Updating manifest")
        cache_error = None  # type: Optional[InvalidCacheError]
        for _ in range(2):
            try:
                tree = vcs.get_tree(tests_root, manifest, manifest_path, cache_root,
                                    working_copy, rebuild)
                changed = manifest.update(tree, parallel)
                break
            except InvalidCacheError as e:
                logger.warning("Manifest cache was invalid, doing a complete rebuild")
                rebuild = True
                # The name bound by ``except ... as`` is cleared when the
                # handler ends, so keep our own reference for re-raising.
                cache_error = e
        else:
            # If we didn't break, both attempts failed. A bare ``raise`` here
            # would itself fail with RuntimeError because no exception is
            # being handled any more, so re-raise the stored error.
            assert cache_error is not None
            raise cache_error
        if write_manifest and changed:
            write(manifest, manifest_path)
        tree.dump_caches()

    return manifest


def write(manifest, manifest_path):
    # type: (Manifest, Text) -> None
    """Serialize ``manifest`` to ``manifest_path``, followed by a newline.

    The parent directory is created if needed, and the file is written via
    ``atomic_write`` with ``overwrite=True``.
    """
    dir_name = os.path.dirname(manifest_path)
    # dirname is "" for a bare filename and os.makedirs("") raises, so only
    # create a directory when the path has one. exist_ok avoids failing if
    # the directory appears between a check and the creation.
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    with atomic_write(manifest_path, overwrite=True) as f:
        # NOTE(review): output formatting (e.g. compact separators) is left
        # entirely to jsonlib.dump_dist; nothing is configured here.
        jsonlib.dump_dist(manifest.to_json(caller_owns_obj=True), f)
        f.write("\n")