1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
|
import os
import sys
from atomicwrites import atomic_write
from copy import deepcopy
from multiprocessing import Pool, cpu_count
from . import jsonlib
from . import vcs
from .item import (ConformanceCheckerTest,
CrashTest,
ManifestItem,
ManualTest,
PrintRefTest,
RefTest,
SupportFile,
TestharnessTest,
VisualTest,
WebDriverSpecTest)
from .log import get_logger
from .sourcefile import SourceFile
from .typedata import TypeData
# Static-analysis switch: always False at runtime, so the imports guarded by
# it are only ever seen by the type checker.
MYPY = False
if MYPY:
    # MYPY is set to True when run under Mypy.
    from logging import Logger
    from typing import (Any,
                        Container,
                        Dict,
                        IO,
                        Iterator,
                        Iterable,
                        Optional,
                        Set,
                        Text,
                        Tuple,
                        Type,
                        Union)

# Version stamp written into serialized manifests and checked when loading.
CURRENT_VERSION = 8  # type: int
class ManifestError(Exception):
    """Base error for problems with the content of a manifest."""
class ManifestVersionMismatch(ManifestError):
    """Raised when a manifest's version differs from CURRENT_VERSION."""
class InvalidCacheError(Exception):
    """Raised when an update finds a path reported as unchanged that the
    manifest does not contain."""
# Maps each manifest type name to the ManifestItem subclass that represents
# entries of that type.
item_classes = {
    "testharness": TestharnessTest,
    "reftest": RefTest,
    "print-reftest": PrintRefTest,
    "crashtest": CrashTest,
    "manual": ManualTest,
    "wdspec": WebDriverSpecTest,
    "conformancechecker": ConformanceCheckerTest,
    "visual": VisualTest,
    "support": SupportFile,
}  # type: Dict[Text, Type[ManifestItem]]
def compute_manifest_items(source_file):
    # type: (SourceFile) -> Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]
    """Collect the manifest data for a single source file.

    Returns a 4-tuple of the file's relative path parts, the item type
    reported by ``source_file.manifest_items()``, the items from that call
    as a set, and the file's hash.
    """
    path_parts = source_file.rel_path_parts
    item_type, items = source_file.manifest_items()
    digest = source_file.hash
    return path_parts, item_type, set(items), digest
# Under the type checker the base class is a parametrized Dict; at runtime it
# is the plain builtin dict.
if MYPY:
    ManifestDataType = Dict[Any, TypeData]
else:
    ManifestDataType = dict


class ManifestData(ManifestDataType):
    """Mapping from test type name to the TypeData holding that type's items.

    Keys are fixed at construction time: once ``__init__`` has finished,
    assigning to any key raises AttributeError.
    """

    def __init__(self, manifest):
        # type: (Manifest) -> None
        """Create one TypeData per entry in ``item_classes``, keyed by the
        type name, then lock the mapping against further assignment."""
        self.initialized = False  # type: bool
        for type_name, item_cls in item_classes.items():
            self[type_name] = TypeData(manifest, item_cls)
        self.initialized = True
        self.json_obj = None  # type: None

    def __setitem__(self, key, value):
        # type: (Text, TypeData) -> None
        # Assignment is only permitted while __init__ is populating the dict.
        if not self.initialized:
            dict.__setitem__(self, key, value)
            return
        raise AttributeError

    def paths(self):
        # type: () -> Set[Text]
        """Return the set of all paths that have entries, each joined with
        the OS path separator, by iterating the keys of every TypeData."""
        return {
            os.path.sep.join(path_parts)
            for type_data in self.values()
            for path_parts in type_data
        }

    def type_by_path(self):
        # type: () -> Dict[Tuple[Text, ...], Text]
        """Return a dict mapping each path-parts key to the name of the type
        it is stored under (the last type seen wins for duplicate keys)."""
        return {
            path_parts: type_name
            for type_name, type_data in self.items()
            for path_parts in type_data
        }
class Manifest:
    """In-memory manifest: test items grouped by type and by path.

    Items live in a ManifestData mapping (``self._data``) keyed by type name;
    each value is keyed by a tuple of path components.
    """

    def __init__(self, tests_root, url_base="/"):
        # type: (Text, Text) -> None
        """Create an empty manifest for the tests under ``tests_root``.

        :param tests_root: root path handed to SourceFile during updates
        :param url_base: URL prefix stored in the manifest; must not be None
        """
        assert url_base is not None
        self._data = ManifestData(self)  # type: ManifestData
        self.tests_root = tests_root  # type: Text
        self.url_base = url_base  # type: Text

    def __iter__(self):
        # type: () -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
        """Iterate over every type; equivalent to ``itertypes()``."""
        return self.itertypes()

    def itertypes(self, *types):
        # type: (*Text) -> Iterator[Tuple[Text, Text, Set[ManifestItem]]]
        """Yield ``(item_type, rel_path, tests)`` for each path of each type.

        If ``types`` are given they are visited in the order supplied;
        otherwise all types are visited in sorted name order. ``rel_path`` is
        the path key joined with ``os.sep``.
        """
        for item_type in (types or sorted(self._data.keys())):
            for path in self._data[item_type]:
                rel_path = os.sep.join(path)
                tests = self._data[item_type][path]
                yield item_type, rel_path, tests

    def iterpath(self, path):
        # type: (Text) -> Iterable[ManifestItem]
        """Yield every item, across all types, stored under exactly ``path``
        (a string using the OS path separator)."""
        tpath = tuple(path.split(os.path.sep))

        for type_tests in self._data.values():
            i = type_tests.get(tpath, set())
            assert i is not None
            yield from i

    def iterdir(self, dir_name):
        # type: (Text) -> Iterable[ManifestItem]
        """Yield every item, across all types, whose path key starts with the
        components of ``dir_name`` (a string using the OS path separator)."""
        tpath = tuple(dir_name.split(os.path.sep))
        tpath_len = len(tpath)

        for type_tests in self._data.values():
            for path, tests in type_tests.items():
                if path[:tpath_len] == tpath:
                    yield from tests

    def update(self, tree, parallel=True):
        # type: (Iterable[Tuple[Text, Optional[Text], bool]], bool) -> bool
        """Update the manifest given an iterable of items that make up the updated manifest.

        The iterable must generate tuples of the form ``(path, file_hash, updated)``.
        ``updated`` is True for paths that are to be (re)examined and False for paths
        that are to be kept as they are. This unusual API is designed as an
        optimisation meaning that SourceFile items need not be constructed in the
        case we are not updating a path, but the absence of an item from the
        iterator may be used to remove defunct entries from the manifest.

        :param tree: iterable of ``(path, file_hash, updated)`` tuples
        :param parallel: allow a multiprocessing Pool for large updates
        :returns: True if any entry was added, recomputed or removed
        :raises InvalidCacheError: if a path is reported as not updated but is
            not present in the manifest
        """
        logger = get_logger()

        changed = False

        # Create local variable references to these dicts so we avoid the
        # attribute access in the hot loop below
        data = self._data

        types = data.type_by_path()
        remaining_manifest_paths = set(types)

        to_update = []

        for path, file_hash, updated in tree:
            path_parts = tuple(path.split(os.path.sep))
            is_new = path_parts not in remaining_manifest_paths

            if not updated and is_new:
                # This is kind of a bandaid; if we ended up here the cache
                # was invalid but we've been using it anyway. That's obviously
                # bad; we should fix the underlying issue that we sometimes
                # use an invalid cache. But at least this fixes the immediate
                # problem
                raise InvalidCacheError

            if not updated:
                remaining_manifest_paths.remove(path_parts)
            else:
                assert self.tests_root is not None
                source_file = SourceFile(self.tests_root,
                                         path,
                                         self.url_base,
                                         file_hash)

                hash_changed = False  # type: bool

                if not is_new:
                    if file_hash is None:
                        file_hash = source_file.hash
                    remaining_manifest_paths.remove(path_parts)
                    old_type = types[path_parts]
                    old_hash = data[old_type].hashes[path_parts]
                    if old_hash != file_hash:
                        hash_changed = True
                        del data[old_type][path_parts]
                if is_new or hash_changed:
                    to_update.append(source_file)

        if to_update:
            logger.debug("Computing manifest update for %s items", len(to_update))
            changed = True

        # 25 items was derived experimentally (2020-01) to be approximately the
        # point at which it is quicker to create a Pool and parallelize update.
        pool = None
        try:
            if parallel and len(to_update) > 25 and cpu_count() > 1:
                # On Python 3 on Windows, using >= MAXIMUM_WAIT_OBJECTS processes
                # causes a crash in the multiprocessing module. Whilst this enum
                # can technically have any value, it is usually 64. For safety,
                # restrict manifest regeneration to 48 processes on Windows.
                #
                # See https://bugs.python.org/issue26903 and https://bugs.python.org/issue40263
                processes = cpu_count()
                if sys.platform == "win32" and processes > 48:
                    processes = 48
                pool = Pool(processes)

                # chunksize set > 1 when more than 10000 tests, because
                # chunking is a net-gain once we get to very large numbers
                # of items (again, experimentally, 2020-01)
                chunksize = max(1, len(to_update) // 10000)
                logger.debug("Doing a multiprocessed update. CPU count: %s, "
                             "processes: %s, chunksize: %s",
                             cpu_count(), processes, chunksize)
                results = pool.imap_unordered(compute_manifest_items,
                                              to_update,
                                              chunksize=chunksize
                                              )  # type: Iterator[Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]]
            else:
                results = map(compute_manifest_items, to_update)

            for result in results:
                rel_path_parts, new_type, manifest_items, file_hash = result
                data[new_type][rel_path_parts] = manifest_items
                data[new_type].hashes[rel_path_parts] = file_hash
        finally:
            # Make sure to terminate the Pool, to avoid hangs on Python 3.
            # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
            # Done in ``finally`` so worker processes are not leaked when
            # computing or storing a result raises.
            if pool is not None:
                pool.terminate()

        # Anything still listed was in the manifest but absent from ``tree``,
        # so it is defunct and gets dropped.
        if remaining_manifest_paths:
            changed = True
            for rel_path_parts in remaining_manifest_paths:
                for test_data in data.values():
                    if rel_path_parts in test_data:
                        del test_data[rel_path_parts]

        return changed

    def to_json(self, caller_owns_obj=True):
        # type: (bool) -> Dict[Text, Any]
        """Dump a manifest into a object which can be serialized as JSON

        If caller_owns_obj is False, then the return value remains
        owned by the manifest; it is _vitally important_ that _no_
        (even read) operation is done on the manifest, as otherwise
        objects within the object graph rooted at the return value can
        be mutated. This essentially makes this mode very dangerous
        and only to be used under extreme care.

        Types with no entries are omitted from the ``"items"`` mapping.
        """
        out_items = {
            test_type: type_paths.to_json()
            for test_type, type_paths in self._data.items() if type_paths
        }

        if caller_owns_obj:
            out_items = deepcopy(out_items)

        rv = {"url_base": self.url_base,
              "items": out_items,
              "version": CURRENT_VERSION}  # type: Dict[Text, Any]
        return rv

    @classmethod
    def from_json(cls, tests_root, obj, types=None, callee_owns_obj=False):
        # type: (Text, Dict[Text, Any], Optional[Container[Text]], bool) -> Manifest
        """Load a manifest from a JSON object

        This loads a manifest for a given local test_root path from an
        object obj, potentially partially loading it to only load the
        types given by types.

        If callee_owns_obj is True, then ownership of obj transfers
        to this function when called, and the caller must never mutate
        the obj or anything referred to in the object graph rooted at
        obj.

        :raises ManifestVersionMismatch: if ``obj["version"]`` is not
            CURRENT_VERSION
        :raises ManifestError: if ``obj`` has no ``"items"`` key or contains
            a test type that is not in ``item_classes``
        """
        version = obj.get("version")
        if version != CURRENT_VERSION:
            raise ManifestVersionMismatch

        self = cls(tests_root, url_base=obj.get("url_base", "/"))
        # Check for the *key*: every dict has an ``items`` attribute, so an
        # attribute test can never detect a manifest missing its items.
        if "items" not in obj:
            raise ManifestError

        for test_type, type_paths in obj["items"].items():
            if test_type not in item_classes:
                raise ManifestError

            if types and test_type not in types:
                continue

            if not callee_owns_obj:
                type_paths = deepcopy(type_paths)

            self._data[test_type].set_json(type_paths)

        return self
def load(tests_root, manifest, types=None):
    # type: (Text, Union[IO[bytes], Text], Optional[Container[Text]]) -> Optional[Manifest]
    """Load a manifest via ``_load`` without updating it.

    Logs a warning steering callers towards ``load_and_update`` and then
    returns whatever ``_load`` returns for the same arguments.
    """
    log = get_logger()
    log.warning("Prefer load_and_update instead")
    return _load(log, tests_root, manifest, types)
# Manifests already loaded by _load, keyed by manifest path.
__load_cache = {}  # type: Dict[Text, Manifest]


def _load(logger,  # type: Logger
          tests_root,  # type: Text
          manifest,  # type: Union[IO[bytes], Text]
          types=None,  # type: Optional[Container[Text]]
          allow_cached=True  # type: bool
          ):
    # type: (...) -> Optional[Manifest]
    """Load a Manifest from a path or from an open file object.

    The cache key is ``manifest`` itself when it is a string, otherwise its
    ``name`` attribute. With ``allow_cached`` set, a previously loaded
    manifest for that key is returned directly and a fresh load is stored.

    Returns None when ``manifest`` is a path that cannot be opened (OSError)
    or whose content raises ValueError while loading; in the latter case a
    warning is logged.
    """
    from_path = isinstance(manifest, str)
    cache_key = manifest if from_path else manifest.name

    if allow_cached and cache_key in __load_cache:
        return __load_cache[cache_key]

    if not from_path:
        loaded = Manifest.from_json(tests_root,
                                    jsonlib.load(manifest),
                                    types=types,
                                    callee_owns_obj=True)
    else:
        if os.path.exists(manifest):
            message = "Opening manifest at %s"
        else:
            message = "Creating new manifest at %s"
        logger.debug(message % manifest)

        try:
            with open(manifest, encoding="utf-8") as handle:
                loaded = Manifest.from_json(tests_root,
                                            jsonlib.load(handle),
                                            types=types,
                                            callee_owns_obj=True)
        except OSError:
            return None
        except ValueError:
            logger.warning("%r may be corrupted", manifest)
            return None

    if allow_cached:
        __load_cache[cache_key] = loaded

    return loaded
def load_and_update(tests_root,  # type: Text
                    manifest_path,  # type: Text
                    url_base,  # type: Text
                    update=True,  # type: bool
                    rebuild=False,  # type: bool
                    metadata_path=None,  # type: Optional[Text]
                    cache_root=None,  # type: Optional[Text]
                    working_copy=True,  # type: bool
                    types=None,  # type: Optional[Container[Text]]
                    write_manifest=True,  # type: bool
                    allow_cached=True,  # type: bool
                    parallel=True  # type: bool
                    ):
    # type: (...) -> Manifest
    """Load the manifest at ``manifest_path`` and optionally bring it up to date.

    Unless ``rebuild`` is set, the existing manifest is loaded with ``_load``.
    A fresh, empty Manifest is used instead (and an update is forced) when
    loading returns None, raises ManifestVersionMismatch or ManifestError, or
    yields a manifest whose ``url_base`` differs from the one requested.

    When updating, the tree from ``vcs.get_tree`` is fed to
    ``Manifest.update``. If that raises InvalidCacheError the update is
    retried once with ``rebuild`` forced to True. The manifest is written to
    ``manifest_path`` only if ``write_manifest`` is set and the update
    reported a change; the tree's caches are dumped either way.

    ``metadata_path`` is accepted but not used by this function.

    :returns: the loaded (and possibly updated) Manifest
    :raises InvalidCacheError: if the update still fails after the retry
    """
    logger = get_logger()

    manifest = None
    if not rebuild:
        try:
            manifest = _load(logger,
                             tests_root,
                             manifest_path,
                             types=types,
                             allow_cached=allow_cached)
        except ManifestVersionMismatch:
            logger.info("Manifest version changed, rebuilding")
        except ManifestError:
            logger.warning("Failed to load manifest, rebuilding")

        if manifest is not None and manifest.url_base != url_base:
            logger.info("Manifest url base did not match, rebuilding")
            manifest = None

    if manifest is None:
        manifest = Manifest(tests_root, url_base)
        rebuild = True
        update = True

    if rebuild or update:
        logger.info("Updating manifest")
        cache_error = None  # type: Optional[InvalidCacheError]
        for _attempt in range(2):
            try:
                tree = vcs.get_tree(tests_root, manifest, manifest_path, cache_root,
                                    working_copy, rebuild)
                changed = manifest.update(tree, parallel)
                break
            except InvalidCacheError as e:
                logger.warning("Manifest cache was invalid, doing a complete rebuild")
                rebuild = True
                # Keep the error: a bare ``raise`` after the loop would have
                # no active exception and fail with RuntimeError instead.
                cache_error = e
        else:
            # If we didn't break, every attempt hit an invalid cache.
            assert cache_error is not None
            raise cache_error
        if write_manifest and changed:
            write(manifest, manifest_path)
        tree.dump_caches()

    return manifest
def write(manifest, manifest_path):
    # type: (Manifest, Text) -> None
    """Serialize ``manifest`` as JSON to ``manifest_path``.

    The parent directory is created if needed, and the file is written
    through ``atomic_write`` with overwriting enabled, followed by a
    trailing newline.
    """
    dir_name = os.path.dirname(manifest_path)
    # dirname is "" for a bare filename (file in the current directory);
    # os.makedirs("") would fail, so only create a directory when one is
    # named. exist_ok avoids a race between checking and creating.
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    with atomic_write(manifest_path, overwrite=True) as f:
        # Use ',' instead of the default ', ' separator to prevent trailing
        # spaces: https://docs.python.org/2/library/json.html#json.dump
        # NOTE(review): presumably handled inside jsonlib.dump_dist - confirm.
        jsonlib.dump_dist(manifest.to_json(caller_owns_obj=True), f)
        f.write("\n")
|