Diffstat (limited to 'python/mozbuild/mozbuild/artifact_cache.py')
-rw-r--r-- | python/mozbuild/mozbuild/artifact_cache.py | 251 |
1 file changed, 251 insertions, 0 deletions
diff --git a/python/mozbuild/mozbuild/artifact_cache.py b/python/mozbuild/mozbuild/artifact_cache.py
new file mode 100644
index 0000000000..572953e1f7
--- /dev/null
+++ b/python/mozbuild/mozbuild/artifact_cache.py
@@ -0,0 +1,251 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Fetch and cache artifacts from URLs.
+
+This module manages fetching artifacts from URLs and purging old
+artifacts using a simple Least Recently Used cache.
+
+This module requires certain modules be importable from the ambient Python
+environment. Consumers will need to arrange this themselves.
+
+The bulk of the complexity is in managing and persisting several caches. If
+we found a Python LRU cache that pickled cleanly, we could remove a lot of
+this code! Sadly, I found no such candidate implementations, so we pickle
+pylru caches manually.
+
+None of the instances (or the underlying caches) are safe for concurrent use.
+A future need, perhaps.
+"""
+
+
+import binascii
+import hashlib
+import logging
+import os
+
+import dlmanager
+import mozpack.path as mozpath
+import six
+import six.moves.urllib.parse as urlparse
+
+from mozbuild.util import mkdir
+
+# Using 'DownloadManager' through the provided interface we
+# can't directly specify a 'chunk_size' for the 'Download' it manages.
+# One way to get it to use the 'chunk_size' we want is to monkeypatch
+# the defaults of the init function for the 'Download' class.
+CHUNK_SIZE = 16 * 1024 * 1024  # 16 MB in bytes.
+dl_init = dlmanager.Download.__init__
+dl_init.__defaults__ = (
+    dl_init.__defaults__[:1] + (CHUNK_SIZE,) + dl_init.__defaults__[2:]
+)
+
+
+# Minimum number of downloaded artifacts to keep. Each artifact can be very large,
+# so don't make this too large!
+MIN_CACHED_ARTIFACTS = 12
+
+# Maximum size of the downloaded artifacts to keep in cache, in bytes (2GiB).
+MAX_CACHED_ARTIFACTS_SIZE = 2 * 1024 * 1024 * 1024
+
+
+class ArtifactPersistLimit(dlmanager.PersistLimit):
+    """Handle persistence for a cache of artifacts.
+
+    When instantiating a DownloadManager, it starts by filling the
+    PersistLimit instance it's given with register_dir_content.
+    In practice, this registers all the files already in the cache directory.
+    After a download finishes, the newly downloaded file is registered, and the
+    oldest files registered to the PersistLimit instance are removed depending
+    on the size and file limits it's configured for.
+
+    This is all good, but there are a few tweaks we want here:
+
+    - We have pickle files in the cache directory that we don't want purged.
+    - Files that were just downloaded in the same session shouldn't be
+      purged. (if for some reason we end up downloading more than the default
+      max size, we don't want the files to be purged)
+
+    To achieve this, this subclass of PersistLimit inhibits the register_file
+    method for pickle files and tracks what files were downloaded in the same
+    session to avoid removing them.
+
+    The register_file method may be used to register cache matches too, so that
+    later sessions know they were freshly used.
+    """
+
+    def __init__(self, log=None):
+        super(ArtifactPersistLimit, self).__init__(
+            size_limit=MAX_CACHED_ARTIFACTS_SIZE, file_limit=MIN_CACHED_ARTIFACTS
+        )
+        self._log = log
+        self._registering_dir = False
+        self._downloaded_now = set()
+
+    def log(self, *args, **kwargs):
+        if self._log:
+            self._log(*args, **kwargs)
+
+    def register_file(self, path):
+        if (
+            path.endswith(".pickle")
+            or path.endswith(".checksum")
+            or os.path.basename(path) == ".metadata_never_index"
+        ):
+            return
+        if not self._registering_dir:
+            # Touch the file so that subsequent calls to a mach artifact
+            # command know it was recently used. While remove_old_files
+            # is based on access time, in various cases, the access time is not
+            # updated when just reading the file, so we force an update.
+            try:
+                os.utime(path, None)
+            except OSError:
+                pass
+            self._downloaded_now.add(path)
+        super(ArtifactPersistLimit, self).register_file(path)
+
+    def register_dir_content(self, directory, pattern="*"):
+        self._registering_dir = True
+        super(ArtifactPersistLimit, self).register_dir_content(directory, pattern)
+        self._registering_dir = False
+
+    def remove_old_files(self):
+        from dlmanager import fs
+
+        files = sorted(self.files, key=lambda f: f.stat.st_atime)
+        kept = []
+        while len(files) > self.file_limit and self._files_size >= self.size_limit:
+            f = files.pop(0)
+            if f.path in self._downloaded_now:
+                kept.append(f)
+                continue
+            try:
+                fs.remove(f.path)
+            except WindowsError:
+                # For some reason, on automation, we can't remove those files.
+                # So for now, ignore the error.
+                kept.append(f)
+                continue
+            self.log(
+                logging.INFO,
+                "artifact",
+                {"filename": f.path},
+                "Purged artifact {filename}",
+            )
+            self._files_size -= f.stat.st_size
+        self.files = files + kept
+
+    def remove_all(self):
+        from dlmanager import fs
+
+        for f in self.files:
+            fs.remove(f.path)
+        self._files_size = 0
+        self.files = []
+
+
+class ArtifactCache(object):
+    """Fetch artifacts from URLs and purge least recently used artifacts from disk."""
+
+    def __init__(self, cache_dir, log=None, skip_cache=False):
+        mkdir(cache_dir, not_indexed=True)
+        self._cache_dir = cache_dir
+        self._log = log
+        self._skip_cache = skip_cache
+        self._persist_limit = ArtifactPersistLimit(log)
+        self._download_manager = dlmanager.DownloadManager(
+            self._cache_dir, persist_limit=self._persist_limit
+        )
+        self._last_dl_update = -1
+
+    def log(self, *args, **kwargs):
+        if self._log:
+            self._log(*args, **kwargs)
+
+    def fetch(self, url, force=False):
+        fname = os.path.basename(url)
+        try:
+            # Use the file name from the url if it looks like a hash digest.
+            if len(fname) not in (32, 40, 56, 64, 96, 128):
+                raise TypeError()
+            binascii.unhexlify(fname)
+        except (TypeError, binascii.Error):
+            # We download to a temporary name like HASH[:16]-basename to
+            # differentiate among URLs with the same basenames. We used to then
+            # extract the build ID from the downloaded artifact and use it to make a
+            # human readable unique name, but extracting build IDs is time consuming
+            # (especially on Mac OS X, where we must mount a large DMG file).
+            hash = hashlib.sha256(six.ensure_binary(url)).hexdigest()[:16]
+            # Strip query string and fragments.
+            basename = os.path.basename(urlparse.urlparse(url).path)
+            fname = hash + "-" + basename
+
+        path = os.path.abspath(mozpath.join(self._cache_dir, fname))
+        if self._skip_cache and os.path.exists(path):
+            self.log(
+                logging.INFO,
+                "artifact",
+                {"path": path},
+                "Skipping cache: removing cached downloaded artifact {path}",
+            )
+            os.remove(path)
+
+        try:
+            dl = self._download_manager.download(url, fname)
+
+            def download_progress(dl, bytes_so_far, total_size):
+                if not total_size:
+                    return
+                percent = (float(bytes_so_far) / total_size) * 100
+                now = int(percent / 5)
+                if now == self._last_dl_update:
+                    return
+                self._last_dl_update = now
+                self.log(
+                    logging.INFO,
+                    "artifact",
+                    {
+                        "bytes_so_far": bytes_so_far,
+                        "total_size": total_size,
+                        "percent": percent,
+                    },
+                    "Downloading... {percent:02.1f} %",
+                )
+
+            if dl:
+                self.log(
+                    logging.INFO,
+                    "artifact",
+                    {"path": path},
+                    "Downloading artifact to local cache: {path}",
+                )
+                dl.set_progress(download_progress)
+                dl.wait()
+            else:
+                self.log(
+                    logging.INFO,
+                    "artifact",
+                    {"path": path},
+                    "Using artifact from local cache: {path}",
+                )
+                # Avoid the file being removed if it was in the cache already.
+                path = os.path.join(self._cache_dir, fname)
+                self._persist_limit.register_file(path)
+
+            return os.path.abspath(mozpath.join(self._cache_dir, fname))
+        finally:
+            # Cancel any background downloads in progress.
+            self._download_manager.cancel()
+
+    def clear_cache(self):
+        if self._skip_cache:
+            self.log(
+                logging.INFO, "artifact", {}, "Skipping cache: ignoring clear_cache!"
+            )
+            return
+
+        self._persist_limit.remove_all()
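
For context, here is a minimal usage sketch of the class this patch adds. It is not taken from the patch: the cache directory, the example URL, and the log callback below are hypothetical placeholders; only ArtifactCache, fetch, and clear_cache come from the code above, and the (level, action, params, format_string) callback shape mirrors how the module calls self.log. It assumes dlmanager and mozbuild are importable from the ambient Python environment, as the module docstring requires.

    # usage_sketch.py -- hedged illustration, not part of the patch
    import logging

    from mozbuild.artifact_cache import ArtifactCache


    def log(level, action, params, format_str):
        # Hypothetical structured-log callback; real callers pass a
        # mach/mozbuild logger with this same signature.
        logging.log(level, format_str.format(**params))


    # Placeholder cache directory and artifact URL for illustration only.
    cache = ArtifactCache("/tmp/artifact-cache", log=log, skip_cache=False)
    local_path = cache.fetch("https://example.com/path/to/target.artifact.zip")
    print("artifact available at", local_path)

    # Remove every cached artifact (no-op when skip_cache=True).
    cache.clear_cache()

Repeated fetch calls for the same URL reuse the file already in the cache directory; ArtifactPersistLimit then re-registers it so later sessions see it as recently used rather than purging it.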