Diffstat (limited to 'python/mozbuild/mozbuild/artifact_cache.py')
-rw-r--r--  python/mozbuild/mozbuild/artifact_cache.py  251
1 file changed, 251 insertions, 0 deletions
diff --git a/python/mozbuild/mozbuild/artifact_cache.py b/python/mozbuild/mozbuild/artifact_cache.py
new file mode 100644
index 0000000000..572953e1f7
--- /dev/null
+++ b/python/mozbuild/mozbuild/artifact_cache.py
@@ -0,0 +1,251 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+"""
+Fetch and cache artifacts from URLs.
+
+This module manages fetching artifacts from URLs and purging old
+artifacts using a simple Least Recently Used cache.
+
+This module requires that certain modules be importable from the ambient
+Python environment; consumers will need to arrange this themselves.
+
+The bulk of the complexity is in managing and persisting several caches. If
+we found a Python LRU cache that pickled cleanly, we could remove a lot of
+this code! Sadly, I found no such candidate implementations, so we pickle
+pylru caches manually.
+
+None of the instances (or the underlying caches) are safe for concurrent use.
+A future need, perhaps.
+"""
+
+
+import binascii
+import hashlib
+import logging
+import os
+
+import dlmanager
+import mozpack.path as mozpath
+import six
+import six.moves.urllib.parse as urlparse
+
+from mozbuild.util import mkdir
+
+# When using 'DownloadManager' through its provided interface, we can't
+# directly specify a 'chunk_size' for the 'Download' objects it manages.
+# One way to make it use the 'chunk_size' we want is to monkeypatch the
+# defaults of the init function for the 'Download' class.
+CHUNK_SIZE = 16 * 1024 * 1024  # 16 MiB, in bytes.
+dl_init = dlmanager.Download.__init__
+dl_init.__defaults__ = (
+ dl_init.__defaults__[:1] + (CHUNK_SIZE,) + dl_init.__defaults__[2:]
+)
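+# With the patch applied, any 'Download' constructed without an explicit
+# 'chunk_size' (as 'DownloadManager' does internally) now transfers data in
+# CHUNK_SIZE blocks. This relies on 'chunk_size' being the second defaulted
+# parameter of 'Download.__init__', which is what the slicing above assumes.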
+
+
+# Minimum number of downloaded artifacts to keep. Each artifact can be very
+# large, so don't make this too large!
+MIN_CACHED_ARTIFACTS = 12
+
+# Maximum size of the downloaded artifacts to keep in cache, in bytes (2 GiB).
+MAX_CACHED_ARTIFACTS_SIZE = 2 * 1024 * 1024 * 1024
+
+
+class ArtifactPersistLimit(dlmanager.PersistLimit):
+ """Handle persistence for a cache of artifacts.
+
+    When a DownloadManager is instantiated, it starts by filling the
+    PersistLimit instance it's given via register_dir_content. In practice,
+    this registers all the files already in the cache directory. After a
+    download finishes, the newly downloaded file is registered, and the oldest
+    registered files are removed, depending on the size and file limits the
+    PersistLimit instance is configured for.
+
+ This is all good, but there are a few tweaks we want here:
+
+ - We have pickle files in the cache directory that we don't want purged.
+    - Files that were just downloaded in the same session shouldn't be
+      purged. (If, for some reason, we end up downloading more than the
+      default maximum size, we don't want those files to be purged.)
+
+ To achieve this, this subclass of PersistLimit inhibits the register_file
+ method for pickle files and tracks what files were downloaded in the same
+ session to avoid removing them.
+
+ The register_file method may be used to register cache matches too, so that
+ later sessions know they were freshly used.
+ """
+
+ def __init__(self, log=None):
+ super(ArtifactPersistLimit, self).__init__(
+ size_limit=MAX_CACHED_ARTIFACTS_SIZE, file_limit=MIN_CACHED_ARTIFACTS
+ )
+ self._log = log
+ self._registering_dir = False
+ self._downloaded_now = set()
+
+ def log(self, *args, **kwargs):
+ if self._log:
+ self._log(*args, **kwargs)
+
+ def register_file(self, path):
+ if (
+ path.endswith(".pickle")
+ or path.endswith(".checksum")
+ or os.path.basename(path) == ".metadata_never_index"
+ ):
+ return
+ if not self._registering_dir:
+            # Touch the file so that subsequent calls to a mach artifact
+            # command know it was recently used. remove_old_files is based
+            # on access time, but in various cases the access time is not
+            # updated when a file is merely read, so we force an update.
+ try:
+ os.utime(path, None)
+ except OSError:
+ pass
+ self._downloaded_now.add(path)
+ super(ArtifactPersistLimit, self).register_file(path)
+
+ def register_dir_content(self, directory, pattern="*"):
+ self._registering_dir = True
+ super(ArtifactPersistLimit, self).register_dir_content(directory, pattern)
+ self._registering_dir = False
+
+ def remove_old_files(self):
+ from dlmanager import fs
+
+ files = sorted(self.files, key=lambda f: f.stat.st_atime)
+ kept = []
+ while len(files) > self.file_limit and self._files_size >= self.size_limit:
+ f = files.pop(0)
+ if f.path in self._downloaded_now:
+ kept.append(f)
+ continue
+ try:
+ fs.remove(f.path)
+            except OSError:
+                # For some reason, on automation (Windows), we can't remove
+                # those files, so for now, ignore the error. We catch OSError
+                # because WindowsError is only defined on Windows builds.
+ kept.append(f)
+ continue
+ self.log(
+ logging.INFO,
+ "artifact",
+ {"filename": f.path},
+ "Purged artifact {filename}",
+ )
+ self._files_size -= f.stat.st_size
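+        # Re-register the kept files so later purge decisions still see them.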
+ self.files = files + kept
+
+ def remove_all(self):
+ from dlmanager import fs
+
+ for f in self.files:
+ fs.remove(f.path)
+ self._files_size = 0
+ self.files = []
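+
+# Behaviour sketch for ArtifactPersistLimit (hypothetical paths, for
+# illustration only):
+#
+#   limit = ArtifactPersistLimit()
+#   limit.register_file("/cache/index.pickle")       # ignored, never purged
+#   limit.register_file("/cache/0123abcd-artifact")  # touched and protected
+#   limit.remove_old_files()  # removes only old artifacts not used this run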
+
+
+class ArtifactCache(object):
+ """Fetch artifacts from URLS and purge least recently used artifacts from disk."""
+
+ def __init__(self, cache_dir, log=None, skip_cache=False):
+ mkdir(cache_dir, not_indexed=True)
+ self._cache_dir = cache_dir
+ self._log = log
+ self._skip_cache = skip_cache
+ self._persist_limit = ArtifactPersistLimit(log)
+ self._download_manager = dlmanager.DownloadManager(
+ self._cache_dir, persist_limit=self._persist_limit
+ )
+ self._last_dl_update = -1
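+        # The last 5% progress bucket that was logged; see download_progress
+        # in fetch below.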
+
+ def log(self, *args, **kwargs):
+ if self._log:
+ self._log(*args, **kwargs)
+
+ def fetch(self, url, force=False):
+ fname = os.path.basename(url)
+ try:
+ # Use the file name from the url if it looks like a hash digest.
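+            # (These lengths correspond to the hex digests of MD5, SHA-1,
+            # SHA-224, SHA-256, SHA-384 and SHA-512.)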
+ if len(fname) not in (32, 40, 56, 64, 96, 128):
+ raise TypeError()
+ binascii.unhexlify(fname)
+ except (TypeError, binascii.Error):
+ # We download to a temporary name like HASH[:16]-basename to
+ # differentiate among URLs with the same basenames. We used to then
+ # extract the build ID from the downloaded artifact and use it to make a
+ # human readable unique name, but extracting build IDs is time consuming
+ # (especially on Mac OS X, where we must mount a large DMG file).
+            url_hash = hashlib.sha256(six.ensure_binary(url)).hexdigest()[:16]
+            # Strip the query string and fragment.
+            basename = os.path.basename(urlparse.urlparse(url).path)
+            fname = url_hash + "-" + basename
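+            # e.g. a URL like "https://example.com/target.tar.gz?rev=1" maps
+            # to "<16 hex chars>-target.tar.gz" (hypothetical example).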
+
+ path = os.path.abspath(mozpath.join(self._cache_dir, fname))
+ if self._skip_cache and os.path.exists(path):
+ self.log(
+ logging.INFO,
+ "artifact",
+ {"path": path},
+ "Skipping cache: removing cached downloaded artifact {path}",
+ )
+ os.remove(path)
+
+ try:
+ dl = self._download_manager.download(url, fname)
+
+ def download_progress(dl, bytes_so_far, total_size):
+ if not total_size:
+ return
+ percent = (float(bytes_so_far) / total_size) * 100
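+                # Log at most once per 5% step so we don't flood the output.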
+ now = int(percent / 5)
+ if now == self._last_dl_update:
+ return
+ self._last_dl_update = now
+ self.log(
+ logging.INFO,
+ "artifact",
+ {
+ "bytes_so_far": bytes_so_far,
+ "total_size": total_size,
+ "percent": percent,
+ },
+ "Downloading... {percent:02.1f} %",
+ )
+
+ if dl:
+ self.log(
+ logging.INFO,
+ "artifact",
+ {"path": path},
+ "Downloading artifact to local cache: {path}",
+ )
+ dl.set_progress(download_progress)
+ dl.wait()
+ else:
+ self.log(
+ logging.INFO,
+ "artifact",
+ {"path": path},
+ "Using artifact from local cache: {path}",
+ )
+            # Register the file as freshly used so it isn't purged even
+            # though it was already in the cache.
+ path = os.path.join(self._cache_dir, fname)
+ self._persist_limit.register_file(path)
+
+ return os.path.abspath(mozpath.join(self._cache_dir, fname))
+ finally:
+ # Cancel any background downloads in progress.
+ self._download_manager.cancel()
+
+ def clear_cache(self):
+ if self._skip_cache:
+ self.log(
+ logging.INFO, "artifact", {}, "Skipping cache: ignoring clear_cache!"
+ )
+ return
+
+ self._persist_limit.remove_all()