# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
Downloads Heavy profiles from TaskCluster.
"""
import datetime
import os
import tarfile
from email.utils import parsedate

import requests
from mozlog import get_proxy_logger
from requests.adapters import HTTPAdapter

LOG = get_proxy_logger()
TC_LINK = (
    "https://index.taskcluster.net/v1/task/garbage.heavyprofile/"
    "artifacts/public/today-%s.tgz"
)


class ProgressBar(object):
    """Logs progress through LOG.info each time a new 10% bucket is reached.

    Usable as a context manager; __exit__ never suppresses exceptions.
    """

    def __init__(self, size, template="\r%d%%"):
        self.size = size          # total number of incr() calls expected
        self.current = 0          # number of incr() calls seen so far
        self.tens = 0             # last tens-of-percent bucket that was logged
        self.template = template  # %-template receiving the percent value

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False

    def incr(self):
        """Advance by one step, logging when a new 10% bucket is crossed."""
        if self.current == self.size:
            # Already complete; ignore extra ticks.
            return
        percent = float(self.current) / float(self.size) * 100
        tens, __ = divmod(percent, 10)
        if tens > self.tens:
            LOG.info(self.template % percent)
            self.tens = tens
        self.current += 1


def follow_redirects(url, max=3):
    """Follow HTTP 303 redirects for *url*, up to *max* hops.

    Returns a tuple (final_location, last_modified), where last_modified
    is a naive datetime parsed from the final response's Last-Modified
    header, or None when the header is absent or unparseable.

    Raises ValueError when the redirect chain exceeds *max*.
    """
    location = url
    current = 0
    page = requests.head(url)
    while page.status_code == 303 and current < max:
        current += 1
        location = page.headers["Location"]
        page = requests.head(location)
    if page.status_code == 303 and current == max:
        raise ValueError("Max redirects Reached")
    last_modified = page.headers.get("Last-Modified", None)
    if last_modified is not None:
        # parsedate() returns None for a malformed date; propagate None
        # instead of crashing on the [:6] slice.
        parsed = parsedate(last_modified)
        last_modified = (
            datetime.datetime(*parsed[:6]) if parsed is not None else None
        )
    return location, last_modified


def _recursive_mtime(path):
    """Return the newest mtime found under *path* (including *path* itself)."""
    newest = os.path.getmtime(path)
    for root, dirs, files in os.walk(path):
        for element in dirs + files:
            age = os.path.getmtime(os.path.join(root, element))
            if age > newest:
                newest = age
    return newest


def profile_age(profile_dir, last_modified=None):
    """Return the age of *profile_dir* in whole days.

    Age is measured from the newest file inside the profile to
    *last_modified*, which defaults to the current local time.
    """
    if last_modified is None:
        last_modified = datetime.datetime.now()
    profile_ts = _recursive_mtime(profile_dir)
    profile_ts = datetime.datetime.fromtimestamp(profile_ts)
    return (last_modified - profile_ts).days


def _check_tarball_paths(members, target):
    """Raise ValueError if any tarball member would extract outside *target*.

    Directory-traversal hardening (CVE-2007-4559) for an archive fetched
    from the network. On Python 3.12+ tarfile's extraction *filter*
    argument provides equivalent protection.
    """
    base = os.path.realpath(target)
    for member in members:
        dest = os.path.realpath(os.path.join(target, member.name))
        if dest != base and not dest.startswith(base + os.sep):
            raise ValueError("Unsafe path in tarball: %r" % member.name)


def download_profile(name, profiles_dir=None):
    """Download the heavy profile *name* from TaskCluster and extract it.

    The archive is cached under <profiles_dir>/.cache and the profile is
    extracted to <profiles_dir>/<name>. A local copy younger than a week
    (relative to the server's Last-Modified) is reused without
    re-downloading. Returns the path of the profile directory.

    Raises requests.HTTPError on a failed download and ValueError on an
    unsafe tarball or an over-long redirect chain.
    """
    if profiles_dir is None:
        profiles_dir = os.path.join(os.path.expanduser("~"), ".mozilla", "profiles")
    profiles_dir = os.path.abspath(profiles_dir)
    if not os.path.exists(profiles_dir):
        os.makedirs(profiles_dir)
    target = os.path.join(profiles_dir, name)
    url = TC_LINK % name
    cache_dir = os.path.join(profiles_dir, ".cache")
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    archive_file = os.path.join(cache_dir, "today-%s.tgz" % name)
    url, last_modified = follow_redirects(url)
    if os.path.exists(target):
        age = profile_age(target, last_modified)
        if age < 7:
            # profile is not older than a week, we're good
            LOG.info("Local copy of %r is fresh enough" % name)
            LOG.info("%d days old" % age)
            return target

    LOG.info("Downloading from %r" % url)
    session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=5))
    req = session.get(url, stream=True, timeout=20)
    req.raise_for_status()
    # Guard against a missing Content-Length header: int(None) would raise
    # TypeError. With 0 the bar simply completes on the first chunk.
    total_length = int(req.headers.get("content-length", 0))

    # XXX implement Range to resume download on disconnects
    template = "Download progress %d%%"
    with open(archive_file, "wb") as f:
        # Integer division: the bar counts whole 1 KiB chunks. A float
        # size would make ProgressBar's completion equality never fire.
        size = total_length // 1024 + 1
        with ProgressBar(size=size, template=template) as bar:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
                    bar.incr()

    LOG.info("Extracting profile in %r" % target)
    template = "Extraction progress %d%%"
    with tarfile.open(archive_file, "r:gz") as tar:
        LOG.info("Checking the tarball content...")
        members = list(tar)
        # Refuse members that would escape the target directory.
        _check_tarball_paths(members, target)
        with ProgressBar(size=len(members), template=template) as bar:
            for member in members:
                tar.extract(member, target)
                bar.incr()

    LOG.info("Profile downloaded.")
    return target