path: root/scripts/debootsnap
Diffstat (limited to 'scripts/debootsnap')
-rwxr-xr-x  scripts/debootsnap  694
1 file changed, 694 insertions, 0 deletions
diff --git a/scripts/debootsnap b/scripts/debootsnap
new file mode 100755
index 0000000..81297f5
--- /dev/null
+++ b/scripts/debootsnap
@@ -0,0 +1,694 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021 Johannes Schauer Marin Rodrigues <josch@debian.org>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# This tool is similar to debootstrap but is able to recreate a chroot
+# containing precisely the given package and version selection. The package
+# list is passed via --packages or --buildinfo (use "-" to read it from
+# standard input) and may be of the format produced by:
+#
+# dpkg-query --showformat '${binary:Package}=${Version}\n' --show
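+#
+# which prints one "package=version" pair per line; multi-arch packages are
+# architecture qualified, e.g. "libc6:amd64=2.31-13" (version only an example)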
+
+# The name was suggested by Adrian Bunk as a portmanteau of debootstrap and
+# snapshot.debian.org.
+
+# TODO: Address invalid names
+# pylint: disable=invalid-name
+
+import argparse
+import dataclasses
+import http.server
+import os
+import pathlib
+import re
+import shutil
+import socketserver
+import subprocess
+import sys
+import tempfile
+import threading
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+from functools import partial
+from http import HTTPStatus
+from operator import itemgetter
+
+import pycurl
+import requests
+from debian.deb822 import BuildInfo
+
+
+class MyHTTPException(Exception):
+ pass
+
+
+class MyHTTP404Exception(Exception):
+ pass
+
+
+class MyHTTPTimeoutException(Exception):
+ pass
+
+
+class RetryCountExceeded(Exception):
+ pass
+
+
+# pylint: disable=c-extension-no-member
+class Proxy(http.server.SimpleHTTPRequestHandler):
+ last_request = None
+ maxretries = 10
+
+ def do_GET(self): # pylint: disable=too-many-branches,too-many-statements
+ # check validity and extract the timestamp
+ url = "http://snapshot.debian.org/" + self.path
+ start = None
+ state = ""
+ written = 0
+ for retrynum in range(self.maxretries):
+ try:
+ c = pycurl.Curl()
+ c.setopt(c.URL, url)
+                # limit the download rate to be gentle on snapshot.debian.org
+                c.setopt(c.MAX_RECV_SPEED_LARGE, 1000 * 1024)  # bytes per second
+ c.setopt(c.CONNECTTIMEOUT, 30) # the default is 300
+ # sometimes, curl stalls forever and even ctrl+c doesn't work
+ start = time.time()
+
+ def progress(*_):
+                    # a download must not last more than 10 minutes; at the
+                    # 1000 KiB/s limit set above this caps files at ~600 MB
+ if time.time() - start > 10 * 60:
+ print("transfer took too long")
+ # the code will not see this exception but instead get a
+ # pycurl.error
+ raise MyHTTPTimeoutException(url)
+
+ c.setopt(pycurl.NOPROGRESS, 0)
+ c.setopt(pycurl.XFERINFOFUNCTION, progress)
+ # $ host snapshot.debian.org
+ # snapshot.debian.org has address 185.17.185.185
+ # snapshot.debian.org has address 193.62.202.27
+ # c.setopt(c.RESOLVE, ["snapshot.debian.org:80:185.17.185.185"])
+ if written > 0:
+ c.setopt(pycurl.RESUME_FROM, written)
+
+ def writer_cb(data):
+ assert state == "headers sent", state
+ nonlocal written
+ written += len(data)
+ return self.wfile.write(data)
+
+ c.setopt(c.WRITEFUNCTION, writer_cb)
+
+ # using a header callback allows us to send headers of our own
+ # with the correct content-length value out without having to
+ # wait for perform() to finish
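+                # the callback steps through the states "" -> "http200 sent"
+                # -> "length sent" -> "headers sent" and forwards only the
+                # Content-Length header to the client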
+ def header_cb(line):
+ nonlocal state
+ # if this is a retry, then the headers have already been
+ # sent and there is nothing to do
+ if state == "headers sent":
+ return
+ # HTTP standard specifies that headers are encoded in iso-8859-1
+ line = line.decode("iso-8859-1").rstrip()
+ # the first try must be a http 200
+ if line == "HTTP/1.1 200 OK":
+ assert state == ""
+ self.send_response(HTTPStatus.OK)
+ state = "http200 sent"
+ return
+ # the header is done
+ if line == "":
+ assert state == "length sent"
+ self.end_headers()
+ state = "headers sent"
+ return
+ field, value = line.split(":", 1)
+ field = field.strip().lower()
+ value = value.strip()
+ # we are only interested in content-length
+ if field != "content-length":
+ return
+ assert state == "http200 sent"
+ self.send_header("Content-Length", value)
+ state = "length sent"
+
+ c.setopt(c.HEADERFUNCTION, header_cb)
+ c.perform()
+ if c.getinfo(c.RESPONSE_CODE) == 404:
+ raise MyHTTP404Exception(f"got HTTP 404 for {url}")
+ if c.getinfo(c.RESPONSE_CODE) not in [200, 206]:
+ raise MyHTTPException(
+ f"got HTTP {c.getinfo(c.RESPONSE_CODE)} for {url}"
+ )
+ c.close()
+                # if the request finished too quickly, sleep the remaining
+                # time to rate-limit requests to snapshot.debian.org;
+                # measured throughput for several delays:
+                # seconds/request    requests/hour
+                # 3                  1020
+                # 2.5                1384
+                # 2.4                1408
+                # 2                  1466
+                # 1.5                2267
+                seconds_per_request = 1.5
+ if self.last_request is not None:
+ sleep_time = seconds_per_request - (time.time() - self.last_request)
+ if sleep_time > 0:
+ time.sleep(sleep_time)
+                # store the timestamp on the class: a new handler instance
+                # serves each request, so an instance attribute would be lost
+                Proxy.last_request = time.time()
+ break
+ except pycurl.error as e:
+ code, _ = e.args
+ if code in [
+ pycurl.E_PARTIAL_FILE,
+ pycurl.E_COULDNT_CONNECT,
+ pycurl.E_ABORTED_BY_CALLBACK,
+ ]:
+ if retrynum == self.maxretries - 1:
+ break
+ if code == pycurl.E_ABORTED_BY_CALLBACK:
+ # callback was aborted due to timeout
+ pass
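+                    # exponential backoff: 4 s, 16 s, 64 s, ...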
+ sleep_time = 4 ** (retrynum + 1)
+ print(f"retrying after {sleep_time} s...")
+ time.sleep(sleep_time)
+ continue
+ raise
+ except MyHTTPException as e:
+ print("got HTTP error:", repr(e))
+ if retrynum == self.maxretries - 1:
+ break
+ sleep_time = 4 ** (retrynum + 1)
+ print(f"retrying after {sleep_time} s...")
+ time.sleep(sleep_time)
+                # restart from the beginning because otherwise the result
+                # might include a varnish cache error message
+ else:
+ raise RetryCountExceeded("failed too often...")
+
+
+@dataclasses.dataclass
+class Source:
+ archive: str
+ timestamp: str
+ suite: str
+ components: list[str]
+
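+    # produces a line such as (timestamp only as an example):
+    # deb [check-valid-until=no] http://snapshot.debian.org/archive/debian/20210101T000000Z/ unstable main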
+ def deb_line(self, host: str = "snapshot.debian.org") -> str:
+ return (
+ f"deb [check-valid-until=no] http://{host}/archive/{self.archive}"
+ f"/{self.timestamp}/ {self.suite} {' '.join(self.components)}\n"
+ )
+
+
+def parse_buildinfo(val):
+ with open(val, encoding="utf8") as f:
+ buildinfo = BuildInfo(f)
+ pkgs = []
+ for dep in buildinfo.relations["installed-build-depends"]:
+ assert len(dep) == 1
+ dep = dep[0]
+ assert dep["arch"] is None
+ assert dep["restrictions"] is None
+ assert len(dep["version"]) == 2
+ rel, version = dep["version"]
+ assert rel == "="
+ pkgs.append((dep["name"], dep["archqual"], version))
+ return pkgs, buildinfo.get("Build-Architecture")
+
+
+def parse_pkgs(val):
+ if val == "-":
+ val = sys.stdin.read()
+ if val.startswith("./") or val.startswith("/"):
+ val = pathlib.Path(val)
+ if not val.exists():
+ print(f"{val} does not exist", file=sys.stderr)
+ sys.exit(1)
+ val = val.read_text(encoding="utf8")
+ pkgs = []
+ pattern = re.compile(
+ r"""
+        ^[^a-z0-9]*                    # garbage at the beginning
+        ([a-z0-9][a-z0-9+.-]+)         # package name
+        (?:[^a-z0-9+.-]+([a-z0-9-]+))? # optional architecture qualifier
+        [^A-Za-z0-9.+~:-]+             # separator before the version
+        ([A-Za-z0-9.+~:-]+)            # version
+        [^A-Za-z0-9.+~:-]*$            # garbage at the end
+ """,
+ re.VERBOSE,
+ )
+ for line in re.split(r"[,\r\n]+", val):
+ if not line:
+ continue
+ match = pattern.fullmatch(line)
+ if match is None:
+ print(f"cannot parse: {line}", file=sys.stderr)
+ sys.exit(1)
+ pkgs.append(match.groups())
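+    # wrap the result in a list so that argparse's action="extend" adds one
+    # package list per --packages occurrence; main() flattens them again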
+ return [pkgs]
+
+
+def parse_args(args: list[str]) -> argparse.Namespace:
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description="""\
+
+Combines debootstrap and snapshot.debian.org to create a chroot with exact
+package versions from the past either to reproduce bugs or to test source
+package reproducibility.
+
+To obtain a list of packages run the following command on one machine:
+
+ $ dpkg-query --showformat '${binary:Package}=${Version}\\n' --show
+
+And pass the output to debootsnap with the --packages argument. The result
+will be a chroot tarball with precisely the package versions as they were
+found on the system that ran dpkg-query.
+""",
+ epilog="""\
+
+*EXAMPLES*
+
+On one system run:
+
+ $ dpkg-query --showformat '${binary:Package}=${Version}\\n' --show > pkglist
+
+Then copy over "pkglist" and on another system run:
+
+ $ debootsnap --pkgs=./pkglist chroot.tar
+
+Or use a buildinfo file as input:
+
+ $ debootsnap --buildinfo=./package.buildinfo chroot.tar
+
+""",
+ )
+ parser.add_argument(
+ "--architecture",
+ "--nativearch",
+ help="native architecture of the chroot. Ignored if --buildinfo is"
+ " used. Foreign architectures are inferred from the package list."
+ " Not required if packages are architecture qualified.",
+ )
+ parser.add_argument(
+ "--ignore-notfound",
+ action="store_true",
+ help="only warn about packages that cannot be found on "
+ "snapshot.debian.org instead of exiting",
+ )
+ group = parser.add_mutually_exclusive_group(required=True)
+ group.add_argument(
+ "--buildinfo",
+ type=parse_buildinfo,
+ help="use packages from a buildinfo file. Read buildinfo file from "
+ 'standard input if value is "-".',
+ )
+ group.add_argument(
+ "--packages",
+ "--pkgs",
+ action="extend",
+ type=parse_pkgs,
+        help="list of packages with version and optional architecture, "
+        "separated by comma or linebreak. Read list from standard input if "
+        "value is "
+ '"-". Read list from a file if value starts with "./" or "/". The '
+ "option can be specified multiple times. Package name, "
+ "version and architecture are separated by one or more characters "
+ "that are not legal in the respective adjacent field. Leading and "
+ "trailing illegal characters are allowed. Example: "
+ "pkg1:arch=ver1,pkg2:arch=ver2",
+ )
+ parser.add_argument(
+ "--sources-list-only",
+ action="store_true",
+ help="only query metasnap.debian.net and print the sources.list "
+        "needed to create the chroot, then exit",
+ )
+ parser.add_argument(
+ "output", nargs="?", default="-", help="path to output chroot tarball"
+ )
+ return parser.parse_args(args)
+
+
+def query_metasnap(pkgsleft, archive, nativearch):
+ handled_pkgs = set(pkgsleft)
+ r = requests.post(
+ "http://metasnap.debian.net/cgi-bin/api",
+ files={
+ "archive": archive,
+ "arch": nativearch,
+ "pkgs": ",".join([n + ":" + a + "=" + v for n, a, v in handled_pkgs]),
+ },
+ timeout=60,
+ )
+ if r.status_code == 404:
+ for line in r.text.splitlines():
+ n, a, v = line.split()
+ handled_pkgs.remove((n, a, v))
+ r = requests.post(
+ "http://metasnap.debian.net/cgi-bin/api",
+ files={
+ "archive": archive,
+ "arch": nativearch,
+ "pkgs": ",".join([n + ":" + a + "=" + v for n, a, v in handled_pkgs]),
+ },
+ timeout=60,
+ )
+ assert r.status_code == 200, r.text
+
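+    # each line of the metasnap response has the form:
+    # name architecture version suite component first-timestamp last-timestamp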
+ suite2pkgs = defaultdict(set)
+ pkg2range = {}
+ for line in r.text.splitlines():
+ n, a, v, s, c, b, e = line.split()
+ assert (n, a, v) in handled_pkgs
+ suite2pkgs[s].add((n, a, v))
+        # a package may appear with multiple ranges; this keeps only the
+        # last one seen, which is fine because any single range suffices
+ pkg2range[((n, a, v), s)] = (c, b, e)
+
+ return handled_pkgs, suite2pkgs, pkg2range
+
+
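+# greedy interval stabbing: with the (component, begin, end) ranges sorted by
+# their end timestamp, pick the earliest end as a snapshot timestamp, collect
+# the components of all ranges containing it, and repeat for the ranges not
+# yet covered; returns a list of (timestamp, components) pairs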
+def comp_ts(ranges):
+ last = "19700101T000000Z" # impossibly early date
+ res = []
+ for c, b, e in ranges:
+        if last >= b:
+            # the previously chosen timestamp also covers this range, so
+            # only its component needs to be recorded
+            res[-1][1].add(c)
+            continue
+ # add new timestamp with initial component
+ last = e
+ res.append((last, set([c])))
+ return res
+
+
+def compute_sources(pkgs, nativearch, ignore_notfound) -> list[Source]:
+ sources = []
+ pkgsleft = set(pkgs)
+ for archive in [
+ "debian",
+ "debian-debug",
+ "debian-security",
+ "debian-ports",
+ "debian-volatile",
+ "debian-backports",
+ ]:
+ if len(pkgsleft) == 0:
+ break
+
+ handled_pkgs, suite2pkgs, pkg2range = query_metasnap(
+ pkgsleft, archive, nativearch
+ )
+
+ # greedy algorithm:
+ # pick the suite covering most packages first
+ while len(handled_pkgs) > 0:
+ bestsuite = sorted(suite2pkgs.items(), key=lambda v: len(v[1]))[-1][0]
+ ranges = [pkg2range[nav, bestsuite] for nav in suite2pkgs[bestsuite]]
+ # sort by end-time
+ ranges.sort(key=itemgetter(2))
+
+ for ts, comps in comp_ts(ranges):
+ sources.append(Source(archive, ts, bestsuite, comps))
+
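+            # mark the packages of the chosen suite as handled and drop
+            # them from the candidate sets of all other suites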
+ for nav in suite2pkgs[bestsuite]:
+ handled_pkgs.remove(nav)
+ pkgsleft.remove(nav)
+ for suite in suite2pkgs:
+ if suite == bestsuite:
+ continue
+ if nav in suite2pkgs[suite]:
+ suite2pkgs[suite].remove(nav)
+ del suite2pkgs[bestsuite]
+ if pkgsleft:
+ print("cannot find:", file=sys.stderr)
+ print(
+ "\n".join([f"{pkg[0]}:{pkg[1]}={pkg[2]}" for pkg in pkgsleft]),
+ file=sys.stderr,
+ )
+ if not ignore_notfound:
+ sys.exit(1)
+
+ return sources
+
+
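+# build a local repository containing a single dummy package that depends on
+# the exact requested package versions; installing the dummy later pulls in
+# the whole selection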
+def create_repo(tmpdirname, pkgs):
+ with open(tmpdirname + "/control", "w", encoding="utf8") as f:
+
+ def pkg2name(n, a, v):
+ if a is None:
+ return f"{n} (= {v})"
+ return f"{n}:{a} (= {v})"
+
+ f.write("Package: debootsnap-dummy\n")
+ f.write(f"Depends: {', '.join([pkg2name(*pkg) for pkg in pkgs])}\n")
+ subprocess.check_call(
+ ["equivs-build", tmpdirname + "/control"], cwd=tmpdirname + "/cache"
+ )
+
+ packages_content = subprocess.check_output(
+ ["apt-ftparchive", "packages", "."], cwd=tmpdirname + "/cache"
+ )
+ with open(tmpdirname + "/cache/Packages", "wb") as f:
+ f.write(packages_content)
+ release_content = subprocess.check_output(
+ [
+ "apt-ftparchive",
+ "release",
+ "-oAPT::FTPArchive::Release::Suite=dummysuite",
+ ".",
+ ],
+ cwd=tmpdirname + "/cache",
+ )
+ with open(tmpdirname + "/cache/Release", "wb") as f:
+ f.write(release_content)
+
+
+@contextmanager
+def serve_repo(tmpdirname):
+ httpd = http.server.HTTPServer(
+ ("localhost", 0),
+ partial(http.server.SimpleHTTPRequestHandler, directory=tmpdirname + "/cache"),
+ )
+ # run server in a new thread
+ server_thread = threading.Thread(target=httpd.serve_forever)
+ server_thread.daemon = True
+ # start thread
+ server_thread.start()
+ # retrieve port (in case it was generated automatically)
+ _, port = httpd.server_address
+ try:
+ yield port
+ finally:
+ httpd.shutdown()
+ httpd.server_close()
+ server_thread.join()
+
+
+def run_mmdebstrap(
+ tmpdirname, sources: list[Source], nativearch, foreignarches, output
+):
+ with open(tmpdirname + "/sources.list", "w", encoding="utf8") as f:
+ for source in sources:
+ f.write(source.deb_line())
+ # we serve the directory via http instead of using a copy:// mirror
+ # because the temporary directory is not accessible to the unshared
+ # user
+ with serve_repo(tmpdirname) as port:
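+        # install the dummy package to pull in the exact package selection,
+        # then remove it and record the resulting package list so that the
+        # caller can verify the chroot contents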
+ cmd = [
+ "mmdebstrap",
+ f"--architectures={','.join([nativearch] + list(foreignarches))}",
+ "--variant=essential",
+ "--include=debootsnap-dummy",
+ '--aptopt=Apt::Key::gpgvcommand "/usr/libexec/mmdebstrap/gpgvnoexpkeysig"',
+ '--customize-hook=chroot "$1" dpkg -r debootsnap-dummy',
+ '--customize-hook=chroot "$1" dpkg-query --showformat '
+ "'${binary:Package}=${Version}\\n' --show > \"$1/pkglist\"",
+ "--customize-hook=download /pkglist ./pkglist",
+ '--customize-hook=rm "$1/pkglist"',
+ "--customize-hook=upload sources.list /etc/apt/sources.list",
+ "dummysuite",
+ output,
+ f"deb [trusted=yes] http://localhost:{port}/ ./",
+ ]
+ subprocess.check_call(cmd, cwd=tmpdirname)
+
+ newpkgs = set()
+ with open(tmpdirname + "/pkglist", encoding="utf8") as f:
+ for line in f:
+ line = line.rstrip()
+ n, v = line.split("=")
+ a = nativearch
+ if ":" in n:
+ n, a = n.split(":")
+ newpkgs.add((n, a, v))
+
+ return newpkgs
+
+
+@contextmanager
+def proxy_snapshot(tmpdirname):
+ httpd = socketserver.TCPServer(
+ # the default address family for socketserver is AF_INET so we
+ # explicitly bind to ipv4 localhost
+ ("localhost", 0),
+ partial(Proxy, directory=tmpdirname + "/cache"),
+ )
+ # run server in a new thread
+ server_thread = threading.Thread(target=httpd.serve_forever)
+ server_thread.daemon = True
+ # start thread
+ server_thread.start()
+ # retrieve port (in case it was generated automatically)
+ _, port = httpd.server_address
+ try:
+ yield port
+ finally:
+ httpd.shutdown()
+ httpd.server_close()
+ server_thread.join()
+
+
+def download_packages(
+ tmpdirname, sources: list[Source], pkgs, nativearch, foreignarches
+):
+ for d in [
+ "/etc/apt/apt.conf.d",
+ "/etc/apt/sources.list.d",
+ "/etc/apt/preferences.d",
+ "/var/cache/apt",
+ "/var/lib/apt/lists/partial",
+ "/var/lib/dpkg",
+ ]:
+ os.makedirs(tmpdirname + "/" + d)
+ # apt-get update requires /var/lib/dpkg/status
+ with open(tmpdirname + "/var/lib/dpkg/status", "w", encoding="utf8") as f:
+ pass
+ with open(tmpdirname + "/apt.conf", "w", encoding="utf8") as f:
+ f.write(f'Apt::Architecture "{nativearch}";\n')
+ f.write("Apt::Architectures { " + f'"{nativearch}"; ')
+ for a in foreignarches:
+ f.write(f'"{a}"; ')
+ f.write("};\n")
+ f.write('Dir "' + tmpdirname + '";\n')
+ f.write('Dir::Etc::Trusted "/etc/apt/trusted.gpg";\n')
+ f.write('Dir::Etc::TrustedParts "/usr/share/keyrings/";\n')
+ f.write('Acquire::Languages "none";\n')
+ # f.write("Acquire::http::Dl-Limit \"1000\";\n")
+ # f.write("Acquire::https::Dl-Limit \"1000\";\n")
+ f.write('Acquire::Retries "5";\n')
+ # ignore expired signatures
+ f.write('Apt::Key::gpgvcommand "/usr/libexec/mmdebstrap/gpgvnoexpkeysig";\n')
+
+ os.makedirs(tmpdirname + "/cache")
+
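+    # apt talks to a local proxy which forwards all requests to
+    # snapshot.debian.org with rate limiting and retries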
+ with proxy_snapshot(tmpdirname) as port:
+ with open(tmpdirname + "/etc/apt/sources.list", "w", encoding="utf8") as f:
+ for source in sources:
+ f.write(source.deb_line(f"localhost:{port}"))
+ subprocess.check_call(
+ ["apt-get", "update", "--error-on=any"],
+ env={"APT_CONFIG": tmpdirname + "/apt.conf"},
+ )
+ for i, nav in enumerate(pkgs):
+ print(f"{i + 1} of {len(pkgs)}")
+ with tempfile.TemporaryDirectory() as tmpdir2:
+ subprocess.check_call(
+ ["apt-get", "download", "--yes", f"{nav[0]}:{nav[1]}={nav[2]}"],
+ cwd=tmpdir2,
+ env={"APT_CONFIG": tmpdirname + "/apt.conf"},
+ )
+ debs = os.listdir(tmpdir2)
+ assert len(debs) == 1
+ # Normalize the package name to how it appears in the archive.
+ # Mainly this removes the epoch from the filename, see
+ # https://bugs.debian.org/645895
+ # This avoids apt bugs connected with a percent sign in the
+ # filename as they occasionally appear, for example as
+ # introduced in apt 2.1.15 and later fixed by DonKult:
+ # https://salsa.debian.org/apt-team/apt/-/merge_requests/175
+ subprocess.check_call(["dpkg-name", tmpdir2 + "/" + debs[0]])
+ debs = os.listdir(tmpdir2)
+ assert len(debs) == 1
+ shutil.move(tmpdir2 + "/" + debs[0], tmpdirname + "/cache")
+
+
+def main(arguments: list[str]) -> None:
+ args = parse_args(arguments)
+ if args.packages:
+ pkgs = [v for sublist in args.packages for v in sublist]
+ if args.architecture is None:
+ arches = {a for _, a, _ in pkgs if a is not None}
+ if len(arches) == 0:
+ print("packages are not architecture qualified", file=sys.stderr)
+ print(
+ "use --architecture to set the native architecture", file=sys.stderr
+ )
+ sys.exit(1)
+ elif len(arches) > 1:
+ print("more than one architecture in the package list", file=sys.stderr)
+ print(
+ "use --architecture to set the native architecture", file=sys.stderr
+ )
+ sys.exit(1)
+ nativearch = arches.pop()
+ assert arches == set()
+ else:
+ nativearch = args.architecture
+ else:
+ pkgs, nativearch = args.buildinfo
+    # packages without an architecture qualifier default to the native
+    # architecture
+ pkgs = [(n, a if a is not None else nativearch, v) for n, a, v in pkgs]
+ # make package list unique
+ pkgs = list(set(pkgs))
+ # compute foreign architectures
+ foreignarches = set()
+ for _, a, _ in pkgs:
+ if a != nativearch:
+ foreignarches.add(a)
+
+ for tool in [
+ "equivs-build",
+ "apt-ftparchive",
+ "mmdebstrap",
+ "apt-get",
+ "dpkg-name",
+ ]:
+ if shutil.which(tool) is None:
+ print(f"{tool} is required but not installed", file=sys.stderr)
+ sys.exit(1)
+
+ sources = compute_sources(pkgs, nativearch, args.ignore_notfound)
+
+ if args.sources_list_only:
+ for source in sources:
+ print(source.deb_line(), end="")
+ sys.exit(0)
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ download_packages(tmpdirname, sources, pkgs, nativearch, foreignarches)
+
+ create_repo(tmpdirname, pkgs)
+
+ newpkgs = run_mmdebstrap(
+ tmpdirname, sources, nativearch, foreignarches, args.output
+ )
+
+ # make sure that the installed packages match the requested package
+ # list
+ assert set(newpkgs) == set(pkgs)
+
+
+if __name__ == "__main__":
+ main(sys.argv[1:])