# SPDX-FileCopyrightText: 2024 Johannes Schauer Marin Rodrigues <josch@debian.org>
# SPDX-License-Identifier: MIT

import http.client
import http.server
import logging
import os
import pathlib
import shutil
import tempfile
import threading
import urllib.parse
from functools import partial
from http import HTTPStatus


# We use an HTTP proxy for two reasons:
#
# 1. It allows us to cache package data locally. This is useful even for
#    single runs, because temporally close snapshot timestamps share packages
#    and we thus reduce the load on snapshot.d.o.
# 2. snapshot.d.o requires manual bandwidth throttling or else it will cut
#    our TCP connection. Instead of using Acquire::http::Dl-Limit as an apt
#    option, we use a proxy so that only the initial download is throttled
#    and the data is served at full speed once we have it locally.
#
# We use SimpleHTTPRequestHandler over BaseHTTPRequestHandler for its
# "directory" member. We disable its other features, namely do_HEAD.
class Proxy(http.server.SimpleHTTPRequestHandler):
    def do_HEAD(self):
        raise NotImplementedError

    # no idea how to split this function into parts without making it
    # unreadable
    def do_GET(self):
        assert int(self.headers.get("Content-Length", 0)) == 0
        assert self.headers["Host"]
        pathprefix = "http://" + self.headers["Host"] + "/"
        assert self.path.startswith(pathprefix)
        sanitizedpath = urllib.parse.unquote(self.path.removeprefix(pathprefix))

        # check validity and extract the timestamp
        try:
            chunk1, chunk2, timestamp, _ = sanitizedpath.split("/", 3)
        except ValueError:
            logging.error("don't know how to handle this request: %s", self.path)
            self.send_error(HTTPStatus.BAD_REQUEST, f"Bad request path ({self.path})")
            return
        if ["archive", "debian"] != [chunk1, chunk2]:
            logging.error("don't know how to handle this request: %s", self.path)
            self.send_error(HTTPStatus.BAD_REQUEST, f"Bad request path ({self.path})")
            return
        # make sure the pool directory is symlinked to the global pool
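        # The resulting cache layout looks like this (names illustrative):
        #   <cachedir>/pool/...                          shared global pool
        #   <cachedir>/archive/debian/<timestamp>/pool -> ../../../pool
        # so all snapshot timestamps share one pool of downloaded packages.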
        linkname = os.path.join(self.directory, chunk1, chunk2, timestamp, "pool")
        if not os.path.exists(linkname):
            os.makedirs(
                os.path.join(self.directory, chunk1, chunk2, timestamp), exist_ok=True
            )
            try:
                os.symlink("../../../pool", linkname)
            except FileExistsError:
                pass

        cachedir = pathlib.Path(self.directory)
        path = cachedir / sanitizedpath

        # the file is already cached; just send it back to the client
        if path.exists() and path.stat().st_size > 0:
            self.wfile.write(b"HTTP/1.1 200 OK\r\n")
            self.send_header("Content-Length", path.stat().st_size)
            self.end_headers()
            with path.open(mode="rb") as new:
                while True:
                    buf = new.read(64 * 1024)  # same as shutil uses
                    if not buf:
                        break
                    self.wfile.write(buf)
            self.wfile.flush()
            return

        self.do_download(path)

    # pylint: disable=too-many-branches,too-many-statements
    def do_download(self, path):
        # download a fresh copy
        todownload = downloaded_bytes = 0
        partial_size = None
        # The PID is part of the name of the temporary file. That way, multiple
        # concurrent processes can write out partial files without conflicting
        # with each other, while still maintaining reproducible paths between
        # individual calls of do_download() by the same process.
        tmppath = path.with_suffix(f".{os.getpid()}.part")
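        # For example (an illustrative name): PID 1234 downloading
        # "foo_1.0_amd64.deb" writes to "foo_1.0_amd64.1234.part".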
        if self.headers.get("Range"):
            assert tmppath.is_file()
            assert self.headers["Range"].startswith("bytes=")
            assert self.headers["Range"].endswith("-")
            reqrange = int(
                self.headers["Range"].removeprefix("bytes=").removesuffix("-")
            )
            assert reqrange <= tmppath.stat().st_size
            partial_size = reqrange
        else:
            tmppath.parent.mkdir(parents=True, exist_ok=True)
        conn = http.client.HTTPConnection(self.headers["Host"], timeout=30)
        conn.request("GET", self.path, None, dict(self.headers))
        try:
            res = conn.getresponse()
        except TimeoutError:
            try:
                self.send_error(504)  # Gateway Timeout
            except BrokenPipeError:
                pass
            return
        if res.status == 302:
            # drain and close the response so the connection can be reused
            # to follow the 302 redirect
            res.read()
            res.close()
            newpath = res.getheader("Location")
            assert newpath.startswith("/file/"), newpath
            conn.request("GET", newpath, None, dict(self.headers))
            try:
                res = conn.getresponse()
            except TimeoutError:
                try:
                    self.send_error(504)  # Gateway Timeout
                except BrokenPipeError:
                    pass
                return
        if partial_size is not None:
            if res.status != 206:
                try:
                    self.send_error(res.status)
                except BrokenPipeError:
                    pass
                return
            self.wfile.write(b"HTTP/1.1 206 Partial Content\r\n")
            logging.info("proxy: resuming download from byte %d", partial_size)
        else:
            if res.status != 200:
                try:
                    self.send_error(res.status)
                except BrokenPipeError:
                    pass
                return
            self.wfile.write(b"HTTP/1.1 200 OK\r\n")
        todownload = int(res.getheader("Content-Length"))
        for key, value in res.getheaders():
            # do not allow a persistent connection
            if key.lower() == "connection":
                continue
            self.send_header(key, value)
        self.end_headers()
        if partial_size is not None:
            total_size = todownload + partial_size
            assert (
                res.getheader("Content-Range")
                == f"bytes {partial_size}-{total_size - 1}/{total_size}"
            ), (
                res.getheader("Content-Range"),
                f"bytes {partial_size}-{total_size - 1}/{total_size}",
            )
        downloaded_bytes = 0
        with tmppath.open(mode="ab") as file:
            if partial_size is not None and file.tell() != partial_size:
                file.seek(partial_size, os.SEEK_SET)
            # we are not using shutil.copyfileobj() because we want to
            # write to two file objects simultaneously and throttle the
            # writing speed to 1024 kB/s
            while True:
                buf = res.read(64 * 1024)  # same as shutil uses
                if not buf:
                    break
                downloaded_bytes += len(buf)
                try:
                    self.wfile.write(buf)
                except BrokenPipeError:
                    break
                file.write(buf)
                # now that snapshot.d.o is fixed, we do not need to throttle
                # the download speed anymore
                # sleep(0.5)  # 128 kB/s
        self.wfile.flush()
        if todownload == downloaded_bytes and downloaded_bytes > 0:
            tmppath.rename(path)

    # pylint: disable=redefined-builtin
    def log_message(self, format, *args):
        pass


def setupcache(cache, port):
    if cache:
        cachedir = cache
        for path in pathlib.Path(cachedir).glob("**/*.part"):
            # we are not deleting *.part files so that multiple processes can
            # use the cache at the same time without having their *.part files
            # deleted by another process
            logging.warning(
                "found partial file in cache, consider deleting it manually: %s", path
            )
    else:
        cachedir = tempfile.mkdtemp(prefix="debbisect")
        logging.info("using cache directory: %s", cachedir)
    os.makedirs(cachedir + "/pool", exist_ok=True)

    # we are not using a ThreadedHTTPServer because
    # - it would need additional complexity: if one download created a .part
    #   file, apt stopped reading while we were still reading from snapshot,
    #   and apt then retried the same download, a second thread would end up
    #   writing to the same .part file that is still open in another thread
    # - snapshot.d.o really doesn't like fast downloads, so we do it serially
    httpd = http.server.HTTPServer(
        server_address=("127.0.0.1", port),
        RequestHandlerClass=partial(Proxy, directory=cachedir),
    )
    # run the server in a new thread
    server_thread = threading.Thread(target=httpd.serve_forever)
    server_thread.daemon = True
    # start thread
    server_thread.start()
    # retrieve the port (in case it was chosen automatically, e.g. port 0)
    _, port = httpd.server_address

    def teardown():
        httpd.shutdown()
        httpd.server_close()
        server_thread.join()
        if not cache:
            # this should be a temporary directory but let's still be super
            # careful
            if os.path.exists(cachedir + "/pool"):
                shutil.rmtree(cachedir + "/pool")
            if os.path.exists(cachedir + "/archive"):
                shutil.rmtree(cachedir + "/archive")
            os.rmdir(cachedir)

    return port, teardown
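

# A minimal usage sketch (an illustrative addition, not part of the original
# module): start the caching proxy on an ephemeral port, route one request
# through it with urllib, then tear it down. It assumes network access to
# snapshot.debian.org and an exact, existing snapshot timestamp; the round
# timestamp below is only an example.
if __name__ == "__main__":
    import urllib.request

    logging.basicConfig(level=logging.INFO)
    port, teardown = setupcache(cache=None, port=0)
    # send requests through the proxy so its do_GET sees absolute URIs
    proxy_opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({"http": f"http://127.0.0.1:{port}"})
    )
    try:
        with proxy_opener.open(
            "http://snapshot.debian.org/archive/debian/20240101T000000Z/"
            "dists/unstable/Release"
        ) as res:
            print(res.status, len(res.read()), "bytes")
    finally:
        teardown()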