# SPDX-FileCopyrightText: 2024 Johannes Schauer Marin Rodrigues <josch@debian.org>
# SPDX-License-Identifier: MIT
import http.client
import http.server
import logging
import os
import pathlib
import shutil
import tempfile
import threading
import urllib.parse
from functools import partial
from http import HTTPStatus


# we use an HTTP proxy for two reasons:
# 1. it allows us to cache package data locally, which is useful even for
#    single runs because temporally close snapshot timestamps share packages,
#    and thus we reduce the load on snapshot.d.o, which is also useful because
# 2. snapshot.d.o requires manual bandwidth throttling or else it will cut
#    our TCP connection. Instead of using Acquire::http::Dl-Limit as an apt
#    option, we use a proxy so that we only throttle the initial download and
#    then serve the data at full speed once we have it locally.
#
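# For example (illustration only, the real port is chosen in setupcache()
# below), a client such as apt could be pointed at the proxy with:
#     Acquire::http::Proxy "http://127.0.0.1:8080";
#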
# We use SimpleHTTPRequestHandler over BaseHTTPRequestHandler for its
# "directory" member. We disable its other features, namely do_HEAD.
class Proxy(http.server.SimpleHTTPRequestHandler):
def do_HEAD(self):
raise NotImplementedError

    # no idea how to split this function into parts without making it
    # unreadable
def do_GET(self):
assert int(self.headers.get("Content-Length", 0)) == 0
assert self.headers["Host"]
pathprefix = "http://" + self.headers["Host"] + "/"
assert self.path.startswith(pathprefix)
sanitizedpath = urllib.parse.unquote(self.path.removeprefix(pathprefix))
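        # sanitizedpath now looks, for example (illustrative snapshot.d.o
        # path), like "archive/debian/20240101T023000Z/dists/unstable/InRelease"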
# check validity and extract the timestamp
try:
chunk1, chunk2, timestamp, _ = sanitizedpath.split("/", 3)
except ValueError:
logging.error("don't know how to handle this request: %s", self.path)
self.send_error(HTTPStatus.BAD_REQUEST, f"Bad request path ({self.path})")
return
if ["archive", "debian"] != [chunk1, chunk2]:
logging.error("don't know how to handle this request: %s", self.path)
self.send_error(HTTPStatus.BAD_REQUEST, f"Bad request path ({self.path})")
return
        # make sure this timestamp's pool directory is symlinked to the shared
        # global pool so that identical packages from different snapshot
        # timestamps are only stored once
linkname = os.path.join(self.directory, chunk1, chunk2, timestamp, "pool")
if not os.path.exists(linkname):
os.makedirs(
os.path.join(self.directory, chunk1, chunk2, timestamp), exist_ok=True
)
try:
os.symlink("../../../pool", linkname)
except FileExistsError:
pass
cachedir = pathlib.Path(self.directory)
path = cachedir / sanitizedpath
        # the file is already cached, just send it back to the client
if path.exists() and path.stat().st_size > 0:
self.wfile.write(b"HTTP/1.1 200 OK\r\n")
self.send_header("Content-Length", path.stat().st_size)
self.end_headers()
with path.open(mode="rb") as new:
while True:
buf = new.read(64 * 1024) # same as shutil uses
if not buf:
break
self.wfile.write(buf)
self.wfile.flush()
return
self.do_download(path)

    # pylint: disable=too-many-branches,too-many-statements
def do_download(self, path):
# download fresh copy
todownload = downloaded_bytes = 0
partial_size = None
# The PID is part of the name of the temporary file. That way, multiple
# concurrent processes can write out partial files without conflicting
# with each other and while still maintaining reproducible paths
# between individual calls of do_download() by the same process.
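        # e.g. ".../foo_1.0_amd64.deb" is first written to
        # ".../foo_1.0_amd64.<pid>.part" (Path.with_suffix() replaces ".deb")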
tmppath = path.with_suffix(f".{os.getpid()}.part")
if self.headers.get("Range"):
assert tmppath.is_file()
assert self.headers["Range"].startswith("bytes=")
assert self.headers["Range"].endswith("-")
reqrange = int(
self.headers["Range"].removeprefix("bytes=").removesuffix("-")
)
assert reqrange <= tmppath.stat().st_size
partial_size = reqrange
else:
tmppath.parent.mkdir(parents=True, exist_ok=True)
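        # forward the request to the host named in the request, reusing the
        # client's request target and headers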
conn = http.client.HTTPConnection(self.headers["Host"], timeout=30)
conn.request("GET", self.path, None, dict(self.headers))
try:
res = conn.getresponse()
except TimeoutError:
try:
self.send_error(504) # Gateway Timeout
except BrokenPipeError:
pass
return
if res.status == 302:
# clean up connection so it can be reused for the 302 redirect
res.read()
res.close()
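            # snapshot.d.o redirects package requests to /file/... paths (see
            # the assert below); fetch the redirect target on the same
            # connection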
newpath = res.getheader("Location")
assert newpath.startswith("/file/"), newpath
conn.request("GET", newpath, None, dict(self.headers))
try:
res = conn.getresponse()
except TimeoutError:
try:
self.send_error(504) # Gateway Timeout
except BrokenPipeError:
pass
return
if partial_size is not None:
if res.status != 206:
try:
self.send_error(res.status)
except BrokenPipeError:
pass
return
self.wfile.write(b"HTTP/1.1 206 Partial Content\r\n")
logging.info("proxy: resuming download from byte %d", partial_size)
else:
if res.status != 200:
try:
self.send_error(res.status)
except BrokenPipeError:
pass
return
self.wfile.write(b"HTTP/1.1 200 OK\r\n")
todownload = int(res.getheader("Content-Length"))
for key, value in res.getheaders():
# do not allow a persistent connection
if key == "connection":
continue
self.send_header(key, value)
self.end_headers()
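        # when resuming, the server must acknowledge exactly the requested
        # range, e.g. "Content-Range: bytes 1000-4095/4096" when resuming at
        # byte 1000 of a 4096 byte file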
if partial_size is not None:
total_size = todownload + partial_size
assert (
res.getheader("Content-Range")
== f"bytes {partial_size}-{total_size - 1}/{total_size}"
), (
res.getheader("Content-Range"),
f"bytes {partial_size}-{total_size - 1}/{total_size}",
)
downloaded_bytes = 0
with tmppath.open(mode="ab") as file:
if partial_size is not None and file.tell() != partial_size:
file.seek(partial_size, os.SEEK_SET)
            # we are not using shutil.copyfileobj() because we want to write
            # to two file objects simultaneously and, if needed, throttle the
            # writing speed (see the commented-out sleep() below)
while True:
buf = res.read(64 * 1024) # same as shutil uses
if not buf:
break
downloaded_bytes += len(buf)
try:
self.wfile.write(buf)
except BrokenPipeError:
break
file.write(buf)
# now that snapshot.d.o is fixed, we do not need to throttle
# the download speed anymore
# sleep(0.5) # 128 kB/s
self.wfile.flush()
if todownload == downloaded_bytes and downloaded_bytes > 0:
tmppath.rename(path)

    # pylint: disable=redefined-builtin
    def log_message(self, format, *args):
        # silence the default per-request logging of the base handler
        pass


def setupcache(cache, port):
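    """Start the caching proxy on 127.0.0.1 and return (port, teardown).

    cache is the path of a persistent cache directory or a falsy value, in
    which case a throw-away temporary directory is used. port is the TCP
    port to listen on (0 picks a free port). The returned teardown() stops
    the server and, for a temporary cache, removes it again.
    """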
if cache:
cachedir = cache
for path in pathlib.Path(cachedir).glob("**/*.part"):
# we are not deleting *.part files so that multiple processes can
# use the cache at the same time without having their *.part files
# deleted by another process
logging.warning(
"found partial file in cache, consider deleting it manually: %s", path
)
else:
cachedir = tempfile.mkdtemp(prefix="debbisect")
logging.info("using cache directory: %s", cachedir)
os.makedirs(cachedir + "/pool", exist_ok=True)
    # we are not using a ThreadedHTTPServer because
    #  - it would add complexity: if one download has created a .part file,
    #    apt may stop reading while we are still reading from snapshot and
    #    then retry the same download, writing to the same .part file opened
    #    in another thread
    #  - snapshot.d.o really doesn't like fast downloads, so we do it serially
httpd = http.server.HTTPServer(
server_address=("127.0.0.1", port),
RequestHandlerClass=partial(Proxy, directory=cachedir),
)
# run server in a new thread
server_thread = threading.Thread(target=httpd.serve_forever)
server_thread.daemon = True
# start thread
server_thread.start()
    # retrieve the actual port (in case port 0 was passed and one was chosen
    # automatically)
_, port = httpd.server_address
def teardown():
httpd.shutdown()
httpd.server_close()
server_thread.join()
if not cache:
            # this should be a temporary directory but let's still be super
            # careful
if os.path.exists(cachedir + "/pool"):
shutil.rmtree(cachedir + "/pool")
if os.path.exists(cachedir + "/archive"):
shutil.rmtree(cachedir + "/archive")
os.rmdir(cachedir)
return port, teardown
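

# A minimal usage sketch (not part of the original module): start the proxy on
# an ephemeral port with a throw-away cache, print the apt option that would
# point a client at it, and shut everything down again. Acquire::http::Proxy
# is standard apt configuration; how the surrounding tools actually use
# setupcache() is not shown here.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    proxyport, teardown_proxy = setupcache(None, 0)
    print(f'-o Acquire::http::Proxy="http://127.0.0.1:{proxyport}"')
    teardown_proxy()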