From 3c33e01482cb0481e2472ee49fa55b0d7f818c26 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 29 Apr 2024 06:25:33 +0200 Subject: Adding upstream version 0.1.2. Signed-off-by: Daniel Baumann --- src/mdurl/__init__.py | 18 +++ src/mdurl/_decode.py | 104 +++++++++++++++++ src/mdurl/_encode.py | 85 ++++++++++++++ src/mdurl/_format.py | 27 +++++ src/mdurl/_parse.py | 304 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/mdurl/_url.py | 14 +++ src/mdurl/py.typed | 1 + 7 files changed, 553 insertions(+) create mode 100644 src/mdurl/__init__.py create mode 100644 src/mdurl/_decode.py create mode 100644 src/mdurl/_encode.py create mode 100644 src/mdurl/_format.py create mode 100644 src/mdurl/_parse.py create mode 100644 src/mdurl/_url.py create mode 100644 src/mdurl/py.typed (limited to 'src') diff --git a/src/mdurl/__init__.py b/src/mdurl/__init__.py new file mode 100644 index 0000000..cdbb640 --- /dev/null +++ b/src/mdurl/__init__.py @@ -0,0 +1,18 @@ +__all__ = ( + "decode", + "DECODE_DEFAULT_CHARS", + "DECODE_COMPONENT_CHARS", + "encode", + "ENCODE_DEFAULT_CHARS", + "ENCODE_COMPONENT_CHARS", + "format", + "parse", + "URL", +) +__version__ = "0.1.2" # DO NOT EDIT THIS LINE MANUALLY. 
def get_decode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII lookup table used when decoding.

    Entry ``i`` of the table is ``chr(i)``, except for characters listed in
    *exclude*: those map to their upper-case percent-escape (for example
    ``"/"`` -> ``"%2F"``) so that a decoded byte is written back escaped.

    Tables are memoized in the module-level ``decode_cache`` dict, keyed by
    the *exclude* string, and the same list object is returned on every call
    with the same *exclude*.

    Characters of *exclude* outside the ASCII range are ignored. The table
    only has 128 slots, so indexing it with a larger code point used to raise
    ``IndexError`` and leave a half-initialised table in the cache.
    """
    try:
        return decode_cache[exclude]
    except KeyError:
        pass

    cache = [chr(code) for code in range(128)]
    for ch in exclude:
        ch_code = ord(ch)
        if ch_code < 128:
            cache[ch_code] = f"%{ch_code:02X}"

    # Publish the table only once it is fully built, so an exception can
    # never leave a partial table behind for later callers.
    decode_cache[exclude] = cache
    return cache
def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
    """Decode one run of consecutive ``%XX`` escapes matched by ``decode``.

    The matched text is walked three characters (one escaped byte) at a time:

    - a byte below 0x80 is replaced through *cache*, the lookup table indexed
      by byte value (it yields either the character or its escape);
    - a lead byte of a 2-, 3- or 4-byte UTF-8 sequence (``110xxxxx``,
      ``1110xxxx``, ``11110xxx``) that is followed by enough ``10xxxxxx``
      continuation bytes is decoded as UTF-8; if the bytes are not valid
      UTF-8, one U+FFFD is emitted per byte of the sequence;
    - any other byte becomes a single U+FFFD.
    """
    escaped = match.group()
    end = len(escaped)
    # (mask applied to the lead byte, expected result, bytes in the sequence)
    layouts = ((0xE0, 0xC0, 2), (0xF0, 0xE0, 3), (0xF8, 0xF0, 4))
    pieces: list[str] = []

    pos = 0
    while pos < end:
        lead = int(escaped[pos + 1 : pos + 3], 16)

        if lead < 0x80:
            pieces.append(cache[lead])
            pos += 3
            continue

        for mask, prefix, size in layouts:
            span = 3 * size  # characters occupied by `size` escaped bytes
            if (lead & mask) != prefix or pos + span - 3 >= end:
                continue
            trail = [
                int(escaped[k + 1 : k + 3], 16)
                for k in range(pos + 3, pos + span, 3)
            ]
            if any((byte & 0xC0) != 0x80 for byte in trail):
                continue
            try:
                pieces.append(bytes([lead, *trail]).decode())
            except UnicodeDecodeError:
                pieces.append("\ufffd" * size)
            pos += span
            break
        else:
            # Not the start of a complete, well-formed multi-byte sequence.
            pieces.append("\ufffd")
            pos += 3

    return "".join(pieces)
# Create a lookup array where anything but characters in the `exclude`
# string and alphanumeric chars is percent-encoded.
def get_encode_cache(exclude: str) -> Sequence[str]:
    """Return the 128-entry ASCII lookup table used when encoding.

    Entry ``i`` of the table is the character ``chr(i)`` itself when it is an
    ASCII letter or digit, or is listed in *exclude*; otherwise it is the
    upper-case percent-escape of that character (for example ``" "`` ->
    ``"%20"``).

    Tables are memoized in the module-level ``encode_cache`` dict, keyed by
    the *exclude* string, and the same list object is returned on every call
    with the same *exclude*.

    Characters of *exclude* outside the ASCII range are ignored. The table
    only has 128 slots, so indexing it with a larger code point used to raise
    ``IndexError`` and leave a half-initialised table in the cache.
    """
    try:
        return encode_cache[exclude]
    except KeyError:
        pass

    cache = [
        # always allow unencoded alphanumeric characters
        ch if ch in ASCII_LETTERS_AND_DIGITS else f"%{code:02X}"
        for code, ch in enumerate(map(chr, range(128)))
    ]
    for ch in exclude:
        code = ord(ch)
        if code < 128:
            cache[code] = ch

    # Publish the table only once it is fully built, so an exception can
    # never leave a partial table behind for later callers.
    encode_cache[exclude] = cache
    return cache
# Encode unsafe characters with percent-encoding, skipping already
# encoded sequences.
#
#  - string       - string to encode
#  - exclude      - list of characters to ignore (in addition to a-zA-Z0-9)
#  - keep_escaped - don't encode '%' in a correct escape sequence (default: true)
def encode(
    string: str, exclude: str = ENCODE_DEFAULT_CHARS, *, keep_escaped: bool = True
) -> str:
    """Percent-encode *string*, leaving safe characters untouched.

    ASCII characters are mapped through the table built by
    ``get_encode_cache(exclude)``: letters, digits and the characters in
    *exclude* are kept, everything else becomes ``%XX``. Non-ASCII characters
    are encoded as the percent-escaped bytes of their UTF-8 form.

    When *keep_escaped* is true, a ``%`` that is followed by two hex digits
    (and at least one more character position, mirroring the ``i + 2 < l``
    bound of the original loop) is copied through unchanged instead of being
    re-encoded as ``%25``.

    Surrogate code points cannot be UTF-8 encoded on their own. A high
    surrogate directly followed by a low surrogate is combined into the
    single code point the pair represents and encoded as such; any other
    surrogate is replaced by ``%EF%BF%BD`` (U+FFFD).
    """
    cache = get_encode_cache(exclude)
    pieces: list[str] = []

    length = len(string)
    i = 0
    while i < length:
        char = string[i]
        code = ord(char)

        # "%" starting an already well-formed escape sequence
        if (
            keep_escaped
            and code == 0x25
            and i + 2 < length
            and all(c in hexdigits for c in string[i + 1 : i + 3])
        ):
            pieces.append(string[i : i + 3])
            i += 3
            continue

        if code < 128:
            pieces.append(cache[code])
            i += 1
            continue

        if 0xD800 <= code <= 0xDFFF:
            if code <= 0xDBFF and i + 1 < length:
                next_code = ord(string[i + 1])
                if 0xDC00 <= next_code <= 0xDFFF:
                    # `quote` encodes with strict UTF-8 and raises on
                    # surrogates, so merge the pair into one code point
                    # before handing it over.
                    combined = chr(
                        0x10000 + ((code - 0xD800) << 10) + (next_code - 0xDC00)
                    )
                    pieces.append(encode_uri_component(combined))
                    i += 2
                    continue
            pieces.append("%EF%BF%BD")
            i += 1
            continue

        pieces.append(encode_uri_component(char))
        i += 1

    return "".join(pieces)
index 0000000..ffeeac7 --- /dev/null +++ b/src/mdurl/_parse.py @@ -0,0 +1,304 @@ +# Copyright Joyent, Inc. and other Node contributors. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the +# following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN +# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +# USE OR OTHER DEALINGS IN THE SOFTWARE. + + +# Changes from joyent/node: +# +# 1. No leading slash in paths, +# e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/` +# +# 2. Backslashes are not replaced with slashes, +# so `http:\\example.org\` is treated like a relative path +# +# 3. Trailing colon is treated like a part of the path, +# i.e. in `http://example.org:foo` pathname is `:foo` +# +# 4. Nothing is URL-encoded in the resulting object, +# (in joyent/node some chars in auth and paths are encoded) +# +# 5. `url.parse()` does not have `parseQueryString` argument +# +# 6. Removed extraneous result properties: `host`, `path`, `query`, etc., +# which can be constructed using other parts of the url. 
from __future__ import annotations

from collections import defaultdict
import re

from mdurl._url import URL

# Reference: RFC 3986, RFC 1808, RFC 2396

# define these here so at least they only have to be
# compiled once on the first module load.
# A leading scheme including its trailing colon, e.g. "http:".
PROTOCOL_PATTERN = re.compile(r"^([a-z0-9.+-]+:)", flags=re.IGNORECASE)
# A trailing ":" followed by zero or more digits (so a bare ":" matches too).
PORT_PATTERN = re.compile(r":[0-9]*$")

# Special case for a simple path URL
# (one or two leading slashes, an optional "?query", and no whitespace).
SIMPLE_PATH_PATTERN = re.compile(r"^(//?(?!/)[^?\s]*)(\?[^\s]*)?$")

# RFC 2396: characters reserved for delimiting URLs.
# We actually just auto-escape these.
DELIMS = ("<", ">", '"', "`", " ", "\r", "\n", "\t")

# RFC 2396: characters not allowed for various reasons.
# Built on top of DELIMS, so "`" appears in the resulting tuple twice.
UNWISE = ("{", "}", "|", "\\", "^", "`") + DELIMS

# Allowed by RFCs, but cause of XSS attacks.  Always escape these.
AUTO_ESCAPE = ("'",) + UNWISE
# Characters that are never ever allowed in a hostname.
# Note that any invalid chars are also handled, but these
# are the ones that are *expected* to be seen, so we fast-path
# them.
NON_HOST_CHARS = ("%", "/", "?", ";", "#") + AUTO_ESCAPE
# Characters that terminate the "auth@host" portion of a URL.
HOST_ENDING_CHARS = ("/", "?", "#")
# A hostname longer than this is discarded (replaced by "") by the parser.
HOSTNAME_MAX_LEN = 255
# A whole dot-separated hostname label: at most 63 characters from the
# allowed set.
HOSTNAME_PART_PATTERN = re.compile(r"^[+a-z0-9A-Z_-]{0,63}$")
# Splits a label into its valid leading part (group 1) and the rest (group 2).
HOSTNAME_PART_START = re.compile(r"^([+a-z0-9A-Z_-]{0,63})(.*)$")
# protocols that can allow "unsafe" and "unwise" chars.
# NOTE(review): no table follows this comment in this module.

# protocols that never have a hostname.
# Keys are listed both with and without the trailing colon. Being a
# defaultdict(bool), a `[key]` lookup of a missing key yields False and
# also stores that key in the table.
HOSTLESS_PROTOCOL = defaultdict(
    bool,
    {
        "javascript": True,
        "javascript:": True,
    },
)
# protocols that always contain a // bit.
class MutableURL:
    """Mutable scratch record that ``parse`` fills with the parts of a URL.

    Every component starts out as ``None`` (``slashes`` as ``False``) and is
    only assigned when the corresponding part is found in the input.
    Nothing is URL-decoded or URL-encoded; the stored strings are slices of
    the input.
    """

    def __init__(self) -> None:
        self.protocol: str | None = None  # scheme including the trailing ":"
        self.slashes: bool = False  # True when "//" introduced the host
        self.auth: str | None = None  # text before the last "@" of the host part
        self.port: str | None = None  # digits after the last ":" of the host
        self.hostname: str | None = None  # host without port and IPv6 brackets
        self.hash: str | None = None  # fragment including the leading "#"
        self.search: str | None = None  # query including the leading "?"
        self.pathname: str | None = None

    def parse(self, url: str, slashes_denote_host: bool) -> "MutableURL":
        """Split *url* into components, store them on ``self``, return ``self``.

        Leading and trailing whitespace is stripped first. When
        *slashes_denote_host* is false and *url* contains no ``#``, a URL
        that is just a path with an optional query is handled by a single
        regular expression. When *slashes_denote_host* is true, a leading
        ``//`` introduces a host even without a protocol (``//foo/bar`` is
        host ``foo``, path ``/bar``).

        The protocol tables are queried with ``.get`` rather than ``[...]``:
        they are ``defaultdict`` instances, and subscripting would
        permanently store every scheme ever parsed in module-level state.
        """
        lower_proto = ""
        slashes = False

        # trim before proceeding.
        # This is to support parse stuff like "  http://foo.com  \n"
        rest = url.strip()

        if not slashes_denote_host and "#" not in url:
            # Try fast path regexp
            simple_path = SIMPLE_PATH_PATTERN.match(rest)
            if simple_path:
                self.pathname = simple_path.group(1)
                if simple_path.group(2):
                    self.search = simple_path.group(2)
                return self

        proto = ""
        proto_match = PROTOCOL_PATTERN.match(rest)
        if proto_match:
            proto = proto_match.group()
            lower_proto = proto.lower()
            self.protocol = proto
            rest = rest[len(proto) :]

        hostless = HOSTLESS_PROTOCOL.get(proto, False)

        # figure out if it's got a host
        # user@server is *always* interpreted as a hostname, and url
        # resolution will treat //foo/bar as host=foo,path=bar because that's
        # how the browser resolves relative URLs.
        if slashes_denote_host or proto or re.search(r"^//[^@/]+@[^@/]+", rest):
            slashes = rest.startswith("//")
            if slashes and not (proto and hostless):
                rest = rest[2:]
                self.slashes = True

        if not hostless and (
            slashes or (proto and not SLASHED_PROTOCOL.get(proto, False))
        ):
            # there's a hostname.
            # the first instance of /, ?, ;, or # ends the host.
            #
            # If there is an @ in the hostname, then non-host chars *are*
            # allowed to the left of the last @ sign, unless some host-ending
            # character comes *before* the @-sign.
            # URLs are obnoxious.
            #
            # ex:
            # http://a@b@c/ => user:a@b host:c
            # http://a@b?@c => user:a host:c path:/?@c

            # v0.12 TODO(isaacs): This is not quite how Chrome does things.
            # Review our test case against browsers more comprehensively.

            # find the first instance of any hostEndingChars
            host_end = -1
            for ending in HOST_ENDING_CHARS:
                hec = rest.find(ending)
                if hec != -1 and (host_end == -1 or hec < host_end):
                    host_end = hec

            # at this point, either we have an explicit point where the
            # auth portion cannot go past, or the last @ char is the decider.
            if host_end == -1:
                # atSign can be anywhere.
                at_sign = rest.rfind("@")
            else:
                # atSign must be in auth portion.
                # http://a@b/c@d => host:b auth:a path:/c@d
                at_sign = rest.rfind("@", 0, host_end + 1)

            # Now we have a portion which is definitely the auth.
            # Pull that off.
            if at_sign != -1:
                self.auth = rest[:at_sign]
                rest = rest[at_sign + 1 :]

            # the host is the remaining to the left of the first non-host char
            host_end = -1
            for non_host in NON_HOST_CHARS:
                hec = rest.find(non_host)
                if hec != -1 and (host_end == -1 or hec < host_end):
                    host_end = hec
            # if we still have not hit it, then the entire thing is a host.
            if host_end == -1:
                host_end = len(rest)

            # a trailing colon is treated as part of the path, not the host
            if host_end > 0 and rest[host_end - 1] == ":":
                host_end -= 1
            host = rest[:host_end]
            rest = rest[host_end:]

            # pull out port.
            self.parse_host(host)

            # we've indicated that there is a hostname,
            # so even if it's empty, it has to be present.
            self.hostname = self.hostname or ""

            # if hostname begins with [ and ends with ]
            # assume that it's an IPv6 address.
            ipv6_hostname = self.hostname.startswith("[") and self.hostname.endswith(
                "]"
            )

            # validate a little.
            if not ipv6_hostname:
                hostparts = self.hostname.split(".")
                for i, part in enumerate(hostparts):
                    if not part:
                        continue
                    if HOSTNAME_PART_PATTERN.search(part):
                        continue

                    # we replace non-ASCII char with a temporary placeholder
                    # we need this to make sure size of hostname is not
                    # broken by replacing non-ASCII by nothing
                    newpart = "".join("x" if ord(ch) > 127 else ch for ch in part)

                    # we test again with ASCII char only
                    if not HOSTNAME_PART_PATTERN.search(newpart):
                        valid_parts = hostparts[:i]
                        not_host = hostparts[i + 1 :]
                        bit = HOSTNAME_PART_START.search(part)
                        if bit:
                            valid_parts.append(bit.group(1))
                            not_host.insert(0, bit.group(2))
                        if not_host:
                            # everything after the valid prefix goes back
                            # in front of the unparsed remainder
                            rest = ".".join(not_host) + rest
                        self.hostname = ".".join(valid_parts)
                        break

            if len(self.hostname) > HOSTNAME_MAX_LEN:
                self.hostname = ""

            # strip [ and ] from the hostname
            # the host field still retains them, though
            if ipv6_hostname:
                self.hostname = self.hostname[1:-1]

        # chop off from the tail first.
        hash_at = rest.find("#")
        if hash_at != -1:
            # got a fragment string.
            self.hash = rest[hash_at:]
            rest = rest[:hash_at]
        qm = rest.find("?")
        if qm != -1:
            self.search = rest[qm:]
            rest = rest[:qm]
        if rest:
            self.pathname = rest
        if (
            SLASHED_PROTOCOL.get(lower_proto, False)
            and self.hostname
            and not self.pathname
        ):
            self.pathname = ""

        return self

    def parse_host(self, host: str) -> None:
        """Split a trailing ``:port`` off *host* and store both parts.

        ``self.port`` is set only when digits follow the final colon; a bare
        trailing ``:`` is removed from the host without setting a port.
        ``self.hostname`` is set only when the remaining host is non-empty.
        """
        port_match = PORT_PATTERN.search(host)
        if port_match:
            port = port_match.group()
            if port != ":":
                self.port = port[1:]
            host = host[: -len(port)]
        if host:
            self.hostname = host