# Copyright Joyent, Inc. and other Node contributors.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the
# following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
# USE OR OTHER DEALINGS IN THE SOFTWARE.


# Changes from joyent/node:
#
# 1. No leading slash in paths,
#    e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
#
# 2. Backslashes are not replaced with slashes,
#    so `http:\\example.org\` is treated like a relative path
#
# 3. Trailing colon is treated like a part of the path,
#    i.e. in `http://example.org:foo` pathname is `:foo`
#
# 4. Nothing is URL-encoded in the resulting object,
#    (in joyent/node some chars in auth and paths are encoded)
#
# 5. `url.parse()` does not have `parseQueryString` argument
#
# 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
#    which can be constructed using other parts of the url.

from __future__ import annotations

from collections import defaultdict
import re

from mdurl._url import URL

# Reference: RFC 3986, RFC 1808, RFC 2396

# The patterns below are compiled at module level so that they only have
# to be compiled once, on the first module load.

# Leading "scheme:" of a URL (matched case-insensitively).
PROTOCOL_PATTERN = re.compile(r"^([a-z0-9.+-]+:)", re.IGNORECASE)

# Trailing ":<digits>" of a host string (the digits may be absent).
PORT_PATTERN = re.compile(r":[0-9]*$")

# Special case for a simple path URL (path plus optional query string).
SIMPLE_PATH_PATTERN = re.compile(r"^(//?(?!/)[^?\s]*)(\?[^\s]*)?$")

# RFC 2396: characters reserved for delimiting URLs.
# We actually just auto-escape these.
DELIMS = (
    "<",
    ">",
    '"',
    "`",
    " ",
    "\r",
    "\n",
    "\t",
)

# RFC 2396: characters not allowed for various reasons.
UNWISE = ("{", "}", "|", "\\", "^", "`", *DELIMS)

# Allowed by RFCs, but cause of XSS attacks. Always escape these.
AUTO_ESCAPE = ("'", *UNWISE)

# Characters that are never ever allowed in a hostname.
# Any invalid chars are also handled, but these are the ones that are
# *expected* to be seen, so they are fast-pathed.
NON_HOST_CHARS = ("%", "/", "?", ";", "#", *AUTO_ESCAPE)

# Characters that terminate the "auth@host" portion of a URL.
HOST_ENDING_CHARS = ("/", "?", "#")

# Upper bound on the length of a whole hostname.
HOSTNAME_MAX_LEN = 255

# A single dot-separated hostname label, and the same pattern split into
# "valid leading label" + "whatever follows".
HOSTNAME_PART_PATTERN = re.compile(r"^[+a-z0-9A-Z_-]{0,63}$")
HOSTNAME_PART_START = re.compile(r"^([+a-z0-9A-Z_-]{0,63})(.*)$")

# NOTE(review): the original carried a comment here about protocols that
# can allow "unsafe" and "unwise" chars, but no such table is defined in
# this file.

# protocols that never have a hostname.
HOSTLESS_PROTOCOL = defaultdict(
    bool,
    dict.fromkeys(("javascript", "javascript:"), True),
)

# protocols that always contain a // bit.
SLASHED_PROTOCOL = defaultdict(
    bool,
    {
        "http": True,
        "https": True,
        "ftp": True,
        "gopher": True,
        "file": True,
        "http:": True,
        "https:": True,
        "ftp:": True,
        "gopher:": True,
        "file:": True,
    },
)


def _first_index_of_any(text: str, chars: tuple[str, ...]) -> int:
    """Return the lowest index in ``text`` of any of ``chars``, or -1 if none occur."""
    positions = [pos for pos in map(text.find, chars) if pos != -1]
    return min(positions, default=-1)


class MutableURL:
    """Mutable holder for the components of a URL while it is being parsed.

    Every component starts as ``None`` (``slashes`` as ``False``) and is
    filled in by :meth:`parse`.
    """

    def __init__(self) -> None:
        self.protocol: str | None = None
        self.slashes: bool = False
        self.auth: str | None = None
        self.port: str | None = None
        self.hostname: str | None = None
        self.hash: str | None = None
        self.search: str | None = None
        self.pathname: str | None = None

    def parse(self, url: str, slashes_denote_host: bool) -> "MutableURL":
        """Parse ``url`` and store its components on this instance.

        Args:
            url: The URL string. Surrounding whitespace is stripped first.
            slashes_denote_host: If true, a leading ``//`` is checked for and
                consumed as a host marker even when no protocol is present,
                and the simple-path fast path is skipped.

        Returns:
            This instance, to allow chaining.
        """
        lower_proto = ""
        slashes = False

        # trim before proceeding.
        # This is to support parse stuff like "  http://foo.com  \n"
        rest = url.strip()

        if not slashes_denote_host and "#" not in url:
            # Try fast path regexp
            simple_path = SIMPLE_PATH_PATTERN.match(rest)
            if simple_path:
                self.pathname = simple_path.group(1)
                if simple_path.group(2):
                    self.search = simple_path.group(2)
                return self

        proto = ""
        proto_match = PROTOCOL_PATTERN.match(rest)
        if proto_match:
            proto = proto_match.group()
            lower_proto = proto.lower()
            self.protocol = proto
            rest = rest[len(proto) :]

        # The protocol tables are defaultdicts: indexing them with an unknown
        # key would insert that key, so every new scheme seen would grow the
        # shared module-level tables. `.get()` reads without mutating.
        # NOTE(review): these two lookups use the protocol exactly as written
        # in the input (not lower-cased), unlike the final pathname check at
        # the end of this method. Kept as-is to preserve behavior.
        hostless = HOSTLESS_PROTOCOL.get(proto, False)
        slashed = SLASHED_PROTOCOL.get(proto, False)

        # figure out if it's got a host
        # user@server is *always* interpreted as a hostname, and url
        # resolution will treat //foo/bar as host=foo,path=bar because that's
        # how the browser resolves relative URLs.
        if slashes_denote_host or proto or re.search(r"^//[^@/]+@[^@/]+", rest):
            slashes = rest.startswith("//")
            if slashes and not (proto and hostless):
                rest = rest[2:]
                self.slashes = True

        if not hostless and (slashes or (proto and not slashed)):
            # there's a hostname.
            # the first instance of /, ?, ;, or # ends the host.
            #
            # If there is an @ in the hostname, then non-host chars *are*
            # allowed to the left of the last @ sign, unless some host-ending
            # character comes *before* the @-sign.
            # URLs are obnoxious.
            #
            # ex:
            # http://a@b@c/ => user:a@b host:c
            # http://a@b?@c => user:a host:c path:/?@c

            # v0.12 TODO(isaacs): This is not quite how Chrome does things.
            # Review our test case against browsers more comprehensively.

            # find the first instance of any hostEndingChars
            host_end = _first_index_of_any(rest, HOST_ENDING_CHARS)

            # at this point, either we have an explicit point where the
            # auth portion cannot go past, or the last @ char is the decider.
            if host_end == -1:
                # atSign can be anywhere.
                at_sign = rest.rfind("@")
            else:
                # atSign must be in auth portion.
                # http://a@b/c@d => host:b auth:a path:/c@d
                at_sign = rest.rfind("@", 0, host_end + 1)

            # Now we have a portion which is definitely the auth.
            # Pull that off.
            if at_sign != -1:
                self.auth = rest[:at_sign]
                rest = rest[at_sign + 1 :]

            # the host is the remaining to the left of the first non-host char
            host_end = _first_index_of_any(rest, NON_HOST_CHARS)
            # if we still have not hit it, then the entire thing is a host.
            if host_end == -1:
                host_end = len(rest)

            # A colon directly before the host end is left to the remainder
            # rather than being treated as an (empty) port separator.
            if host_end > 0 and rest[host_end - 1] == ":":
                host_end -= 1
            host = rest[:host_end]
            rest = rest[host_end:]

            # pull out port.
            self.parse_host(host)

            # we've indicated that there is a hostname,
            # so even if it's empty, it has to be present.
            self.hostname = self.hostname or ""

            # if hostname begins with [ and ends with ]
            # assume that it's an IPv6 address.
            ipv6_hostname = self.hostname.startswith("[") and self.hostname.endswith(
                "]"
            )

            # validate a little.
            if not ipv6_hostname:
                hostparts = self.hostname.split(".")
                for i, part in enumerate(hostparts):
                    if not part or HOSTNAME_PART_PATTERN.search(part):
                        continue
                    # Replace each non-ASCII char with a placeholder (rather
                    # than dropping it) so the label length is preserved,
                    # then test again with ASCII chars only.
                    ascii_part = "".join("x" if ord(ch) > 127 else ch for ch in part)
                    if HOSTNAME_PART_PATTERN.search(ascii_part):
                        continue
                    # This label is invalid: the hostname ends at the valid
                    # prefix of this label, and everything after it is pushed
                    # back onto the remainder of the URL.
                    valid_parts = hostparts[:i]
                    not_host = hostparts[i + 1 :]
                    bit = HOSTNAME_PART_START.search(part)
                    if bit:
                        valid_parts.append(bit.group(1))
                        not_host.insert(0, bit.group(2))
                    if not_host:
                        rest = ".".join(not_host) + rest
                    self.hostname = ".".join(valid_parts)
                    break

            if len(self.hostname) > HOSTNAME_MAX_LEN:
                self.hostname = ""

            # strip [ and ] from the hostname
            # the host field still retains them, though
            if ipv6_hostname:
                self.hostname = self.hostname[1:-1]

        # chop off from the tail first.
        hash_idx = rest.find("#")
        if hash_idx != -1:
            # got a fragment string.
            self.hash = rest[hash_idx:]
            rest = rest[:hash_idx]
        query_idx = rest.find("?")
        if query_idx != -1:
            self.search = rest[query_idx:]
            rest = rest[:query_idx]
        if rest:
            self.pathname = rest
        if (
            SLASHED_PROTOCOL.get(lower_proto, False)
            and self.hostname
            and not self.pathname
        ):
            self.pathname = ""

        return self

    def parse_host(self, host: str) -> None:
        """Split ``host`` into hostname and port and store them on this instance.

        A trailing ``:<digits>`` is removed from ``host``; the digits are
        stored in ``self.port`` unless there are none (bare ``:``). The
        remaining text is stored in ``self.hostname`` if it is non-empty.
        """
        port_match = PORT_PATTERN.search(host)
        if port_match:
            port = port_match.group()
            if port != ":":
                self.port = port[1:]
            host = host[: -len(port)]
        if host:
            self.hostname = host


def url_parse(url: URL | str, *, slashes_denote_host: bool = False) -> URL:
    """Parse ``url`` into a ``URL``.

    Args:
        url: A URL string, or an already-parsed ``URL`` (returned unchanged).
        slashes_denote_host: Forwarded to :meth:`MutableURL.parse`.

    Returns:
        A ``URL`` built from the parsed protocol, slashes, auth, port,
        hostname, hash, search and pathname, in that order.
    """
    if isinstance(url, URL):
        return url
    u = MutableURL()
    u.parse(url, slashes_denote_host)
    return URL(
        u.protocol, u.slashes, u.auth, u.port, u.hostname, u.hash, u.search, u.pathname
    )