diff options
-rw-r--r-- | .bumpversion.cfg | 13 | ||||
-rw-r--r-- | .flake8 | 5 | ||||
-rw-r--r-- | .github/workflows/tests.yaml | 90 | ||||
-rw-r--r-- | .gitignore | 135 | ||||
-rw-r--r-- | .pre-commit-config.yaml | 52 | ||||
-rw-r--r-- | LICENSE | 46 | ||||
-rw-r--r-- | README.md | 8 | ||||
-rw-r--r-- | pyproject.toml | 99 | ||||
-rw-r--r-- | src/mdurl/__init__.py | 18 | ||||
-rw-r--r-- | src/mdurl/_decode.py | 104 | ||||
-rw-r--r-- | src/mdurl/_encode.py | 85 | ||||
-rw-r--r-- | src/mdurl/_format.py | 27 | ||||
-rw-r--r-- | src/mdurl/_parse.py | 304 | ||||
-rw-r--r-- | src/mdurl/_url.py | 14 | ||||
-rw-r--r-- | src/mdurl/py.typed | 1 | ||||
-rw-r--r-- | tests/__init__.py | 0 | ||||
-rw-r--r-- | tests/decode.js | 123 | ||||
-rw-r--r-- | tests/fixtures/__init__.py | 0 | ||||
-rw-r--r-- | tests/fixtures/url.py | 610 | ||||
-rw-r--r-- | tests/requirements.txt | 3 | ||||
-rw-r--r-- | tests/test_decode.py | 5 | ||||
-rw-r--r-- | tests/test_encode.py | 50 | ||||
-rw-r--r-- | tests/test_format.py | 10 | ||||
-rw-r--r-- | tests/test_parse.py | 26 |
24 files changed, 1828 insertions, 0 deletions
diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..058dbaa --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,13 @@ +[bumpversion] +commit = True +tag = True +tag_name = {new_version} +current_version = 0.1.2 + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT +replace = version = "{new_version}" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT + +[bumpversion:file:src/mdurl/__init__.py] +search = __version__ = "{current_version}" +replace = __version__ = "{new_version}" @@ -0,0 +1,5 @@ +[flake8] +max-line-length = 88 +# These checks violate PEP8 so let's ignore them +extend-ignore = E203 +extend-exclude = */site-packages/* diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 0000000..d88b88e --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,90 @@ +name: Tests + +on: + push: + branches: [ master ] + tags: [ '[0-9]+.[0-9]+.[0-9]+*' ] + pull_request: + branches: [ master ] + +jobs: + + linters: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Install pre-commit + run: | + pip install pre-commit + + - name: run linters + # pre-commit also runs in pre-commit.ci, but let's have it here too + # to block `pypi-publish` job from triggering if pre-commit fails + run: | + pre-commit run -a + + tests: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: ['pypy-3.7', '3.7', '3.8', '3.9', '3.10', '3.11-dev'] + os: [ubuntu-latest, macos-latest, windows-latest] + continue-on-error: ${{ matrix.python-version == '3.11-dev' }} + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install test deps + run: | + pip install . -r tests/requirements.txt + + - name: Test with pytest + run: | + # TODO: bump coverage % up to 100 + pytest --cov --cov-fail-under=75 + + - name: Report coverage + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' + uses: codecov/codecov-action@v1 + + allgood: + runs-on: ubuntu-latest + needs: + - tests + - linters + steps: + - run: echo "Great success!" + + pypi-publish: + # Only publish if all other jobs succeed + needs: [ allgood ] + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + - name: Install build and publish tools + run: | + pip install build twine + - name: Build and check + run: | + rm -rf dist/ && python -m build + twine check --strict dist/* + - name: Publish + run: | + twine upload dist/* + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18bffe1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,135 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# IntelliJ +.idea/ + +# VS Code +.vscode/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..db92e4f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,52 @@ +repos: +- repo: https://github.com/executablebooks/mdformat + rev: b8c05ae822d53326e967da45367d0408afc56a81 # frozen: 0.7.14 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-gfm +- repo: https://github.com/asottile/yesqa + rev: 265e9ff7c83add4949f81bb5fe14f4a743ffb04c # frozen: v1.4.0 + hooks: + - id: yesqa + additional_dependencies: + - flake8-bugbear + - flake8-builtins + - flake8-comprehensions +- repo: https://github.com/PyCQA/isort + rev: c5e8fa75dda5f764d20f66a215d71c21cfa198e1 # frozen: 5.10.1 + hooks: + - id: isort +- repo: https://github.com/psf/black + rev: f6c139c5215ce04fd3e73a900f1372942d58eca0 # frozen: 22.6.0 + hooks: + - id: black +- repo: https://github.com/myint/docformatter + rev: 67919ee01837761f2d954d7fbb08c12cdd38ec5a # frozen: v1.4 + hooks: + - id: docformatter +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: 3298ddab3c13dd77d6ce1fc0baf97691430d84b0 # frozen: v4.3.0 + hooks: + - id: check-yaml +- repo: https://github.com/pre-commit/pygrep-hooks + rev: 6f51a66bba59954917140ec2eeeaa4d5e630e6ce # frozen: v1.9.0 + hooks: + - id: python-use-type-annotations + - id: python-check-blanket-noqa + - id: python-check-blanket-type-ignore +- repo: https://github.com/PyCQA/flake8 + rev: f8e1b317742036ff11ff86356fd2b68147e169f7 # frozen: 5.0.4 + hooks: + - id: flake8 + additional_dependencies: + - flake8-bugbear + - flake8-builtins + - flake8-comprehensions +- repo: https://github.com/pre-commit/mirrors-mypy + rev: fde4bb992b03943ecb94207a52739ba07957bd06 # frozen: v0.971 + hooks: + - id: mypy + args: ["--scripts-are-modules"] + additional_dependencies: + - pytest @@ -0,0 +1,46 @@ +Copyright (c) 2015 Vitaly Puzrin, Alex Kocharin. +Copyright (c) 2021 Taneli Hukkinen + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +.parse() is based on Joyent's node.js `url` code: + +Copyright Joyent, Inc. and other Node contributors. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..55b1d2e --- /dev/null +++ b/README.md @@ -0,0 +1,8 @@ +# mdurl + +[![Build Status](https://github.com/executablebooks/mdurl/workflows/Tests/badge.svg?branch=master)](https://github.com/executablebooks/mdurl/actions?query=workflow%3ATests+branch%3Amaster+event%3Apush) +[![codecov.io](https://codecov.io/gh/executablebooks/mdurl/branch/master/graph/badge.svg)](https://codecov.io/gh/executablebooks/mdurl) +[![PyPI version](https://img.shields.io/pypi/v/mdurl)](https://pypi.org/project/mdurl) + +This is a Python port of the JavaScript [mdurl](https://www.npmjs.com/package/mdurl) package. +See the [upstream README.md file](https://github.com/markdown-it/mdurl/blob/master/README.md) for API documentation. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..916a588 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,99 @@ +[build-system] +requires = ["flit_core>=3.2.0,<4"] +build-backend = "flit_core.buildapi" + +[project] +name = "mdurl" +version = "0.1.2" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT +description = "Markdown URL utilities" +authors = [ + { name = "Taneli Hukkinen", email = "hukkin@users.noreply.github.com" }, +] +license = { file = "LICENSE" } +requires-python = ">=3.7" +readme = "README.md" +classifiers = [ + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Software Development :: Libraries :: Python Modules", + "Typing :: Typed", +] +keywords = ["markdown", "commonmark"] + +[project.urls] +"Homepage" = "https://github.com/executablebooks/mdurl" + + +[tool.isort] +# Force imports to be sorted by module, independent of import type +force_sort_within_sections = true +# Group first party and local folder imports together +no_lines_before = ["LOCALFOLDER"] + +# Configure isort to work without access to site-packages +known_first_party = ["mdurl", "tests"] + +# Settings for Black compatibility +profile = "black" + + +[tool.pytest.ini_options] +addopts = "--strict-markers --strict-config" +xfail_strict = true + + +[tool.tox] +legacy_tox_ini = ''' +[tox] +# Only run pytest envs when no args given to tox +envlist = py{37,38,39,310} +isolated_build = True + +[testenv:py{37,38,39,310}] +description = run tests +deps = -r tests/requirements.txt +commands = + pytest {posargs} +''' + + +[tool.coverage.run] +source = ["mdurl"] + +[tool.coverage.report] +# Regexes for lines to exclude from consideration +exclude_lines = [ + # Have to re-enable the standard pragma + "pragma: no cover", + # Code for static type checkers + "if TYPE_CHECKING:", + # Scripts + 'if __name__ == .__main__.:', +] + + +[tool.mypy] +show_error_codes = true +warn_unreachable = true +warn_unused_ignores = true +warn_redundant_casts = true +warn_unused_configs = true +# Disabling incremental mode is required for `warn_unused_configs = true` to work +incremental = false +disallow_untyped_defs = true +check_untyped_defs = true +strict_equality = true +implicit_reexport = false +no_implicit_optional = true +overrides = [ + { module = "tests.*", disallow_untyped_defs = false }, +] diff --git a/src/mdurl/__init__.py b/src/mdurl/__init__.py new file mode 100644 index 0000000..cdbb640 --- /dev/null +++ b/src/mdurl/__init__.py @@ -0,0 +1,18 @@ +__all__ = ( + "decode", + "DECODE_DEFAULT_CHARS", + "DECODE_COMPONENT_CHARS", + "encode", + "ENCODE_DEFAULT_CHARS", + "ENCODE_COMPONENT_CHARS", + "format", + "parse", + "URL", +) +__version__ = "0.1.2" # DO NOT EDIT THIS LINE MANUALLY. LET bump2version UTILITY DO IT + +from mdurl._decode import DECODE_COMPONENT_CHARS, DECODE_DEFAULT_CHARS, decode +from mdurl._encode import ENCODE_COMPONENT_CHARS, ENCODE_DEFAULT_CHARS, encode +from mdurl._format import format +from mdurl._parse import url_parse as parse +from mdurl._url import URL diff --git a/src/mdurl/_decode.py b/src/mdurl/_decode.py new file mode 100644 index 0000000..9b50a2d --- /dev/null +++ b/src/mdurl/_decode.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from collections.abc import Sequence +import functools +import re + +DECODE_DEFAULT_CHARS = ";/?:@&=+$,#" +DECODE_COMPONENT_CHARS = "" + +decode_cache: dict[str, list[str]] = {} + + +def get_decode_cache(exclude: str) -> Sequence[str]: + if exclude in decode_cache: + return decode_cache[exclude] + + cache: list[str] = [] + decode_cache[exclude] = cache + + for i in range(128): + ch = chr(i) + cache.append(ch) + + for i in range(len(exclude)): + ch_code = ord(exclude[i]) + cache[ch_code] = "%" + ("0" + hex(ch_code)[2:].upper())[-2:] + + return cache + + +# Decode percent-encoded string. +# +def decode(string: str, exclude: str = DECODE_DEFAULT_CHARS) -> str: + cache = get_decode_cache(exclude) + repl_func = functools.partial(repl_func_with_cache, cache=cache) + return re.sub(r"(%[a-f0-9]{2})+", repl_func, string, flags=re.IGNORECASE) + + +def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str: + seq = match.group() + result = "" + + i = 0 + l = len(seq) # noqa: E741 + while i < l: + b1 = int(seq[i + 1 : i + 3], 16) + + if b1 < 0x80: + result += cache[b1] + i += 3 # emulate JS for loop statement3 + continue + + if (b1 & 0xE0) == 0xC0 and (i + 3 < l): + # 110xxxxx 10xxxxxx + b2 = int(seq[i + 4 : i + 6], 16) + + if (b2 & 0xC0) == 0x80: + all_bytes = bytes((b1, b2)) + try: + result += all_bytes.decode() + except UnicodeDecodeError: + result += "\ufffd" * 2 + + i += 3 + i += 3 # emulate JS for loop statement3 + continue + + if (b1 & 0xF0) == 0xE0 and (i + 6 < l): + # 1110xxxx 10xxxxxx 10xxxxxx + b2 = int(seq[i + 4 : i + 6], 16) + b3 = int(seq[i + 7 : i + 9], 16) + + if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80: + all_bytes = bytes((b1, b2, b3)) + try: + result += all_bytes.decode() + except UnicodeDecodeError: + result += "\ufffd" * 3 + + i += 6 + i += 3 # emulate JS for loop statement3 + continue + + if (b1 & 0xF8) == 0xF0 and (i + 9 < l): + # 111110xx 10xxxxxx 10xxxxxx 10xxxxxx + b2 = int(seq[i + 4 : i + 6], 16) + b3 = int(seq[i + 7 : i + 9], 16) + b4 = int(seq[i + 10 : i + 12], 16) + + if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80 and (b4 & 0xC0) == 0x80: + all_bytes = bytes((b1, b2, b3, b4)) + try: + result += all_bytes.decode() + except UnicodeDecodeError: + result += "\ufffd" * 4 + + i += 9 + i += 3 # emulate JS for loop statement3 + continue + + result += "\ufffd" + i += 3 # emulate JS for loop statement3 + + return result diff --git a/src/mdurl/_encode.py b/src/mdurl/_encode.py new file mode 100644 index 0000000..bc2e5b9 --- /dev/null +++ b/src/mdurl/_encode.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from collections.abc import Sequence +from string import ascii_letters, digits, hexdigits +from urllib.parse import quote as encode_uri_component + +ASCII_LETTERS_AND_DIGITS = ascii_letters + digits + +ENCODE_DEFAULT_CHARS = ";/?:@&=+$,-_.!~*'()#" +ENCODE_COMPONENT_CHARS = "-_.!~*'()" + +encode_cache: dict[str, list[str]] = {} + + +# Create a lookup array where anything but characters in `chars` string +# and alphanumeric chars is percent-encoded. +def get_encode_cache(exclude: str) -> Sequence[str]: + if exclude in encode_cache: + return encode_cache[exclude] + + cache: list[str] = [] + encode_cache[exclude] = cache + + for i in range(128): + ch = chr(i) + + if ch in ASCII_LETTERS_AND_DIGITS: + # always allow unencoded alphanumeric characters + cache.append(ch) + else: + cache.append("%" + ("0" + hex(i)[2:].upper())[-2:]) + + for i in range(len(exclude)): + cache[ord(exclude[i])] = exclude[i] + + return cache + + +# Encode unsafe characters with percent-encoding, skipping already +# encoded sequences. +# +# - string - string to encode +# - exclude - list of characters to ignore (in addition to a-zA-Z0-9) +# - keepEscaped - don't encode '%' in a correct escape sequence (default: true) +def encode( + string: str, exclude: str = ENCODE_DEFAULT_CHARS, *, keep_escaped: bool = True +) -> str: + result = "" + + cache = get_encode_cache(exclude) + + l = len(string) # noqa: E741 + i = 0 + while i < l: + code = ord(string[i]) + + # % + if keep_escaped and code == 0x25 and i + 2 < l: + if all(c in hexdigits for c in string[i + 1 : i + 3]): + result += string[i : i + 3] + i += 2 + i += 1 # JS for loop statement3 + continue + + if code < 128: + result += cache[code] + i += 1 # JS for loop statement3 + continue + + if code >= 0xD800 and code <= 0xDFFF: + if code >= 0xD800 and code <= 0xDBFF and i + 1 < l: + next_code = ord(string[i + 1]) + if next_code >= 0xDC00 and next_code <= 0xDFFF: + result += encode_uri_component(string[i] + string[i + 1]) + i += 1 + i += 1 # JS for loop statement3 + continue + result += "%EF%BF%BD" + i += 1 # JS for loop statement3 + continue + + result += encode_uri_component(string[i]) + i += 1 # JS for loop statement3 + + return result diff --git a/src/mdurl/_format.py b/src/mdurl/_format.py new file mode 100644 index 0000000..12524ca --- /dev/null +++ b/src/mdurl/_format.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from mdurl._url import URL + + +def format(url: URL) -> str: # noqa: A001 + result = "" + + result += url.protocol or "" + result += "//" if url.slashes else "" + result += url.auth + "@" if url.auth else "" + + if url.hostname and ":" in url.hostname: + # ipv6 address + result += "[" + url.hostname + "]" + else: + result += url.hostname or "" + + result += ":" + url.port if url.port else "" + result += url.pathname or "" + result += url.search or "" + result += url.hash or "" + + return result diff --git a/src/mdurl/_parse.py b/src/mdurl/_parse.py new file mode 100644 index 0000000..ffeeac7 --- /dev/null +++ b/src/mdurl/_parse.py @@ -0,0 +1,304 @@ +# Copyright Joyent, Inc. and other Node contributors. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the +# following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN +# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +# USE OR OTHER DEALINGS IN THE SOFTWARE. + + +# Changes from joyent/node: +# +# 1. No leading slash in paths, +# e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/` +# +# 2. Backslashes are not replaced with slashes, +# so `http:\\example.org\` is treated like a relative path +# +# 3. Trailing colon is treated like a part of the path, +# i.e. in `http://example.org:foo` pathname is `:foo` +# +# 4. Nothing is URL-encoded in the resulting object, +# (in joyent/node some chars in auth and paths are encoded) +# +# 5. `url.parse()` does not have `parseQueryString` argument +# +# 6. Removed extraneous result properties: `host`, `path`, `query`, etc., +# which can be constructed using other parts of the url. + +from __future__ import annotations + +from collections import defaultdict +import re + +from mdurl._url import URL + +# Reference: RFC 3986, RFC 1808, RFC 2396 + +# define these here so at least they only have to be +# compiled once on the first module load. +PROTOCOL_PATTERN = re.compile(r"^([a-z0-9.+-]+:)", flags=re.IGNORECASE) +PORT_PATTERN = re.compile(r":[0-9]*$") + +# Special case for a simple path URL +SIMPLE_PATH_PATTERN = re.compile(r"^(//?(?!/)[^?\s]*)(\?[^\s]*)?$") + +# RFC 2396: characters reserved for delimiting URLs. +# We actually just auto-escape these. +DELIMS = ("<", ">", '"', "`", " ", "\r", "\n", "\t") + +# RFC 2396: characters not allowed for various reasons. +UNWISE = ("{", "}", "|", "\\", "^", "`") + DELIMS + +# Allowed by RFCs, but cause of XSS attacks. Always escape these. +AUTO_ESCAPE = ("'",) + UNWISE +# Characters that are never ever allowed in a hostname. +# Note that any invalid chars are also handled, but these +# are the ones that are *expected* to be seen, so we fast-path +# them. +NON_HOST_CHARS = ("%", "/", "?", ";", "#") + AUTO_ESCAPE +HOST_ENDING_CHARS = ("/", "?", "#") +HOSTNAME_MAX_LEN = 255 +HOSTNAME_PART_PATTERN = re.compile(r"^[+a-z0-9A-Z_-]{0,63}$") +HOSTNAME_PART_START = re.compile(r"^([+a-z0-9A-Z_-]{0,63})(.*)$") +# protocols that can allow "unsafe" and "unwise" chars. + +# protocols that never have a hostname. +HOSTLESS_PROTOCOL = defaultdict( + bool, + { + "javascript": True, + "javascript:": True, + }, +) +# protocols that always contain a // bit. +SLASHED_PROTOCOL = defaultdict( + bool, + { + "http": True, + "https": True, + "ftp": True, + "gopher": True, + "file": True, + "http:": True, + "https:": True, + "ftp:": True, + "gopher:": True, + "file:": True, + }, +) + + +class MutableURL: + def __init__(self) -> None: + self.protocol: str | None = None + self.slashes: bool = False + self.auth: str | None = None + self.port: str | None = None + self.hostname: str | None = None + self.hash: str | None = None + self.search: str | None = None + self.pathname: str | None = None + + def parse(self, url: str, slashes_denote_host: bool) -> "MutableURL": + lower_proto = "" + slashes = False + rest = url + + # trim before proceeding. + # This is to support parse stuff like " http://foo.com \n" + rest = rest.strip() + + if not slashes_denote_host and len(url.split("#")) == 1: + # Try fast path regexp + simple_path = SIMPLE_PATH_PATTERN.match(rest) + if simple_path: + self.pathname = simple_path.group(1) + if simple_path.group(2): + self.search = simple_path.group(2) + return self + + proto = "" + proto_match = PROTOCOL_PATTERN.match(rest) + if proto_match: + proto = proto_match.group() + lower_proto = proto.lower() + self.protocol = proto + rest = rest[len(proto) :] + + # figure out if it's got a host + # user@server is *always* interpreted as a hostname, and url + # resolution will treat //foo/bar as host=foo,path=bar because that's + # how the browser resolves relative URLs. + if slashes_denote_host or proto or re.search(r"^//[^@/]+@[^@/]+", rest): + slashes = rest.startswith("//") + if slashes and not (proto and HOSTLESS_PROTOCOL[proto]): + rest = rest[2:] + self.slashes = True + + if not HOSTLESS_PROTOCOL[proto] and ( + slashes or (proto and not SLASHED_PROTOCOL[proto]) + ): + + # there's a hostname. + # the first instance of /, ?, ;, or # ends the host. + # + # If there is an @ in the hostname, then non-host chars *are* allowed + # to the left of the last @ sign, unless some host-ending character + # comes *before* the @-sign. + # URLs are obnoxious. + # + # ex: + # http://a@b@c/ => user:a@b host:c + # http://a@b?@c => user:a host:c path:/?@c + + # v0.12 TODO(isaacs): This is not quite how Chrome does things. + # Review our test case against browsers more comprehensively. + + # find the first instance of any hostEndingChars + host_end = -1 + for i in range(len(HOST_ENDING_CHARS)): + hec = rest.find(HOST_ENDING_CHARS[i]) + if hec != -1 and (host_end == -1 or hec < host_end): + host_end = hec + + # at this point, either we have an explicit point where the + # auth portion cannot go past, or the last @ char is the decider. + if host_end == -1: + # atSign can be anywhere. + at_sign = rest.rfind("@") + else: + # atSign must be in auth portion. + # http://a@b/c@d => host:b auth:a path:/c@d + at_sign = rest.rfind("@", 0, host_end + 1) + + # Now we have a portion which is definitely the auth. + # Pull that off. + if at_sign != -1: + auth = rest[:at_sign] + rest = rest[at_sign + 1 :] + self.auth = auth + + # the host is the remaining to the left of the first non-host char + host_end = -1 + for i in range(len(NON_HOST_CHARS)): + hec = rest.find(NON_HOST_CHARS[i]) + if hec != -1 and (host_end == -1 or hec < host_end): + host_end = hec + # if we still have not hit it, then the entire thing is a host. + if host_end == -1: + host_end = len(rest) + + if host_end > 0 and rest[host_end - 1] == ":": + host_end -= 1 + host = rest[:host_end] + rest = rest[host_end:] + + # pull out port. + self.parse_host(host) + + # we've indicated that there is a hostname, + # so even if it's empty, it has to be present. + self.hostname = self.hostname or "" + + # if hostname begins with [ and ends with ] + # assume that it's an IPv6 address. + ipv6_hostname = self.hostname.startswith("[") and self.hostname.endswith( + "]" + ) + + # validate a little. + if not ipv6_hostname: + hostparts = self.hostname.split(".") + l = len(hostparts) # noqa: E741 + i = 0 + while i < l: + part = hostparts[i] + if not part: + i += 1 # emulate statement3 in JS for loop + continue + if not HOSTNAME_PART_PATTERN.search(part): + newpart = "" + k = len(part) + j = 0 + while j < k: + if ord(part[j]) > 127: + # we replace non-ASCII char with a temporary placeholder + # we need this to make sure size of hostname is not + # broken by replacing non-ASCII by nothing + newpart += "x" + else: + newpart += part[j] + j += 1 # emulate statement3 in JS for loop + + # we test again with ASCII char only + if not HOSTNAME_PART_PATTERN.search(newpart): + valid_parts = hostparts[:i] + not_host = hostparts[i + 1 :] + bit = HOSTNAME_PART_START.search(part) + if bit: + valid_parts.append(bit.group(1)) + not_host.insert(0, bit.group(2)) + if not_host: + rest = ".".join(not_host) + rest + self.hostname = ".".join(valid_parts) + break + i += 1 # emulate statement3 in JS for loop + + if len(self.hostname) > HOSTNAME_MAX_LEN: + self.hostname = "" + + # strip [ and ] from the hostname + # the host field still retains them, though + if ipv6_hostname: + self.hostname = self.hostname[1:-1] + + # chop off from the tail first. + hash = rest.find("#") # noqa: A001 + if hash != -1: + # got a fragment string. + self.hash = rest[hash:] + rest = rest[:hash] + qm = rest.find("?") + if qm != -1: + self.search = rest[qm:] + rest = rest[:qm] + if rest: + self.pathname = rest + if SLASHED_PROTOCOL[lower_proto] and self.hostname and not self.pathname: + self.pathname = "" + + return self + + def parse_host(self, host: str) -> None: + port_match = PORT_PATTERN.search(host) + if port_match: + port = port_match.group() + if port != ":": + self.port = port[1:] + host = host[: -len(port)] + if host: + self.hostname = host + + +def url_parse(url: URL | str, *, slashes_denote_host: bool = False) -> URL: + if isinstance(url, URL): + return url + u = MutableURL() + u.parse(url, slashes_denote_host) + return URL( + u.protocol, u.slashes, u.auth, u.port, u.hostname, u.hash, u.search, u.pathname + ) diff --git a/src/mdurl/_url.py b/src/mdurl/_url.py new file mode 100644 index 0000000..f866e7a --- /dev/null +++ b/src/mdurl/_url.py @@ -0,0 +1,14 @@ +from __future__ import annotations + +from typing import NamedTuple + + +class URL(NamedTuple): + protocol: str | None + slashes: bool + auth: str | None + port: str | None + hostname: str | None + hash: str | None # noqa: A003 + search: str | None + pathname: str | None diff --git a/src/mdurl/py.typed b/src/mdurl/py.typed new file mode 100644 index 0000000..7632ecf --- /dev/null +++ b/src/mdurl/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561 diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/__init__.py diff --git a/tests/decode.js b/tests/decode.js new file mode 100644 index 0000000..c9457ba --- /dev/null +++ b/tests/decode.js @@ -0,0 +1,123 @@ +// TODO: port to Python +'use strict'; + + +var assert = require('assert'); +var decode = require('../decode'); + +function encodeBinary(str) { + var result = ''; + + str = str.replace(/\s+/g, ''); + while (str.length) { + result = '%' + ('0' + parseInt(str.slice(-8), 2).toString(16)).slice(-2) + result; + str = str.slice(0, -8); + } + + return result; +} + +var samples = { + '00000000': true, + '01010101': true, + '01111111': true, + + // invalid as 1st byte + '10000000': true, + '10111111': true, + + // invalid sequences, 2nd byte should be >= 0x80 + '11000111 01010101': false, + '11100011 01010101': false, + '11110001 01010101': false, + + // invalid sequences, 2nd byte should be < 0xc0 + '11000111 11000000': false, + '11100011 11000000': false, + '11110001 11000000': false, + + // invalid 3rd byte + '11100011 10010101 01010101': false, + '11110001 10010101 01010101': false, + + // invalid 4th byte + '11110001 10010101 10010101 01010101': false, + + // valid sequences + '11000111 10101010': true, + '11100011 10101010 10101010': true, + '11110001 10101010 10101010 10101010': true, + + // minimal chars with given length + '11000010 10000000': true, + '11100000 10100000 10000000': true, + + // impossible sequences + '11000001 10111111': false, + '11100000 10011111 10111111': false, + '11000001 10000000': false, + '11100000 10010000 10000000': false, + + // maximum chars with given length + '11011111 10111111': true, + '11101111 10111111 10111111': true, + + '11110000 10010000 10000000 10000000': true, + '11110000 10010000 10001111 10001111': true, + '11110100 10001111 10110000 10000000': true, + '11110100 10001111 10111111 10111111': true, + + // too low + '11110000 10001111 10111111 10111111': false, + + // too high + '11110100 10010000 10000000 10000000': false, + '11110100 10011111 10111111 10111111': false, + + // surrogate range + '11101101 10011111 10111111': true, + '11101101 10100000 10000000': false, + '11101101 10111111 10111111': false, + '11101110 10000000 10000000': true +}; + +describe('decode', function() { + it('should decode %xx', function() { + assert.equal(decode('x%20xx%20%2520'), 'x xx %20'); + }); + + it('should not decode invalid sequences', function() { + assert.equal(decode('%2g%z1%%'), '%2g%z1%%'); + }); + + it('should not decode reservedSet', function() { + assert.equal(decode('%20%25%20', '%'), ' %25 '); + assert.equal(decode('%20%25%20', ' '), '%20%%20'); + assert.equal(decode('%20%25%20', ' %'), '%20%25%20'); + }); + + describe('utf8', function() { + Object.keys(samples).forEach(function(k) { + it(k, function() { + var res1, res2, + er = null, + str = encodeBinary(k); + + try { + res1 = decodeURIComponent(str); + } catch(e) { + er = e; + } + + res2 = decode(str); + + if (er) { + assert.notEqual(res2.indexOf('\ufffd'), -1); + } else { + assert.equal(res1, res2); + assert.equal(res2.indexOf('\ufffd'), -1); + } + }); + }); + }); +}); diff --git a/tests/fixtures/__init__.py b/tests/fixtures/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/fixtures/__init__.py diff --git a/tests/fixtures/url.py b/tests/fixtures/url.py new file mode 100644 index 0000000..29431ec --- /dev/null +++ b/tests/fixtures/url.py @@ -0,0 +1,610 @@ +# Copyright Joyent, Inc. and other Node contributors. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to the +# following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN +# NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +# USE OR OTHER DEALINGS IN THE SOFTWARE. + + +# URLs to parse, and expected data +# { url : parsed } +PARSED = { + "//some_path": {"pathname": "//some_path"}, + "HTTP://www.example.com/": { + "protocol": "HTTP:", + "slashes": True, + "hostname": "www.example.com", + "pathname": "/", + }, + "HTTP://www.example.com": { + "protocol": "HTTP:", + "slashes": True, + "hostname": "www.example.com", + "pathname": "", + }, + "http://www.ExAmPlE.com/": { + "protocol": "http:", + "slashes": True, + "hostname": "www.ExAmPlE.com", + "pathname": "/", + }, + "http://user:pw@www.ExAmPlE.com/": { + "protocol": "http:", + "slashes": True, + "auth": "user:pw", + "hostname": "www.ExAmPlE.com", + "pathname": "/", + }, + "http://USER:PW@www.ExAmPlE.com/": { + "protocol": "http:", + "slashes": True, + "auth": "USER:PW", + "hostname": "www.ExAmPlE.com", + "pathname": "/", + }, + "http://user@www.example.com/": { + "protocol": "http:", + "slashes": True, + "auth": "user", + "hostname": "www.example.com", + "pathname": "/", + }, + "http://user%3Apw@www.example.com/": { + "protocol": "http:", + "slashes": True, + "auth": "user%3Apw", + "hostname": "www.example.com", + "pathname": "/", + }, + "http://x.com/path?that's#all, folks": { + "protocol": "http:", + "hostname": "x.com", + "slashes": True, + "search": "?that's", + "pathname": "/path", + "hash": "#all, folks", + }, + "HTTP://X.COM/Y": { + "protocol": "HTTP:", + "slashes": True, + "hostname": "X.COM", + "pathname": "/Y", + }, + # + not an invalid host character + # per https://url.spec.whatwg.org/#host-parsing + "http://x.y.com+a/b/c": { + "protocol": "http:", + "slashes": True, + "hostname": "x.y.com+a", + "pathname": "/b/c", + }, + # an unexpected invalid char in the hostname. + "HtTp://x.y.cOm;a/b/c?d=e#f g<h>i": { + "protocol": "HtTp:", + "slashes": True, + "hostname": "x.y.cOm", + "pathname": ";a/b/c", + "search": "?d=e", + "hash": "#f g<h>i", + }, + # make sure that we don't accidentally lcast the path parts. + "HtTp://x.y.cOm;A/b/c?d=e#f g<h>i": { + "protocol": "HtTp:", + "slashes": True, + "hostname": "x.y.cOm", + "pathname": ";A/b/c", + "search": "?d=e", + "hash": "#f g<h>i", + }, + "http://x...y...#p": { + "protocol": "http:", + "slashes": True, + "hostname": "x...y...", + "hash": "#p", + "pathname": "", + }, + 'http://x/p/"quoted"': { + "protocol": "http:", + "slashes": True, + "hostname": "x", + "pathname": '/p/"quoted"', + }, + "<http://goo.corn/bread> Is a URL!": { + "pathname": "<http://goo.corn/bread> Is a URL!" + }, + "http://www.narwhaljs.org/blog/categories?id=news": { + "protocol": "http:", + "slashes": True, + "hostname": "www.narwhaljs.org", + "search": "?id=news", + "pathname": "/blog/categories", + }, + "http://mt0.google.com/vt/lyrs=m@114&hl=en&src=api&x=2&y=2&z=3&s=": { + "protocol": "http:", + "slashes": True, + "hostname": "mt0.google.com", + "pathname": "/vt/lyrs=m@114&hl=en&src=api&x=2&y=2&z=3&s=", + }, + "http://mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=": { + "protocol": "http:", + "slashes": True, + "hostname": "mt0.google.com", + "search": "???&hl=en&src=api&x=2&y=2&z=3&s=", + "pathname": "/vt/lyrs=m@114", + }, + "http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=": { + "protocol": "http:", + "slashes": True, + "auth": "user:pass", + "hostname": "mt0.google.com", + "search": "???&hl=en&src=api&x=2&y=2&z=3&s=", + "pathname": "/vt/lyrs=m@114", + }, + "file:///etc/passwd": { + "slashes": True, + "protocol": "file:", + "pathname": "/etc/passwd", + "hostname": "", + }, + "file://localhost/etc/passwd": { + "protocol": "file:", + "slashes": True, + "pathname": "/etc/passwd", + "hostname": "localhost", + }, + "file://foo/etc/passwd": { + "protocol": "file:", + "slashes": True, + "pathname": "/etc/passwd", + "hostname": "foo", + }, + "file:///etc/node/": { + "slashes": True, + "protocol": "file:", + "pathname": "/etc/node/", + "hostname": "", + }, + "file://localhost/etc/node/": { + "protocol": "file:", + "slashes": True, + "pathname": "/etc/node/", + "hostname": "localhost", + }, + "file://foo/etc/node/": { + "protocol": "file:", + "slashes": True, + "pathname": "/etc/node/", + "hostname": "foo", + }, + "http:/baz/../foo/bar": {"protocol": "http:", "pathname": "/baz/../foo/bar"}, + "http://user:pass@example.com:8000/foo/bar?baz=quux#frag": { + "protocol": "http:", + "slashes": True, + "auth": "user:pass", + "port": "8000", + "hostname": "example.com", + "hash": "#frag", + "search": "?baz=quux", + "pathname": "/foo/bar", + }, + "//user:pass@example.com:8000/foo/bar?baz=quux#frag": { + "slashes": True, + "auth": "user:pass", + "port": "8000", + "hostname": "example.com", + "hash": "#frag", + "search": "?baz=quux", + "pathname": "/foo/bar", + }, + "/foo/bar?baz=quux#frag": { + "hash": "#frag", + "search": "?baz=quux", + "pathname": "/foo/bar", + }, + "http:/foo/bar?baz=quux#frag": { + "protocol": "http:", + "hash": "#frag", + "search": "?baz=quux", + "pathname": "/foo/bar", + }, + "mailto:foo@bar.com?subject=hello": { + "protocol": "mailto:", + "auth": "foo", + "hostname": "bar.com", + "search": "?subject=hello", + }, + "javascript:alert('hello');": { + "protocol": "javascript:", + "pathname": "alert('hello');", + }, + "xmpp:isaacschlueter@jabber.org": { + "protocol": "xmpp:", + "auth": "isaacschlueter", + "hostname": "jabber.org", + }, + "http://atpass:foo%40bar@127.0.0.1:8080/path?search=foo#bar": { + "protocol": "http:", + "slashes": True, + "auth": "atpass:foo%40bar", + "hostname": "127.0.0.1", + "port": "8080", + "pathname": "/path", + "search": "?search=foo", + "hash": "#bar", + }, + "svn+ssh://foo/bar": { + "hostname": "foo", + "protocol": "svn+ssh:", + "pathname": "/bar", + "slashes": True, + }, + "dash-test://foo/bar": { + "hostname": "foo", + "protocol": "dash-test:", + "pathname": "/bar", + "slashes": True, + }, + "dash-test:foo/bar": { + "hostname": "foo", + "protocol": "dash-test:", + "pathname": "/bar", + }, + "dot.test://foo/bar": { + "hostname": "foo", + "protocol": "dot.test:", + "pathname": "/bar", + "slashes": True, + }, + "dot.test:foo/bar": { + "hostname": "foo", + "protocol": "dot.test:", + "pathname": "/bar", + }, + # IDNA tests + "http://www.日本語.com/": { + "protocol": "http:", + "slashes": True, + "hostname": "www.日本語.com", + "pathname": "/", + }, + "http://example.Bücher.com/": { + "protocol": "http:", + "slashes": True, + "hostname": "example.Bücher.com", + "pathname": "/", + }, + "http://www.Äffchen.com/": { + "protocol": "http:", + "slashes": True, + "hostname": "www.Äffchen.com", + "pathname": "/", + }, + "http://www.Äffchen.cOm;A/b/c?d=e#f g<h>i": { + "protocol": "http:", + "slashes": True, + "hostname": "www.Äffchen.cOm", + "pathname": ";A/b/c", + "search": "?d=e", + "hash": "#f g<h>i", + }, + "http://SÉLIER.COM/": { + "protocol": "http:", + "slashes": True, + "hostname": "SÉLIER.COM", + "pathname": "/", + }, + "http://ليهمابتكلموشعربي؟.ي؟/": { + "protocol": "http:", + "slashes": True, + "hostname": "ليهمابتكلموشعربي؟.ي؟", + "pathname": "/", + }, + "http://➡.ws/➡": { + "protocol": "http:", + "slashes": True, + "hostname": "➡.ws", + "pathname": "/➡", + }, + "http://bucket_name.s3.amazonaws.com/image.jpg": { + "protocol": "http:", + "slashes": True, + "hostname": "bucket_name.s3.amazonaws.com", + "pathname": "/image.jpg", + }, + "git+http://github.com/joyent/node.git": { + "protocol": "git+http:", + "slashes": True, + "hostname": "github.com", + "pathname": "/joyent/node.git", + }, + # if local1@domain1 is uses as a relative URL it may + # be parse into auth@hostname, but here there is no + # way to make it work in url.parse, I add the test to be explicit + "local1@domain1": {"pathname": "local1@domain1"}, + # While this may seem counter-intuitive, a browser will parse + # <a href='www.google.com'> as a path. + "www.example.com": {"pathname": "www.example.com"}, + # ipv6 support + "[fe80::1]": {"pathname": "[fe80::1]"}, + "coap://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]": { + "protocol": "coap:", + "slashes": True, + "hostname": "FEDC:BA98:7654:3210:FEDC:BA98:7654:3210", + }, + "coap://[1080:0:0:0:8:800:200C:417A]:61616/": { + "protocol": "coap:", + "slashes": True, + "port": "61616", + "hostname": "1080:0:0:0:8:800:200C:417A", + "pathname": "/", + }, + "http://user:password@[3ffe:2a00:100:7031::1]:8080": { + "protocol": "http:", + "slashes": True, + "auth": "user:password", + "port": "8080", + "hostname": "3ffe:2a00:100:7031::1", + "pathname": "", + }, + "coap://u:p@[::192.9.5.5]:61616/.well-known/r?n=Temperature": { + "protocol": "coap:", + "slashes": True, + "auth": "u:p", + "port": "61616", + "hostname": "::192.9.5.5", + "search": "?n=Temperature", + "pathname": "/.well-known/r", + }, + # empty port + "http://example.com:": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "pathname": ":", + }, + "http://example.com:/a/b.html": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "pathname": ":/a/b.html", + }, + "http://example.com:?a=b": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "search": "?a=b", + "pathname": ":", + }, + "http://example.com:#abc": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "hash": "#abc", + "pathname": ":", + }, + "http://[fe80::1]:/a/b?a=b#abc": { + "protocol": "http:", + "slashes": True, + "hostname": "fe80::1", + "search": "?a=b", + "hash": "#abc", + "pathname": ":/a/b", + }, + "http://-lovemonsterz.tumblr.com/rss": { + "protocol": "http:", + "slashes": True, + "hostname": "-lovemonsterz.tumblr.com", + "pathname": "/rss", + }, + "http://-lovemonsterz.tumblr.com:80/rss": { + "protocol": "http:", + "slashes": True, + "port": "80", + "hostname": "-lovemonsterz.tumblr.com", + "pathname": "/rss", + }, + "http://user:pass@-lovemonsterz.tumblr.com/rss": { + "protocol": "http:", + "slashes": True, + "auth": "user:pass", + "hostname": "-lovemonsterz.tumblr.com", + "pathname": "/rss", + }, + "http://user:pass@-lovemonsterz.tumblr.com:80/rss": { + "protocol": "http:", + "slashes": True, + "auth": "user:pass", + "port": "80", + "hostname": "-lovemonsterz.tumblr.com", + "pathname": "/rss", + }, + "http://_jabber._tcp.google.com/test": { + "protocol": "http:", + "slashes": True, + "hostname": "_jabber._tcp.google.com", + "pathname": "/test", + }, + "http://user:pass@_jabber._tcp.google.com/test": { + "protocol": "http:", + "slashes": True, + "auth": "user:pass", + "hostname": "_jabber._tcp.google.com", + "pathname": "/test", + }, + "http://_jabber._tcp.google.com:80/test": { + "protocol": "http:", + "slashes": True, + "port": "80", + "hostname": "_jabber._tcp.google.com", + "pathname": "/test", + }, + "http://user:pass@_jabber._tcp.google.com:80/test": { + "protocol": "http:", + "slashes": True, + "auth": "user:pass", + "port": "80", + "hostname": "_jabber._tcp.google.com", + "pathname": "/test", + }, + "http://x:1/' <>\"`/{}|\\^~`/": { + "protocol": "http:", + "slashes": True, + "port": "1", + "hostname": "x", + "pathname": "/' <>\"`/{}|\\^~`/", + }, + "http://a@b@c/": { + "protocol": "http:", + "slashes": True, + "auth": "a@b", + "hostname": "c", + "pathname": "/", + }, + "http://a@b?@c": { + "protocol": "http:", + "slashes": True, + "auth": "a", + "hostname": "b", + "pathname": "", + "search": "?@c", + }, + "http://a\r\" \t\n<'b:b@c\r\nd/e?f": { + "protocol": "http:", + "slashes": True, + "auth": "a\r\" \t\n<'b:b", + "hostname": "c", + "search": "?f", + "pathname": "\r\nd/e", + }, + # git urls used by npm + "git+ssh://git@github.com:npm/npm": { + "protocol": "git+ssh:", + "slashes": True, + "auth": "git", + "hostname": "github.com", + "pathname": ":npm/npm", + }, + "http://example.com?foo=bar#frag": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "hash": "#frag", + "search": "?foo=bar", + "pathname": "", + }, + "http://example.com?foo=@bar#frag": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "hash": "#frag", + "search": "?foo=@bar", + "pathname": "", + }, + "http://example.com?foo=/bar/#frag": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "hash": "#frag", + "search": "?foo=/bar/", + "pathname": "", + }, + "http://example.com?foo=?bar/#frag": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "hash": "#frag", + "search": "?foo=?bar/", + "pathname": "", + }, + "http://example.com#frag=?bar/#frag": { + "protocol": "http:", + "slashes": True, + "hostname": "example.com", + "hash": "#frag=?bar/#frag", + "pathname": "", + }, + 'http://google.com" onload="alert(42)/': { + "hostname": "google.com", + "protocol": "http:", + "slashes": True, + "pathname": '" onload="alert(42)/', + }, + "http://a.com/a/b/c?s#h": { + "protocol": "http:", + "slashes": True, + "pathname": "/a/b/c", + "hostname": "a.com", + "hash": "#h", + "search": "?s", + }, + "http://atpass:foo%40bar@127.0.0.1/": { + "auth": "atpass:foo%40bar", + "slashes": True, + "hostname": "127.0.0.1", + "protocol": "http:", + "pathname": "/", + }, + "http://atslash%2F%40:%2F%40@foo/": { + "auth": "atslash%2F%40:%2F%40", + "hostname": "foo", + "protocol": "http:", + "pathname": "/", + "slashes": True, + }, + # ipv6 support + "coap:u:p@[::1]:61616/.well-known/r?n=Temperature": { + "protocol": "coap:", + "auth": "u:p", + "hostname": "::1", + "port": "61616", + "pathname": "/.well-known/r", + "search": "?n=Temperature", + }, + "coap:[fedc:ba98:7654:3210:fedc:ba98:7654:3210]:61616/s/stopButton": { + "hostname": "fedc:ba98:7654:3210:fedc:ba98:7654:3210", + "port": "61616", + "protocol": "coap:", + "pathname": "/s/stopButton", + }, + # encode context-specific delimiters in path and query, but do not touch + # other non-delimiter chars like `%`. + # <https://github.com/joyent/node/issues/4082> + # `?` and `#` in path and search + "http://ex.com/foo%3F100%m%23r?abc=the%231?&foo=bar#frag": { + "protocol": "http:", + "hostname": "ex.com", + "hash": "#frag", + "search": "?abc=the%231?&foo=bar", + "pathname": "/foo%3F100%m%23r", + "slashes": True, + }, + # `?` and `#` in search only + "http://ex.com/fooA100%mBr?abc=the%231?&foo=bar#frag": { + "protocol": "http:", + "hostname": "ex.com", + "hash": "#frag", + "search": "?abc=the%231?&foo=bar", + "pathname": "/fooA100%mBr", + "slashes": True, + }, + # + "http://": { + "protocol": "http:", + "hostname": "", + "slashes": True, + }, +} diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 0000000..6f05550 --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,3 @@ +pytest +pytest-randomly +pytest-cov diff --git a/tests/test_decode.py b/tests/test_decode.py new file mode 100644 index 0000000..bc58ce0 --- /dev/null +++ b/tests/test_decode.py @@ -0,0 +1,5 @@ +from mdurl import decode + + +def test_decode_multi_byte(): + assert decode("https://host.invalid/%F0%9F%91%A9") == "https://host.invalid/👩" diff --git a/tests/test_encode.py b/tests/test_encode.py new file mode 100644 index 0000000..7414bac --- /dev/null +++ b/tests/test_encode.py @@ -0,0 +1,50 @@ +import pytest + +from mdurl import encode + + +@pytest.mark.parametrize( + "input_,expected", + [ + pytest.param("%%%", "%25%25%25", id="should encode percent"), + pytest.param("\r\n", "%0D%0A", id="should encode control chars"), + pytest.param("?#", "?#", id="should not encode parts of an url"), + pytest.param("[]^", "%5B%5D%5E", id="should not encode []^ - commonmark tests"), + pytest.param("my url", "my%20url", id="should encode spaces"), + pytest.param("φου", "%CF%86%CE%BF%CF%85", id="should encode unicode"), + pytest.param( + "%FG", "%25FG", id="should encode % if it doesn't start a valid escape seq" + ), + pytest.param( + "%00%FF", "%00%FF", id="should preserve non-utf8 encoded characters" + ), + pytest.param( + "\x00\x7F\x80", + "%00%7F%C2%80", + id="should encode characters on the cache borders", + ), # protects against off-by-one in cache implementation + ], +) +def test_encode(input_, expected): + assert encode(input_) == expected + + +def test_encode_arguments(): + assert encode("!@#$", exclude="@$") == "%21@%23$" + assert encode("%20%2G", keep_escaped=True) == "%20%252G" + assert encode("%20%2G", keep_escaped=False) == "%2520%252G" + assert encode("!@%25", exclude="@", keep_escaped=False) == "%21@%2525" + + +def test_encode_surrogates(): + # bad surrogates (high) + assert encode("\uD800foo") == "%EF%BF%BDfoo" + assert encode("foo\uD800") == "foo%EF%BF%BD" + + # bad surrogates (low) + assert encode("\uDD00foo") == "%EF%BF%BDfoo" + assert encode("foo\uDD00") == "foo%EF%BF%BD" + + # valid one + # (the codepoint is "D800 DD00" in UTF-16BE) + assert encode("𐄀") == "%F0%90%84%80" diff --git a/tests/test_format.py b/tests/test_format.py new file mode 100644 index 0000000..0cf1219 --- /dev/null +++ b/tests/test_format.py @@ -0,0 +1,10 @@ +import pytest + +from mdurl import format, parse +from tests.fixtures.url import PARSED as FIXTURES + + +@pytest.mark.parametrize("url", FIXTURES.keys()) +def test_format(url): + parsed = parse(url) + assert format(parsed) == url diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..aa4ae44 --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,26 @@ +import pytest + +from mdurl import parse +from tests.fixtures.url import PARSED as FIXTURES + + +def is_url_and_dict_equal(url, url_dict): + return ( + url.protocol == url_dict.get("protocol") + and url.slashes == url_dict.get("slashes", False) + and url.auth == url_dict.get("auth") + and url.port == url_dict.get("port") + and url.hostname == url_dict.get("hostname") + and url.hash == url_dict.get("hash") + and url.search == url_dict.get("search") + and url.pathname == url_dict.get("pathname") + ) + + +@pytest.mark.parametrize( + "url,expected_dict", + FIXTURES.items(), +) +def test_parse(url, expected_dict): + parsed = parse(url) + assert is_url_and_dict_equal(parsed, expected_dict) |