From 9143c0beda96a82579dc6ca87cba5ba74b907dea Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 4 Mar 2021 16:04:39 +0100 Subject: Merging upstream version 2.1.0. Signed-off-by: Daniel Baumann --- .pre-commit-config.yaml | 29 ++++++++++----- README.md | 6 +-- bin/vendor-licenses | 12 ++---- identify/cli.py | 8 ++-- identify/extensions.py | 6 +-- identify/identify.py | 89 ++++++++++++++++++++++++++------------------- identify/interpreters.py | 4 -- identify/vendor/licenses.py | 3 -- setup.cfg | 27 ++++++++++---- setup.py | 4 -- tests/cli_test.py | 4 -- tests/extensions_test.py | 4 -- tests/identify_test.py | 38 +++++++++++++++---- tox.ini | 2 +- 14 files changed, 132 insertions(+), 104 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c3381ff..a1dc351 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,22 +2,24 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v3.4.0 hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - id: check-docstring-first - - id: check-merge-conflict - id: check-yaml - id: debug-statements - id: double-quote-string-fixer + - id: end-of-file-fixer - id: name-tests-test - - id: check-added-large-files - - id: check-byte-order-marker - - id: fix-encoding-pragma + - id: requirements-txt-fixer + - id: trailing-whitespace +- repo: https://github.com/asottile/setup-cfg-fmt + rev: v1.16.0 + hooks: + - id: setup-cfg-fmt - repo: https://gitlab.com/pycqa/flake8 rev: 3.8.4 hooks: - id: flake8 exclude: ^identify/vendor/licenses\.py$ + additional_dependencies: [flake8-typing-imports==1.10.1] - repo: https://github.com/pre-commit/mirrors-autopep8 rev: v1.5.4 hooks: @@ -26,11 +28,18 @@ repos: rev: v2.4.0 hooks: - id: reorder-python-imports - args: [ - '--add-import', 'from __future__ import absolute_import', - '--add-import', 'from __future__ import unicode_literals', - ] + args: [--py3-plus] - repo: https://github.com/asottile/add-trailing-comma rev: v2.1.0 hooks: - id: add-trailing-comma + args: [--py36-plus] +- repo: https://github.com/asottile/pyupgrade + rev: v2.10.0 + hooks: + - id: pyupgrade + args: [--py36-plus] +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.812 + hooks: + - id: mypy diff --git a/README.md b/README.md index accf29b..788b7c4 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ If you have an actual file on disk, you can get the most information possible When using a file on disk, the checks performed are: -* File type (file, symlink, directory) +* File type (file, symlink, directory, socket) * Mode (is it executable?) * File name (mostly based on extension) * If executable, the shebang is read and the interpreter interpreted @@ -76,11 +76,11 @@ optional arguments: --filename-only ``` -```bash +```console $ identify-cli setup.py; echo $? ["file", "non-executable", "python", "text"] 0 -identify setup.py --filename-only; echo $? +$ identify setup.py --filename-only; echo $? ["python", "text"] 0 $ identify-cli wat.wat; echo $? diff --git a/bin/vendor-licenses b/bin/vendor-licenses index 83c0c4e..2dbde36 100755 --- a/bin/vendor-licenses +++ b/bin/vendor-licenses @@ -1,19 +1,15 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- """Usage: ./bin/vendor-licenses > identify/vendor/licenses.py """ -from __future__ import absolute_import -from __future__ import unicode_literals - import argparse import os.path import subprocess import tempfile -def main(): +def main() -> int: parser = argparse.ArgumentParser() parser.add_argument('--revision', default='HEAD') args = parser.parse_args() @@ -45,18 +41,16 @@ def main(): licenses.append((spdx, license_text)) - print('# -*- coding: utf-8 -*-') - print('from __future__ import absolute_import') - print('from __future__ import unicode_literals') print('LICENSES = (') for spdx, text in sorted(licenses): print(' (') - print(' {!r},'.format(spdx)) + print(f' {spdx!r},') print(" '''\\") print(text.replace('\t', ' ').replace(' \n', '').strip()) print("''',") print(' ),') print(')') + return 0 if __name__ == '__main__': diff --git a/identify/cli.py b/identify/cli.py index 511caf6..28e6155 100644 --- a/identify/cli.py +++ b/identify/cli.py @@ -1,14 +1,12 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - import argparse import json +from typing import Optional +from typing import Sequence from identify import identify -def main(argv=None): +def main(argv: Optional[Sequence[str]] = None) -> int: parser = argparse.ArgumentParser() parser.add_argument('--filename-only', action='store_true') parser.add_argument('path') diff --git a/identify/extensions.py b/identify/extensions.py index 62a7f5b..778b695 100644 --- a/identify/extensions.py +++ b/identify/extensions.py @@ -1,13 +1,9 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - - EXTENSIONS = { 'adoc': {'text', 'asciidoc'}, 'asciidoc': {'text', 'asciidoc'}, 'apinotes': {'text', 'apinotes'}, 'asar': {'binary', 'asar'}, + 'avif': {'binary', 'image', 'avif'}, 'bash': {'text', 'shell', 'bash'}, 'bat': {'text', 'batch'}, 'bib': {'text', 'bib'}, diff --git a/identify/identify.py b/identify/identify.py index 1c0e677..51c1288 100644 --- a/identify/identify.py +++ b/identify/identify.py @@ -1,14 +1,14 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import division -from __future__ import unicode_literals - -import io import os.path import re import shlex +import stat import string import sys +from typing import IO +from typing import List +from typing import Optional +from typing import Set +from typing import Tuple from identify import extensions from identify import interpreters @@ -19,27 +19,37 @@ printable = frozenset(string.printable) DIRECTORY = 'directory' SYMLINK = 'symlink' +SOCKET = 'socket' FILE = 'file' EXECUTABLE = 'executable' NON_EXECUTABLE = 'non-executable' TEXT = 'text' BINARY = 'binary' -ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY} -ALL_TAGS.update(*extensions.EXTENSIONS.values()) -ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) -ALL_TAGS.update(*extensions.NAMES.values()) -ALL_TAGS.update(*interpreters.INTERPRETERS.values()) -ALL_TAGS = frozenset(ALL_TAGS) +TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET)) +MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE)) +ENCODING_TAGS = frozenset((BINARY, TEXT)) +_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS} +_ALL_TAGS.update(*extensions.EXTENSIONS.values()) +_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values()) +_ALL_TAGS.update(*extensions.NAMES.values()) +_ALL_TAGS.update(*interpreters.INTERPRETERS.values()) +ALL_TAGS = frozenset(_ALL_TAGS) -def tags_from_path(path): - if not os.path.lexists(path): - raise ValueError('{} does not exist.'.format(path)) - if os.path.isdir(path): +def tags_from_path(path: str) -> Set[str]: + try: + sr = os.lstat(path) + except (OSError, ValueError): # same error-handling as `os.lexists()` + raise ValueError(f'{path} does not exist.') + + mode = sr.st_mode + if stat.S_ISDIR(mode): return {DIRECTORY} - if os.path.islink(path): + if stat.S_ISLNK(mode): return {SYMLINK} + if stat.S_ISSOCK(mode): + return {SOCKET} tags = {FILE} @@ -62,19 +72,19 @@ def tags_from_path(path): # some extensions can be both binary and text # see EXTENSIONS_NEED_BINARY_CHECK - if not {TEXT, BINARY} & tags: + if not ENCODING_TAGS & tags: if file_is_text(path): tags.add(TEXT) else: tags.add(BINARY) - assert {TEXT, BINARY} & tags, tags - assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags + assert ENCODING_TAGS & tags, tags + assert MODE_TAGS & tags, tags return tags -def tags_from_filename(filename): - _, filename = os.path.split(filename) +def tags_from_filename(path: str) -> Set[str]: + _, filename = os.path.split(path) _, ext = os.path.splitext(filename) ret = set() @@ -95,7 +105,7 @@ def tags_from_filename(filename): return ret -def tags_from_interpreter(interpreter): +def tags_from_interpreter(interpreter: str) -> Set[str]: _, _, interpreter = interpreter.rpartition('/') # Try "python3.5.2" => "python3.5" => "python3" until one matches. @@ -108,7 +118,7 @@ def tags_from_interpreter(interpreter): return set() -def is_text(bytesio): +def is_text(bytesio: IO[bytes]) -> bool: """Return whether the first KB of contents seems to be binary. This is roughly based on libmagic's binary/text detection: @@ -122,14 +132,14 @@ def is_text(bytesio): return not bool(bytesio.read(1024).translate(None, text_chars)) -def file_is_text(path): +def file_is_text(path: str) -> bool: if not os.path.lexists(path): - raise ValueError('{} does not exist.'.format(path)) + raise ValueError(f'{path} does not exist.') with open(path, 'rb') as f: return is_text(f) -def _shebang_split(line): +def _shebang_split(line: str) -> List[str]: try: # shebangs aren't supposed to be quoted, though some tools such as # setuptools will write them with quotes so we'll best-guess parse @@ -141,11 +151,14 @@ def _shebang_split(line): return line.split() -def _parse_nix_shebang(bytesio, cmd): +def _parse_nix_shebang( + bytesio: IO[bytes], + cmd: Tuple[str, ...], +) -> Tuple[str, ...]: while bytesio.read(2) == b'#!': - next_line = bytesio.readline() + next_line_b = bytesio.readline() try: - next_line = next_line.decode('UTF-8') + next_line = next_line_b.decode('UTF-8') except UnicodeDecodeError: return cmd @@ -162,13 +175,13 @@ def _parse_nix_shebang(bytesio, cmd): return cmd -def parse_shebang(bytesio): +def parse_shebang(bytesio: IO[bytes]) -> Tuple[str, ...]: """Parse the shebang from a file opened for reading binary.""" if bytesio.read(2) != b'#!': return () - first_line = bytesio.readline() + first_line_b = bytesio.readline() try: - first_line = first_line.decode('UTF-8') + first_line = first_line_b.decode('UTF-8') except UnicodeDecodeError: return () @@ -185,10 +198,10 @@ def parse_shebang(bytesio): return cmd -def parse_shebang_from_file(path): +def parse_shebang_from_file(path: str) -> Tuple[str, ...]: """Parse the shebang given a file path.""" if not os.path.lexists(path): - raise ValueError('{} does not exist.'.format(path)) + raise ValueError(f'{path} does not exist.') if not os.access(path, os.X_OK): return () @@ -200,13 +213,13 @@ COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE) WS_RE = re.compile(r'\s+') -def _norm_license(s): +def _norm_license(s: str) -> str: s = COPYRIGHT_RE.sub('', s) s = WS_RE.sub(' ', s) return s.strip() -def license_id(filename): +def license_id(filename: str) -> Optional[str]: """Return the spdx id for the license contained in `filename`. If no license is detected, returns `None`. @@ -222,7 +235,7 @@ def license_id(filename): """ import editdistance # `pip install identify[license]` - with io.open(filename, encoding='UTF-8') as f: + with open(filename, encoding='UTF-8') as f: contents = f.read() norm = _norm_license(contents) diff --git a/identify/interpreters.py b/identify/interpreters.py index 7feb4b1..dabf36c 100644 --- a/identify/interpreters.py +++ b/identify/interpreters.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - INTERPRETERS = { 'ash': {'shell', 'ash'}, 'awk': {'awk'}, diff --git a/identify/vendor/licenses.py b/identify/vendor/licenses.py index 912b5c8..3478d0f 100644 --- a/identify/vendor/licenses.py +++ b/identify/vendor/licenses.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals LICENSES = ( ( '0BSD', diff --git a/setup.cfg b/setup.cfg index c5dce83..4aafd64 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = identify -version = 1.5.14 +version = 2.1.0 description = File identification library for Python long_description = file: README.md long_description_content_type = text/markdown @@ -11,26 +11,26 @@ license = MIT license_file = LICENSE classifiers = License :: OSI Approved :: MIT License - Programming Language :: Python :: 2 - Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 - Programming Language :: Python :: 3.4 - Programming Language :: Python :: 3.5 + Programming Language :: Python :: 3 :: Only Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy [options] packages = find: -python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.* +python_requires = >=3.6.1 [options.entry_points] console_scripts = identify-cli=identify.cli:main [options.extras_require] -license = editdistance +license = + editdistance [options.packages.find] exclude = @@ -42,3 +42,16 @@ universal = True [coverage:run] plugins = covdefaults + +[mypy] +check_untyped_defs = true +disallow_any_generics = true +disallow_incomplete_defs = true +disallow_untyped_defs = true +no_implicit_optional = true + +[mypy-testing.*] +disallow_untyped_defs = false + +[mypy-tests.*] +disallow_untyped_defs = false diff --git a/setup.py b/setup.py index acf1ad4..8bf1ba9 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,2 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - from setuptools import setup setup() diff --git a/tests/cli_test.py b/tests/cli_test.py index 9369a5e..94fb8ae 100644 --- a/tests/cli_test.py +++ b/tests/cli_test.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - from identify import cli diff --git a/tests/extensions_test.py b/tests/extensions_test.py index 4527a58..c2a828c 100644 --- a/tests/extensions_test.py +++ b/tests/extensions_test.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - import pytest from identify import extensions diff --git a/tests/identify_test.py b/tests/identify_test.py index 8e00f60..8cc5856 100644 --- a/tests/identify_test.py +++ b/tests/identify_test.py @@ -1,10 +1,8 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import -from __future__ import unicode_literals - import io import os +import socket import stat +from tempfile import TemporaryDirectory import pytest @@ -14,6 +12,21 @@ from identify import identify def test_all_tags_includes_basic_ones(): assert 'file' in identify.ALL_TAGS assert 'directory' in identify.ALL_TAGS + assert 'executable' in identify.ALL_TAGS + assert 'text' in identify.ALL_TAGS + assert 'socket' in identify.ALL_TAGS + + +@pytest.mark.parametrize( + 'tag_group', + ( + identify.TYPE_TAGS, + identify.MODE_TAGS, + identify.ENCODING_TAGS, + ), +) +def test_all_tags_contains_all_groups(tag_group): + assert tag_group < identify.ALL_TAGS def test_all_tags_contains_each_type(): @@ -41,6 +54,17 @@ def test_tags_from_path_symlink(tmpdir): assert identify.tags_from_path(x.strpath) == {'symlink'} +def test_tags_from_path_socket(): + tmproot = '/tmp' # short path avoids `OSError: AF_UNIX path too long` + with TemporaryDirectory(dir=tmproot) as tmpdir: + socket_path = os.path.join(tmpdir, 'socket') + with socket.socket(socket.AF_UNIX) as sock: + sock.bind(socket_path) + tags = identify.tags_from_path(socket_path) + + assert tags == {'socket'} + + def test_tags_from_path_broken_symlink(tmpdir): x = tmpdir.join('foo') x.mksymlinkto(tmpdir.join('lol')) @@ -177,9 +201,9 @@ def test_tags_from_interpreter(interpreter, expected): ( (b'hello world', True), (b'', True), - ('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode('utf8'), True), - (r'¯\_(ツ)_/¯'.encode('utf8'), True), - ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode('utf8'), True), + ('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode(), True), + (r'¯\_(ツ)_/¯'.encode(), True), + ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode(), True), ('éóñå'.encode('latin1'), True), (b'hello world\x00', False), diff --git a/tox.ini b/tox.ini index 2876390..a063c94 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27,py35,py36,pypy,pre-commit +envlist = py36,pypy3,pre-commit [testenv] deps = -rrequirements-dev.txt -- cgit v1.2.3