summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- .pre-commit-config.yaml 29
-rw-r--r-- README.md 6
-rwxr-xr-x bin/vendor-licenses 12
-rw-r--r-- identify/cli.py 8
-rw-r--r-- identify/extensions.py 6
-rw-r--r-- identify/identify.py 89
-rw-r--r-- identify/interpreters.py 4
-rw-r--r-- identify/vendor/licenses.py 3
-rw-r--r-- setup.cfg 27
-rw-r--r-- setup.py 4
-rw-r--r-- tests/cli_test.py 4
-rw-r--r-- tests/extensions_test.py 4
-rw-r--r-- tests/identify_test.py 38
-rw-r--r-- tox.ini 2
14 files changed, 132 insertions, 104 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c3381ff..a1dc351 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,22 +2,24 @@ repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
hooks:
- - id: trailing-whitespace
- - id: end-of-file-fixer
- id: check-docstring-first
- - id: check-merge-conflict
- id: check-yaml
- id: debug-statements
- id: double-quote-string-fixer
+ - id: end-of-file-fixer
- id: name-tests-test
- - id: check-added-large-files
- - id: check-byte-order-marker
- - id: fix-encoding-pragma
+ - id: requirements-txt-fixer
+ - id: trailing-whitespace
+- repo: https://github.com/asottile/setup-cfg-fmt
+ rev: v1.16.0
+ hooks:
+ - id: setup-cfg-fmt
- repo: https://gitlab.com/pycqa/flake8
rev: 3.8.4
hooks:
- id: flake8
exclude: ^identify/vendor/licenses\.py$
+ additional_dependencies: [flake8-typing-imports==1.10.1]
- repo: https://github.com/pre-commit/mirrors-autopep8
rev: v1.5.4
hooks:
@@ -26,11 +28,18 @@ repos:
rev: v2.4.0
hooks:
- id: reorder-python-imports
- args: [
- '--add-import', 'from __future__ import absolute_import',
- '--add-import', 'from __future__ import unicode_literals',
- ]
+ args: [--py3-plus]
- repo: https://github.com/asottile/add-trailing-comma
rev: v2.1.0
hooks:
- id: add-trailing-comma
+ args: [--py36-plus]
+- repo: https://github.com/asottile/pyupgrade
+ rev: v2.10.0
+ hooks:
+ - id: pyupgrade
+ args: [--py36-plus]
+- repo: https://github.com/pre-commit/mirrors-mypy
+ rev: v0.812
+ hooks:
+ - id: mypy
diff --git a/README.md b/README.md
index accf29b..788b7c4 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ If you have an actual file on disk, you can get the most information possible
When using a file on disk, the checks performed are:
-* File type (file, symlink, directory)
+* File type (file, symlink, directory, socket)
* Mode (is it executable?)
* File name (mostly based on extension)
* If executable, the shebang is read and the interpreter interpreted
@@ -76,11 +76,11 @@ optional arguments:
--filename-only
```
-```bash
+```console
$ identify-cli setup.py; echo $?
["file", "non-executable", "python", "text"]
0
-identify setup.py --filename-only; echo $?
+$ identify setup.py --filename-only; echo $?
["python", "text"]
0
$ identify-cli wat.wat; echo $?
diff --git a/bin/vendor-licenses b/bin/vendor-licenses
index 83c0c4e..2dbde36 100755
--- a/bin/vendor-licenses
+++ b/bin/vendor-licenses
@@ -1,19 +1,15 @@
#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
"""Usage:
./bin/vendor-licenses > identify/vendor/licenses.py
"""
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
import argparse
import os.path
import subprocess
import tempfile
-def main():
+def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--revision', default='HEAD')
args = parser.parse_args()
@@ -45,18 +41,16 @@ def main():
licenses.append((spdx, license_text))
- print('# -*- coding: utf-8 -*-')
- print('from __future__ import absolute_import')
- print('from __future__ import unicode_literals')
print('LICENSES = (')
for spdx, text in sorted(licenses):
print(' (')
- print(' {!r},'.format(spdx))
+ print(f' {spdx!r},')
print(" '''\\")
print(text.replace('\t', ' ').replace(' \n', '').strip())
print("''',")
print(' ),')
print(')')
+ return 0
if __name__ == '__main__':
diff --git a/identify/cli.py b/identify/cli.py
index 511caf6..28e6155 100644
--- a/identify/cli.py
+++ b/identify/cli.py
@@ -1,14 +1,12 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
import argparse
import json
+from typing import Optional
+from typing import Sequence
from identify import identify
-def main(argv=None):
+def main(argv: Optional[Sequence[str]] = None) -> int:
parser = argparse.ArgumentParser()
parser.add_argument('--filename-only', action='store_true')
parser.add_argument('path')
diff --git a/identify/extensions.py b/identify/extensions.py
index 62a7f5b..778b695 100644
--- a/identify/extensions.py
+++ b/identify/extensions.py
@@ -1,13 +1,9 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-
EXTENSIONS = {
'adoc': {'text', 'asciidoc'},
'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
+ 'avif': {'binary', 'image', 'avif'},
'bash': {'text', 'shell', 'bash'},
'bat': {'text', 'batch'},
'bib': {'text', 'bib'},
diff --git a/identify/identify.py b/identify/identify.py
index 1c0e677..51c1288 100644
--- a/identify/identify.py
+++ b/identify/identify.py
@@ -1,14 +1,14 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import unicode_literals
-
-import io
import os.path
import re
import shlex
+import stat
import string
import sys
+from typing import IO
+from typing import List
+from typing import Optional
+from typing import Set
+from typing import Tuple
from identify import extensions
from identify import interpreters
@@ -19,27 +19,37 @@ printable = frozenset(string.printable)
DIRECTORY = 'directory'
SYMLINK = 'symlink'
+SOCKET = 'socket'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'
-ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
-ALL_TAGS.update(*extensions.EXTENSIONS.values())
-ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
-ALL_TAGS.update(*extensions.NAMES.values())
-ALL_TAGS.update(*interpreters.INTERPRETERS.values())
-ALL_TAGS = frozenset(ALL_TAGS)
+TYPE_TAGS = frozenset((DIRECTORY, FILE, SYMLINK, SOCKET))
+MODE_TAGS = frozenset((EXECUTABLE, NON_EXECUTABLE))
+ENCODING_TAGS = frozenset((BINARY, TEXT))
+_ALL_TAGS = {*TYPE_TAGS, *MODE_TAGS, *ENCODING_TAGS}
+_ALL_TAGS.update(*extensions.EXTENSIONS.values())
+_ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
+_ALL_TAGS.update(*extensions.NAMES.values())
+_ALL_TAGS.update(*interpreters.INTERPRETERS.values())
+ALL_TAGS = frozenset(_ALL_TAGS)
-def tags_from_path(path):
- if not os.path.lexists(path):
- raise ValueError('{} does not exist.'.format(path))
- if os.path.isdir(path):
+def tags_from_path(path: str) -> Set[str]:
+ try:
+ sr = os.lstat(path)
+ except (OSError, ValueError): # same error-handling as `os.lexists()`
+ raise ValueError(f'{path} does not exist.')
+
+ mode = sr.st_mode
+ if stat.S_ISDIR(mode):
return {DIRECTORY}
- if os.path.islink(path):
+ if stat.S_ISLNK(mode):
return {SYMLINK}
+ if stat.S_ISSOCK(mode):
+ return {SOCKET}
tags = {FILE}
@@ -62,19 +72,19 @@ def tags_from_path(path):
# some extensions can be both binary and text
# see EXTENSIONS_NEED_BINARY_CHECK
- if not {TEXT, BINARY} & tags:
+ if not ENCODING_TAGS & tags:
if file_is_text(path):
tags.add(TEXT)
else:
tags.add(BINARY)
- assert {TEXT, BINARY} & tags, tags
- assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
+ assert ENCODING_TAGS & tags, tags
+ assert MODE_TAGS & tags, tags
return tags
-def tags_from_filename(filename):
- _, filename = os.path.split(filename)
+def tags_from_filename(path: str) -> Set[str]:
+ _, filename = os.path.split(path)
_, ext = os.path.splitext(filename)
ret = set()
@@ -95,7 +105,7 @@ def tags_from_filename(filename):
return ret
-def tags_from_interpreter(interpreter):
+def tags_from_interpreter(interpreter: str) -> Set[str]:
_, _, interpreter = interpreter.rpartition('/')
# Try "python3.5.2" => "python3.5" => "python3" until one matches.
@@ -108,7 +118,7 @@ def tags_from_interpreter(interpreter):
return set()
-def is_text(bytesio):
+def is_text(bytesio: IO[bytes]) -> bool:
"""Return whether the first KB of contents seems to be binary.
This is roughly based on libmagic's binary/text detection:
@@ -122,14 +132,14 @@ def is_text(bytesio):
return not bool(bytesio.read(1024).translate(None, text_chars))
-def file_is_text(path):
+def file_is_text(path: str) -> bool:
if not os.path.lexists(path):
- raise ValueError('{} does not exist.'.format(path))
+ raise ValueError(f'{path} does not exist.')
with open(path, 'rb') as f:
return is_text(f)
-def _shebang_split(line):
+def _shebang_split(line: str) -> List[str]:
try:
# shebangs aren't supposed to be quoted, though some tools such as
# setuptools will write them with quotes so we'll best-guess parse
@@ -141,11 +151,14 @@ def _shebang_split(line):
return line.split()
-def _parse_nix_shebang(bytesio, cmd):
+def _parse_nix_shebang(
+ bytesio: IO[bytes],
+ cmd: Tuple[str, ...],
+) -> Tuple[str, ...]:
while bytesio.read(2) == b'#!':
- next_line = bytesio.readline()
+ next_line_b = bytesio.readline()
try:
- next_line = next_line.decode('UTF-8')
+ next_line = next_line_b.decode('UTF-8')
except UnicodeDecodeError:
return cmd
@@ -162,13 +175,13 @@ def _parse_nix_shebang(bytesio, cmd):
return cmd
-def parse_shebang(bytesio):
+def parse_shebang(bytesio: IO[bytes]) -> Tuple[str, ...]:
"""Parse the shebang from a file opened for reading binary."""
if bytesio.read(2) != b'#!':
return ()
- first_line = bytesio.readline()
+ first_line_b = bytesio.readline()
try:
- first_line = first_line.decode('UTF-8')
+ first_line = first_line_b.decode('UTF-8')
except UnicodeDecodeError:
return ()
@@ -185,10 +198,10 @@ def parse_shebang(bytesio):
return cmd
-def parse_shebang_from_file(path):
+def parse_shebang_from_file(path: str) -> Tuple[str, ...]:
"""Parse the shebang given a file path."""
if not os.path.lexists(path):
- raise ValueError('{} does not exist.'.format(path))
+ raise ValueError(f'{path} does not exist.')
if not os.access(path, os.X_OK):
return ()
@@ -200,13 +213,13 @@ COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')
-def _norm_license(s):
+def _norm_license(s: str) -> str:
s = COPYRIGHT_RE.sub('', s)
s = WS_RE.sub(' ', s)
return s.strip()
-def license_id(filename):
+def license_id(filename: str) -> Optional[str]:
"""Return the spdx id for the license contained in `filename`. If no
license is detected, returns `None`.
@@ -222,7 +235,7 @@ def license_id(filename):
"""
import editdistance # `pip install identify[license]`
- with io.open(filename, encoding='UTF-8') as f:
+ with open(filename, encoding='UTF-8') as f:
contents = f.read()
norm = _norm_license(contents)
diff --git a/identify/interpreters.py b/identify/interpreters.py
index 7feb4b1..dabf36c 100644
--- a/identify/interpreters.py
+++ b/identify/interpreters.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
INTERPRETERS = {
'ash': {'shell', 'ash'},
'awk': {'awk'},
diff --git a/identify/vendor/licenses.py b/identify/vendor/licenses.py
index 912b5c8..3478d0f 100644
--- a/identify/vendor/licenses.py
+++ b/identify/vendor/licenses.py
@@ -1,6 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
LICENSES = (
(
'0BSD',
diff --git a/setup.cfg b/setup.cfg
index c5dce83..4aafd64 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = identify
-version = 1.5.14
+version = 2.1.0
description = File identification library for Python
long_description = file: README.md
long_description_content_type = text/markdown
@@ -11,26 +11,26 @@ license = MIT
license_file = LICENSE
classifiers =
License :: OSI Approved :: MIT License
- Programming Language :: Python :: 2
- Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
- Programming Language :: Python :: 3.4
- Programming Language :: Python :: 3.5
+ Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
+ Programming Language :: Python :: 3.8
+ Programming Language :: Python :: 3.9
Programming Language :: Python :: Implementation :: CPython
Programming Language :: Python :: Implementation :: PyPy
[options]
packages = find:
-python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*
+python_requires = >=3.6.1
[options.entry_points]
console_scripts =
identify-cli=identify.cli:main
[options.extras_require]
-license = editdistance
+license =
+ editdistance
[options.packages.find]
exclude =
@@ -42,3 +42,16 @@ universal = True
[coverage:run]
plugins = covdefaults
+
+[mypy]
+check_untyped_defs = true
+disallow_any_generics = true
+disallow_incomplete_defs = true
+disallow_untyped_defs = true
+no_implicit_optional = true
+
+[mypy-testing.*]
+disallow_untyped_defs = false
+
+[mypy-tests.*]
+disallow_untyped_defs = false
diff --git a/setup.py b/setup.py
index acf1ad4..8bf1ba9 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,2 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
from setuptools import setup
setup()
diff --git a/tests/cli_test.py b/tests/cli_test.py
index 9369a5e..94fb8ae 100644
--- a/tests/cli_test.py
+++ b/tests/cli_test.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
from identify import cli
diff --git a/tests/extensions_test.py b/tests/extensions_test.py
index 4527a58..c2a828c 100644
--- a/tests/extensions_test.py
+++ b/tests/extensions_test.py
@@ -1,7 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
import pytest
from identify import extensions
diff --git a/tests/identify_test.py b/tests/identify_test.py
index 8e00f60..8cc5856 100644
--- a/tests/identify_test.py
+++ b/tests/identify_test.py
@@ -1,10 +1,8 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
import io
import os
+import socket
import stat
+from tempfile import TemporaryDirectory
import pytest
@@ -14,6 +12,21 @@ from identify import identify
def test_all_tags_includes_basic_ones():
assert 'file' in identify.ALL_TAGS
assert 'directory' in identify.ALL_TAGS
+ assert 'executable' in identify.ALL_TAGS
+ assert 'text' in identify.ALL_TAGS
+ assert 'socket' in identify.ALL_TAGS
+
+
+@pytest.mark.parametrize(
+ 'tag_group',
+ (
+ identify.TYPE_TAGS,
+ identify.MODE_TAGS,
+ identify.ENCODING_TAGS,
+ ),
+)
+def test_all_tags_contains_all_groups(tag_group):
+ assert tag_group < identify.ALL_TAGS
def test_all_tags_contains_each_type():
@@ -41,6 +54,17 @@ def test_tags_from_path_symlink(tmpdir):
assert identify.tags_from_path(x.strpath) == {'symlink'}
+def test_tags_from_path_socket():
+ tmproot = '/tmp' # short path avoids `OSError: AF_UNIX path too long`
+ with TemporaryDirectory(dir=tmproot) as tmpdir:
+ socket_path = os.path.join(tmpdir, 'socket')
+ with socket.socket(socket.AF_UNIX) as sock:
+ sock.bind(socket_path)
+ tags = identify.tags_from_path(socket_path)
+
+ assert tags == {'socket'}
+
+
def test_tags_from_path_broken_symlink(tmpdir):
x = tmpdir.join('foo')
x.mksymlinkto(tmpdir.join('lol'))
@@ -177,9 +201,9 @@ def test_tags_from_interpreter(interpreter, expected):
(
(b'hello world', True),
(b'', True),
- ('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode('utf8'), True),
- (r'¯\_(ツ)_/¯'.encode('utf8'), True),
- ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode('utf8'), True),
+ ('éóñəå ⊂(◉‿◉)つ(ノ≥∇≤)ノ'.encode(), True),
+ (r'¯\_(ツ)_/¯'.encode(), True),
+ ('♪┏(・o・)┛♪┗ ( ・o・) ┓♪┏ ( ) ┛♪┗ (・o・ ) ┓♪'.encode(), True),
('éóñå'.encode('latin1'), True),
(b'hello world\x00', False),
diff --git a/tox.ini b/tox.ini
index 2876390..a063c94 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
[tox]
-envlist = py27,py35,py36,pypy,pre-commit
+envlist = py36,pypy3,pre-commit
[testenv]
deps = -rrequirements-dev.txt