# -*- coding: utf-8 -*-
# Provenance (taken from the patch header this module was delivered in):
#   commit:  4bbf6c088551d74da917b0ad9c1e83366afa9a50
#   date:    Wed, 25 Mar 2020 00:10:43 +0100
#   subject: Adding upstream version 1.4.13.
#   signed-off-by: Daniel Baumann
#   file:    identify/identify.py (new file, index 0000000..8a21d8b)
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import io
import os.path
import re
import shlex
import string
import sys

from identify import extensions
from identify import interpreters
from identify.vendor import licenses


# Characters a shebang line is allowed to contain (see `parse_shebang`).
printable = frozenset(string.printable)

DIRECTORY = 'directory'
SYMLINK = 'symlink'
FILE = 'file'
EXECUTABLE = 'executable'
NON_EXECUTABLE = 'non-executable'
TEXT = 'text'
BINARY = 'binary'

# Every tag this module can emit: the fixed tags above plus every tag listed
# in the extension / filename / interpreter lookup tables.
ALL_TAGS = {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
ALL_TAGS.update(*extensions.EXTENSIONS.values())
ALL_TAGS.update(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
ALL_TAGS.update(*extensions.NAMES.values())
ALL_TAGS.update(*interpreters.INTERPRETERS.values())
ALL_TAGS = frozenset(ALL_TAGS)


def tags_from_path(path):
    """Return the set of tags describing the filesystem entry at `path`.

    A symlink yields exactly `{SYMLINK}` and a directory exactly
    `{DIRECTORY}`.  Anything else is tagged `FILE`, one of
    `EXECUTABLE` / `NON_EXECUTABLE`, the tags derived from its filename (or,
    failing that, from its shebang when it is executable) and one of
    `TEXT` / `BINARY`.

    Raises `ValueError` if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    # Test for a symlink *before* testing for a directory: `os.path.isdir`
    # follows symlinks, so a link pointing at a directory would otherwise be
    # reported as a directory while every other link is reported as a symlink.
    if os.path.islink(path):
        return {SYMLINK}
    if os.path.isdir(path):
        return {DIRECTORY}

    tags = {FILE}

    executable = os.access(path, os.X_OK)
    if executable:
        tags.add(EXECUTABLE)
    else:
        tags.add(NON_EXECUTABLE)

    # As an optimization, if we're able to read tags from the filename, then
    # we don't peek at the file contents.
    t = tags_from_filename(os.path.basename(path))
    if t:
        tags.update(t)
    elif executable:
        shebang = parse_shebang_from_file(path)
        if shebang:
            tags.update(tags_from_interpreter(shebang[0]))

    # some extensions can be both binary and text
    # see EXTENSIONS_NEED_BINARY_CHECK
    if not {TEXT, BINARY} & tags:
        if file_is_text(path):
            tags.add(TEXT)
        else:
            tags.add(BINARY)

    # Internal invariants, not input validation.
    assert {TEXT, BINARY} & tags, tags
    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
    return tags


def tags_from_filename(filename):
    """Return the tags implied by `filename` alone (no filesystem access).

    Only the final path component is considered.  The whole name and each of
    its dot-separated parts are looked up in `extensions.NAMES` (first hit
    wins); the lower-cased extension is then looked up in
    `extensions.EXTENSIONS` and, failing that, in
    `extensions.EXTENSIONS_NEED_BINARY_CHECK`.  Returns an empty set when
    nothing matches.
    """
    _, filename = os.path.split(filename)
    _, ext = os.path.splitext(filename)

    ret = set()

    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
    for part in [filename] + filename.split('.'):
        if part in extensions.NAMES:
            ret.update(extensions.NAMES[part])
            break

    if ext:
        # drop the leading dot; the lookup tables are keyed case-insensitively
        ext = ext[1:].lower()
        if ext in extensions.EXTENSIONS:
            ret.update(extensions.EXTENSIONS[ext])
        elif ext in extensions.EXTENSIONS_NEED_BINARY_CHECK:
            ret.update(extensions.EXTENSIONS_NEED_BINARY_CHECK[ext])

    return ret


def tags_from_interpreter(interpreter):
    """Return the tags registered for `interpreter` in `INTERPRETERS`.

    Any leading directory part is ignored.  Dotted version suffixes are
    stripped one at a time until a known interpreter name is found; an empty
    set is returned when none matches.
    """
    _, _, interpreter = interpreter.rpartition('/')

    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
    while interpreter:
        if interpreter in interpreters.INTERPRETERS:
            return interpreters.INTERPRETERS[interpreter]
        interpreter, _, _ = interpreter.rpartition('.')

    return set()


def is_text(bytesio):
    """Return whether the first KB of contents seems to be text.

    `bytesio` must be a file-like object opened for reading bytes; up to
    1024 bytes are consumed from it.

    This is roughly based on libmagic's binary/text detection:
    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
    """
    text_chars = (
        bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
        bytearray(range(0x20, 0x7F)) +
        bytearray(range(0x80, 0x100))
    )
    # Delete every "text" byte; anything left over is a non-text byte.
    return not bool(bytesio.read(1024).translate(None, text_chars))


def file_is_text(path):
    """Return whether the file at `path` looks like text (see `is_text`).

    Raises `ValueError` if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    with open(path, 'rb') as f:
        return is_text(f)


def _shebang_split(line):
    """Split the text following `#!` into a list of command parts."""
    try:
        # shebangs aren't supposed to be quoted, though some tools such as
        # setuptools will write them with quotes so we'll best-guess parse
        # with shlex first
        return shlex.split(line)
    except ValueError:
        # failing that, we'll do a more "traditional" shebang parsing which
        # just involves splitting by whitespace
        return line.split()


def parse_shebang(bytesio):
    """Parse the shebang from a file opened for reading binary.

    Returns the command as a tuple of strings with a leading
    `/usr/bin/env` removed.  Returns `()` when the stream does not start
    with `#!`, when the first line is not valid UTF-8, or when it contains
    a character outside `string.printable`.
    """
    if bytesio.read(2) != b'#!':
        return ()
    first_line = bytesio.readline()
    try:
        first_line = first_line.decode('UTF-8')
    except UnicodeDecodeError:
        return ()

    # Require only printable ascii
    for c in first_line:
        if c not in printable:
            return ()

    cmd = tuple(_shebang_split(first_line.strip()))
    if cmd and cmd[0] == '/usr/bin/env':
        cmd = cmd[1:]
    return cmd


def parse_shebang_from_file(path):
    """Parse the shebang given a file path.

    Returns `()` without opening the file when it is not executable.
    Raises `ValueError` if `path` does not exist.
    """
    if not os.path.lexists(path):
        raise ValueError('{} does not exist.'.format(path))
    if not os.access(path, os.X_OK):
        return ()

    with open(path, 'rb') as f:
        return parse_shebang(f)


# Whole lines that start (after optional whitespace) with "Copyright " or
# "(C) ", matched case-insensitively.
COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
WS_RE = re.compile(r'\s+')


def _norm_license(s):
    """Normalize license text: drop copyright lines, collapse whitespace."""
    s = COPYRIGHT_RE.sub('', s)
    s = WS_RE.sub(' ', s)
    return s.strip()


def license_id(filename):
    """Return the spdx id for the license contained in `filename`.  If no
    license is detected, returns `None`.

    spdx: https://spdx.org/licenses/
    licenses from choosealicense.com: https://github.com/choosealicense.com

    Approximate algorithm:

    1. strip copyright line
    2. normalize whitespace (replace all whitespace with a single space)
    3. check exact text match with existing licenses
    4. failing that use edit distance
    """
    import editdistance  # `pip install identify[license]`

    with io.open(filename, encoding='UTF-8') as f:
        contents = f.read()

    norm = _norm_license(contents)

    min_edit_dist = sys.maxsize
    min_edit_dist_spdx = ''

    # try exact matches
    for spdx, text in licenses.LICENSES:
        norm_license = _norm_license(text)
        if norm == norm_license:
            return spdx

        # an empty document can only ever match exactly (the final check
        # below requires a non-empty `norm`), so skip the fuzzy comparison
        if not norm:
            continue

        # skip the slow calculation if the lengths are very different
        if abs(len(norm) - len(norm_license)) / len(norm) > .05:
            continue

        edit_dist = editdistance.eval(norm, norm_license)
        if edit_dist < min_edit_dist:
            min_edit_dist = edit_dist
            min_edit_dist_spdx = spdx

    # if there's less than 5% edited from the license, we found our match
    if norm and min_edit_dist / len(norm) < .05:
        return min_edit_dist_spdx
    else:
        # no matches :'(
        return None