1 files changed, 230 insertions, 0 deletions
diff --git a/identify/identify.py b/identify/identify.py
new file mode 100644
index 0000000..8a21d8b
--- /dev/null
+++ b/identify/identify.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import unicode_literals
+
+import io
+import os.path
+import re
+import shlex
+import string
+import sys
+
+from identify import extensions
+from identify import interpreters
+from identify.vendor import licenses
+
+
+# Set of characters from `string.printable`, used for membership tests.
+printable = frozenset(string.printable)
+
+# Tags for the kind of filesystem entry.
+DIRECTORY = 'directory'
+SYMLINK = 'symlink'
+FILE = 'file'
+# Tags for the executable bit of a file.
+EXECUTABLE = 'executable'
+NON_EXECUTABLE = 'non-executable'
+# Tags for the contents of a file.
+TEXT = 'text'
+BINARY = 'binary'
+
+# Union of the fixed tags above and every tag found in the values of the
+# extension, filename and interpreter lookup tables.
+ALL_TAGS = frozenset(
+    {DIRECTORY, SYMLINK, FILE, EXECUTABLE, NON_EXECUTABLE, TEXT, BINARY}
+    .union(*extensions.EXTENSIONS.values())
+    .union(*extensions.EXTENSIONS_NEED_BINARY_CHECK.values())
+    .union(*extensions.NAMES.values())
+    .union(*interpreters.INTERPRETERS.values())
+)
+
+
+def tags_from_path(path):
+    """Return the set of tags describing the filesystem entry at `path`.
+
+    A symlink yields `{SYMLINK}` and a directory yields `{DIRECTORY}`; neither
+    is inspected further.  Anything else is tagged `FILE`, then `EXECUTABLE`
+    or `NON_EXECUTABLE` according to `os.access`, then any tags derived from
+    its filename.  When the filename gives no tags and the file is
+    executable, tags are derived from its shebang instead.  If none of that
+    produced `TEXT` or `BINARY`, the file contents are checked to add one.
+
+    Raises `ValueError` if `path` does not exist.
+    """
+    if not os.path.lexists(path):
+        raise ValueError('{} does not exist.'.format(path))
+    # Test for a symlink *before* a directory: `os.path.isdir` follows
+    # symlinks, so a link pointing at a directory would otherwise be
+    # reported as a directory and never reach the symlink check.
+    if os.path.islink(path):
+        return {SYMLINK}
+    if os.path.isdir(path):
+        return {DIRECTORY}
+
+    executable = os.access(path, os.X_OK)
+    tags = {FILE, EXECUTABLE if executable else NON_EXECUTABLE}
+
+    # As an optimization, if we're able to read tags from the filename, then we
+    # don't peek at the file contents.
+    filename_tags = tags_from_filename(os.path.basename(path))
+    if filename_tags:
+        tags.update(filename_tags)
+    elif executable:
+        shebang = parse_shebang_from_file(path)
+        if shebang:
+            tags.update(tags_from_interpreter(shebang[0]))
+
+    # some extensions can be both binary and text
+    # see EXTENSIONS_NEED_BINARY_CHECK
+    if not {TEXT, BINARY} & tags:
+        tags.add(TEXT if file_is_text(path) else BINARY)
+
+    # Internal sanity checks on the invariants established above.
+    assert {TEXT, BINARY} & tags, tags
+    assert {EXECUTABLE, NON_EXECUTABLE} & tags, tags
+    return tags
+
+
+def tags_from_filename(filename):
+    """Return the set of tags that can be derived from `filename` alone.
+
+    Only the final path component is considered.  The whole name, then each
+    of its dot-separated parts, is looked up (case-sensitively) in
+    `extensions.NAMES`; the first hit contributes its tags.  Independently,
+    the lower-cased extension is looked up in `extensions.EXTENSIONS` and,
+    failing that, in `extensions.EXTENSIONS_NEED_BINARY_CHECK`.
+
+    Returns an empty set when nothing matches.
+    """
+    basename = os.path.basename(filename)
+    tags = set()
+
+    # Allow e.g. "Dockerfile.xenial" to match "Dockerfile"
+    candidates = [basename]
+    candidates.extend(basename.split('.'))
+    known_name = next(
+        (part for part in candidates if part in extensions.NAMES), None,
+    )
+    if known_name is not None:
+        tags.update(extensions.NAMES[known_name])
+
+    suffix = os.path.splitext(basename)[1]
+    if suffix:
+        # drop the leading dot; extension lookups are case-insensitive
+        key = suffix[1:].lower()
+        tables = (
+            extensions.EXTENSIONS,
+            extensions.EXTENSIONS_NEED_BINARY_CHECK,
+        )
+        for table in tables:
+            if key in table:
+                tags.update(table[key])
+                break
+
+    return tags
+
+
+def tags_from_interpreter(interpreter):
+    """Return the tags registered for `interpreter` in `INTERPRETERS`.
+
+    Everything up to and including the last `/` is discarded.  If the
+    remaining name is not in the table, its last dot-separated component is
+    dropped and the lookup is retried, until a match is found or nothing is
+    left.  Returns the table entry on a match, otherwise an empty set.
+    """
+    name = interpreter.rpartition('/')[2]
+
+    # Try "python3.5.2" => "python3.5" => "python3" until one matches.
+    while name and name not in interpreters.INTERPRETERS:
+        name = name.rpartition('.')[0]
+
+    if name:
+        return interpreters.INTERPRETERS[name]
+    return set()
+
+
+# Byte values that `is_text` accepts as text: a handful of control characters
+# (BEL, BS, TAB, LF, VT, FF, CR, ESC), 0x20-0x7E, and 0x80-0xFF.
+# Built once at import time rather than on every call.
+_TEXT_CHARS = bytes(
+    bytearray([7, 8, 9, 10, 11, 12, 13, 27]) +
+    bytearray(range(0x20, 0x7F)) +
+    bytearray(range(0x80, 0x100))
+)
+
+
+def is_text(bytesio):
+    """Return whether the first KB of contents seems to be text.
+
+    `bytesio` is a file-like object opened for reading binary; up to 1024
+    bytes are read from its current position.  The result is `True` when
+    every byte read is one of the accepted text bytes (an empty read is
+    therefore text) and `False` otherwise.
+
+    This is roughly based on libmagic's binary/text detection:
+    https://github.com/file/file/blob/df74b09b9027676088c797528edcaae5a9ce9ad0/src/encoding.c#L203-L228
+    """
+    # Delete every text byte; anything left over is a non-text byte.
+    return not bytesio.read(1024).translate(None, _TEXT_CHARS)
+
+
+def file_is_text(path):
+    """Return the result of `is_text` for the file at `path`.
+
+    The file is opened in binary mode.  Raises `ValueError` if `path` does
+    not exist.
+    """
+    if os.path.lexists(path):
+        with open(path, 'rb') as stream:
+            return is_text(stream)
+    raise ValueError('{} does not exist.'.format(path))
+
+
+def _shebang_split(line):
+    """Split a shebang line (without the leading `#!`) into its arguments.
+
+    Returns a list of strings.
+    """
+    try:
+        # shebangs aren't supposed to be quoted, though some tools such as
+        # setuptools will write them with quotes so we'll best-guess parse
+        # with shlex first
+        parts = shlex.split(line)
+    except ValueError:
+        # failing that, we'll do a more "traditional" shebang parsing which
+        # just involves splitting by whitespace
+        parts = line.split()
+    return parts
+
+
+def parse_shebang(bytesio):
+    """Parse the shebang from a file opened for reading binary.
+
+    Returns a tuple of the shebang's arguments, with a leading
+    `/usr/bin/env` removed.  Returns an empty tuple when the stream does not
+    start with `#!`, when the rest of the first line is not valid UTF-8, or
+    when it contains a character outside `printable`.
+    """
+    if bytesio.read(2) != b'#!':
+        return ()
+    raw_line = bytesio.readline()
+    try:
+        first_line = raw_line.decode('UTF-8')
+    except UnicodeDecodeError:
+        return ()
+
+    # Require only printable ascii
+    if not printable.issuperset(first_line):
+        return ()
+
+    cmd = tuple(_shebang_split(first_line.strip()))
+    if cmd[:1] == ('/usr/bin/env',):
+        return cmd[1:]
+    return cmd
+
+
+def parse_shebang_from_file(path):
+    """Parse the shebang given a file path.
+
+    Returns the result of `parse_shebang` for the file, or an empty tuple
+    when `os.access` reports the file as not executable (the file is not
+    opened in that case).  Raises `ValueError` if `path` does not exist.
+    """
+    if not os.path.lexists(path):
+        raise ValueError('{} does not exist.'.format(path))
+
+    if os.access(path, os.X_OK):
+        with open(path, 'rb') as stream:
+            return parse_shebang(stream)
+    return ()
+
+
+# Matches a whole line that starts (after optional whitespace) with
+# "Copyright " or "(C) ", case-insensitively.
+COPYRIGHT_RE = re.compile(r'^\s*(Copyright|\(C\)) .*$', re.I | re.MULTILINE)
+# Matches any run of whitespace.
+WS_RE = re.compile(r'\s+')
+
+
+def _norm_license(s):
+    """Return `s` normalized for license comparison.
+
+    Copyright lines are removed, every run of whitespace is collapsed to a
+    single space, and leading / trailing whitespace is stripped.
+    """
+    without_copyright = COPYRIGHT_RE.sub('', s)
+    return WS_RE.sub(' ', without_copyright).strip()
+
+
+def license_id(filename):
+    """Return the spdx id for the license contained in `filename`.  If no
+    license is detected, returns `None`.
+
+    The file is read as UTF-8.  Requires the optional `editdistance`
+    package, which is imported on every call.
+
+    spdx: https://spdx.org/licenses/
+    licenses from choosealicense.com: https://github.com/choosealicense.com
+
+    Approximate algorithm:
+
+    1. strip copyright line
+    2. normalize whitespace (replace all whitespace with a single space)
+    3. check exact text match with existing licenses
+    4. failing that use edit distance
+    """
+    import editdistance  # `pip install identify[license]`
+
+    with io.open(filename, encoding='UTF-8') as f:
+        norm = _norm_license(f.read())
+
+    best_dist, best_spdx = sys.maxsize, ''
+
+    for spdx, text in licenses.LICENSES:
+        candidate = _norm_license(text)
+
+        # an exact match wins immediately
+        if candidate == norm:
+            return spdx
+
+        # skip the slow calculation if the lengths are very different
+        length_gap = abs(len(norm) - len(candidate))
+        if norm and length_gap / len(norm) > .05:
+            continue
+
+        # otherwise remember the closest license seen so far
+        dist = editdistance.eval(norm, candidate)
+        if dist < best_dist:
+            best_dist, best_spdx = dist, spdx
+
+    # if there's less than 5% edited from the license, we found our match
+    if norm and best_dist / len(norm) < .05:
+        return best_spdx
+
+    # no matches :'(
+    return None