diff options
Diffstat (limited to '')
-rw-r--r-- | identify/extensions.py | 3 | ||||
-rw-r--r-- | identify/identify.py | 11 | ||||
-rw-r--r-- | setup.cfg | 4 |
3 files changed, 12 insertions, 6 deletions
diff --git a/identify/extensions.py b/identify/extensions.py
index 0c1eb85..de41c1a 100644
--- a/identify/extensions.py
+++ b/identify/extensions.py
@@ -1,6 +1,7 @@
 EXTENSIONS = {
     'adoc': {'text', 'asciidoc'},
     'ai': {'binary', 'adobe-illustrator'},
+    'aj': {'text', 'aspectj'},
     'asciidoc': {'text', 'asciidoc'},
     'apinotes': {'text', 'apinotes'},
     'asar': {'binary', 'asar'},
@@ -287,6 +288,8 @@ NAMES = {
     'Gemfile': EXTENSIONS['rb'],
     'Gemfile.lock': {'text'},
     'GNUmakefile': EXTENSIONS['mk'],
+    'go.mod': {'text', 'go-mod'},
+    'go.sum': {'text', 'go-sum'},
     'Jenkinsfile': EXTENSIONS['jenkins'],
     'LICENSE': EXTENSIONS['txt'],
     'MAINTAINERS': EXTENSIONS['txt'],
diff --git a/identify/identify.py b/identify/identify.py
index 4d1b555..e61626f 100644
--- a/identify/identify.py
+++ b/identify/identify.py
@@ -1,4 +1,5 @@
 import errno
+import math
 import os.path
 import re
 import shlex
@@ -244,7 +245,7 @@ def license_id(filename: str) -> Optional[str]:
     3. check exact text match with existing licenses
     4. failing that use edit distance
     """
-    import editdistance_s  # `pip install identify[license]`
+    import ukkonen  # `pip install identify[license]`
 
     with open(filename, encoding='UTF-8') as f:
         contents = f.read()
@@ -254,6 +255,8 @@ def license_id(filename: str) -> Optional[str]:
     min_edit_dist = sys.maxsize
     min_edit_dist_spdx = ''
 
+    cutoff = math.ceil(.05 * len(norm))
+
     # try exact matches
     for spdx, text in licenses.LICENSES:
         norm_license = _norm_license(text)
@@ -264,13 +267,13 @@ def license_id(filename: str) -> Optional[str]:
         if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
             continue
 
-        edit_dist = editdistance_s.distance(norm, norm_license)
-        if edit_dist < min_edit_dist:
+        edit_dist = ukkonen.distance(norm, norm_license, cutoff)
+        if edit_dist < cutoff and edit_dist < min_edit_dist:
             min_edit_dist = edit_dist
             min_edit_dist_spdx = spdx
 
     # if there's less than 5% edited from the license, we found our match
-    if norm and min_edit_dist / len(norm) < .05:
+    if norm and min_edit_dist < cutoff:
         return min_edit_dist_spdx
     else:
         # no matches :'(
diff --git a/setup.cfg b/setup.cfg
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = identify
-version = 2.3.5
+version = 2.4.0
 description = File identification library for Python
 long_description = file: README.md
 long_description_content_type = text/markdown
@@ -36,7 +36,7 @@ console_scripts =
 
 [options.extras_require]
 license =
-    editdistance-s
+    ukkonen
 
 [bdist_wheel]
 universal = True