summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- identify/extensions.py 3
-rw-r--r-- identify/identify.py 11
-rw-r--r-- setup.cfg 4
3 files changed, 12 insertions, 6 deletions
diff --git a/identify/extensions.py b/identify/extensions.py
index 0c1eb85..de41c1a 100644
--- a/identify/extensions.py
+++ b/identify/extensions.py
@@ -1,6 +1,7 @@
EXTENSIONS = {
'adoc': {'text', 'asciidoc'},
'ai': {'binary', 'adobe-illustrator'},
+ 'aj': {'text', 'aspectj'},
'asciidoc': {'text', 'asciidoc'},
'apinotes': {'text', 'apinotes'},
'asar': {'binary', 'asar'},
@@ -287,6 +288,8 @@ NAMES = {
'Gemfile': EXTENSIONS['rb'],
'Gemfile.lock': {'text'},
'GNUmakefile': EXTENSIONS['mk'],
+ 'go.mod': {'text', 'go-mod'},
+ 'go.sum': {'text', 'go-sum'},
'Jenkinsfile': EXTENSIONS['jenkins'],
'LICENSE': EXTENSIONS['txt'],
'MAINTAINERS': EXTENSIONS['txt'],
diff --git a/identify/identify.py b/identify/identify.py
index 4d1b555..e61626f 100644
--- a/identify/identify.py
+++ b/identify/identify.py
@@ -1,4 +1,5 @@
import errno
+import math
import os.path
import re
import shlex
@@ -244,7 +245,7 @@ def license_id(filename: str) -> Optional[str]:
3. check exact text match with existing licenses
4. failing that use edit distance
"""
- import editdistance_s # `pip install identify[license]`
+ import ukkonen # `pip install identify[license]`
with open(filename, encoding='UTF-8') as f:
contents = f.read()
@@ -254,6 +255,8 @@ def license_id(filename: str) -> Optional[str]:
min_edit_dist = sys.maxsize
min_edit_dist_spdx = ''
+ cutoff = math.ceil(.05 * len(norm))
+
# try exact matches
for spdx, text in licenses.LICENSES:
norm_license = _norm_license(text)
@@ -264,13 +267,13 @@ def license_id(filename: str) -> Optional[str]:
if norm and abs(len(norm) - len(norm_license)) / len(norm) > .05:
continue
- edit_dist = editdistance_s.distance(norm, norm_license)
- if edit_dist < min_edit_dist:
+ edit_dist = ukkonen.distance(norm, norm_license, cutoff)
+ if edit_dist < cutoff and edit_dist < min_edit_dist:
min_edit_dist = edit_dist
min_edit_dist_spdx = spdx
# if there's less than 5% edited from the license, we found our match
- if norm and min_edit_dist / len(norm) < .05:
+ if norm and min_edit_dist < cutoff:
return min_edit_dist_spdx
else:
# no matches :'(
diff --git a/setup.cfg b/setup.cfg
index 88d1aea..43b5aff 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = identify
-version = 2.3.5
+version = 2.4.0
description = File identification library for Python
long_description = file: README.md
long_description_content_type = text/markdown
@@ -36,7 +36,7 @@ console_scripts =
[options.extras_require]
license =
- editdistance-s
+ ukkonen
[bdist_wheel]
universal = True