1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
|
import re
import sys
import unicodedata
from html.entities import name2codepoint
try:
import unidecode
except ImportError:
import text_unidecode as unidecode
__all__ = ['slugify', 'smart_truncate']
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
QUOTE_PATTERN = re.compile(r'[\']+')
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
DEFAULT_SEPARATOR = '-'
def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
"""
Truncate a string.
:param string (str): string for modification
:param max_length (int): output string length
:param word_boundary (bool):
:param save_order (bool): if True then word order of output string is like input string
:param separator (str): separator between words
:return:
"""
string = string.strip(separator)
if not max_length:
return string
if len(string) < max_length:
return string
if not word_boundary:
return string[:max_length].strip(separator)
if separator not in string:
return string[:max_length]
truncated = ''
for word in string.split(separator):
if word:
next_len = len(truncated) + len(word)
if next_len < max_length:
truncated += '{}{}'.format(word, separator)
elif next_len == max_length:
truncated += '{}'.format(word)
break
else:
if save_order:
break
if not truncated: # pragma: no cover
truncated = string[:max_length]
return truncated.strip(separator)
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
replacements=(), allow_unicode=False):
"""
Make a slug from the given text.
:param text (str): initial text
:param entities (bool): converts html entities to unicode
:param decimal (bool): converts html decimal to unicode
:param hexadecimal (bool): converts html hexadecimal to unicode
:param max_length (int): output string length
:param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
:param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
:param separator (str): separator between words
:param stopwords (iterable): words to discount
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
:param allow_unicode (bool): allow unicode characters
:return (str):
"""
# user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)
# ensure text is unicode
if not isinstance(text, str):
text = str(text, 'utf-8', 'ignore')
# replace quotes with dashes - pre-process
text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
# decode unicode
if not allow_unicode:
text = unidecode.unidecode(text)
# ensure text is still in unicode
if not isinstance(text, str):
text = str(text, 'utf-8', 'ignore')
# character entity reference
if entities:
text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
# decimal character reference
if decimal:
try:
text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
except Exception:
pass
# hexadecimal character reference
if hexadecimal:
try:
text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
except Exception:
pass
# translate
if allow_unicode:
text = unicodedata.normalize('NFKC', text)
else:
text = unicodedata.normalize('NFKD', text)
if sys.version_info < (3,):
text = text.encode('ascii', 'ignore')
# make the text lowercase (optional)
if lowercase:
text = text.lower()
# remove generated quotes -- post-process
text = QUOTE_PATTERN.sub('', text)
# cleanup numbers
text = NUMBERS_PATTERN.sub('', text)
# replace all other unwanted characters
if allow_unicode:
pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
else:
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
text = re.sub(pattern, DEFAULT_SEPARATOR, text)
# remove redundant
text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
# remove stopwords
if stopwords:
if lowercase:
stopwords_lower = [s.lower() for s in stopwords]
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
else:
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
text = DEFAULT_SEPARATOR.join(words)
# finalize user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)
# smart truncate if requested
if max_length > 0:
text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
if separator != DEFAULT_SEPARATOR:
text = text.replace(DEFAULT_SEPARATOR, separator)
return text
|