summary | refs | log | tree | commit | diff | stats
path: root/third_party/python/python_slugify/slugify/slugify.py
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- third_party/python/python_slugify/slugify/slugify.py 177
1 files changed, 177 insertions, 0 deletions
diff --git a/third_party/python/python_slugify/slugify/slugify.py b/third_party/python/python_slugify/slugify/slugify.py
new file mode 100644
index 0000000000..5354fa5e44
--- /dev/null
+++ b/third_party/python/python_slugify/slugify/slugify.py
@@ -0,0 +1,177 @@
+import re
+import sys
+import unicodedata
+from html.entities import name2codepoint
+
+try:
+ import unidecode
+except ImportError:
+ import text_unidecode as unidecode
+
+# Names exported by ``from slugify import *``.
+__all__ = ['slugify', 'smart_truncate']
+
+
+# Named HTML character references such as "&amp;"; group 1 is the entity name
+# (the alternation is built from every key of ``name2codepoint``).
+CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
+# Decimal numeric character references such as "&#38;"; group 1 is the digits.
+DECIMAL_PATTERN = re.compile(r'&#(\d+);')
+# Hexadecimal numeric character references such as "&#x26;"; group 1 is the hex digits.
+HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
+# One or more consecutive apostrophes.
+QUOTE_PATTERN = re.compile(r'[\']+')
+# Runs of characters other than ASCII letters, ASCII digits and the hyphen.
+DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
+# Runs of non-word characters or underscores (used when unicode output is allowed).
+DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
+# Two or more consecutive hyphens.
+DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
+# A comma sitting directly between two digits, e.g. the comma in "1,000".
+NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
+# Separator used internally while building the slug; swapped for the caller's
+# separator only at the very end of ``slugify``.
+DEFAULT_SEPARATOR = '-'
+
+
+def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
+    """
+    Truncate a string.
+
+    :param string (str): string for modification
+    :param max_length (int): maximum length of the output string; zero, None or a
+        negative value disables truncation
+    :param word_boundary (bool): if True, build the result from whole words only
+        (words are the non-empty pieces between ``separator`` occurrences)
+    :param save_order (bool): if True then word order of output string is like input
+        string: stop at the first word that does not fit. If False, a word that does
+        not fit is skipped and later, shorter words may still be appended.
+    :param separator (str): separator between words
+    :return (str): the truncated string with leading/trailing separator characters removed
+    """
+
+    string = string.strip(separator)
+
+    # A falsy or negative limit means "no limit". Without the negative guard,
+    # ``string[:max_length]`` would silently drop characters from the end.
+    if not max_length or max_length < 0:
+        return string
+
+    if len(string) < max_length:
+        return string
+
+    if not word_boundary:
+        return string[:max_length].strip(separator)
+
+    # Only one word: there is no boundary to respect, so cut it hard.
+    if separator not in string:
+        return string[:max_length]
+
+    truncated = ''
+    for word in string.split(separator):
+        if not word:
+            # Consecutive separators yield empty pieces; ignore them.
+            continue
+        next_len = len(truncated) + len(word)
+        if next_len < max_length:
+            truncated += word + separator
+        elif next_len == max_length:
+            # Exact fit: nothing more can be added.
+            truncated += word
+            break
+        elif save_order:
+            # Word does not fit and skipping it would break the original order.
+            break
+        # Otherwise the word is skipped and shorter words further on may still fit.
+
+    if not truncated:  # pragma: no cover
+        truncated = string[:max_length]
+    return truncated.strip(separator)
+
+
+def _numeric_ref_to_char(match, base):
+    """Decode one numeric HTML reference match; return the matched text unchanged if the code point is invalid."""
+    try:
+        return chr(int(match.group(1), base))
+    except (ValueError, OverflowError):
+        # Out-of-range code point: leave just this reference as it was.
+        return match.group(0)
+
+
+def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
+            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
+            replacements=(), allow_unicode=False):
+    """
+    Make a slug from the given text.
+
+    :param text (str): initial text; bytes are decoded as UTF-8, ignoring invalid sequences
+    :param entities (bool): converts html entities to unicode
+    :param decimal (bool): converts html decimal to unicode
+    :param hexadecimal (bool): converts html hexadecimal to unicode
+    :param max_length (int): output string length
+    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
+    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
+    :param separator (str): separator between words
+    :param stopwords (iterable): words to discount
+    :param regex_pattern (str): regex pattern for disallowed characters
+    :param lowercase (bool): activate case sensitivity by setting it to False
+    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+    :param allow_unicode (bool): allow unicode characters
+    :return (str): the slug
+    """
+
+    # ensure text is unicode -- done first so that the str-based replacement
+    # rules below also work when the caller passed bytes
+    if not isinstance(text, str):
+        text = str(text, 'utf-8', 'ignore')
+
+    # user-specific replacements
+    if replacements:
+        for old, new in replacements:
+            text = text.replace(old, new)
+
+    # replace quotes with dashes - pre-process
+    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
+
+    # decode unicode
+    if not allow_unicode:
+        text = unidecode.unidecode(text)
+
+    # character entity reference
+    if entities:
+        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
+
+    # decimal character reference (an invalid reference is left untouched
+    # without preventing the valid ones from being decoded)
+    if decimal:
+        text = DECIMAL_PATTERN.sub(lambda m: _numeric_ref_to_char(m, 10), text)
+
+    # hexadecimal character reference
+    if hexadecimal:
+        text = HEX_PATTERN.sub(lambda m: _numeric_ref_to_char(m, 16), text)
+
+    # translate
+    text = unicodedata.normalize('NFKC' if allow_unicode else 'NFKD', text)
+
+    # make the text lowercase (optional)
+    if lowercase:
+        text = text.lower()
+
+    # remove generated quotes -- post-process
+    text = QUOTE_PATTERN.sub('', text)
+
+    # cleanup numbers
+    text = NUMBERS_PATTERN.sub('', text)
+
+    # replace all other unwanted characters
+    if allow_unicode:
+        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
+    else:
+        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+
+    text = re.sub(pattern, DEFAULT_SEPARATOR, text)
+
+    # remove redundant
+    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
+
+    # remove stopwords (built into a set once, so a one-shot iterable is not
+    # exhausted after the first word is checked)
+    if stopwords:
+        if lowercase:
+            excluded = {s.lower() for s in stopwords}
+        else:
+            excluded = set(stopwords)
+        text = DEFAULT_SEPARATOR.join(w for w in text.split(DEFAULT_SEPARATOR) if w not in excluded)
+
+    # finalize user-specific replacements
+    if replacements:
+        for old, new in replacements:
+            text = text.replace(old, new)
+
+    # smart truncate if requested -- runs while the words are still joined by
+    # DEFAULT_SEPARATOR, i.e. before the caller's separator is substituted
+    if max_length > 0:
+        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
+
+    if separator != DEFAULT_SEPARATOR:
+        text = text.replace(DEFAULT_SEPARATOR, separator)
+
+    return text