diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 00:47:55 +0000 |
commit | 26a029d407be480d791972afb5975cf62c9360a6 (patch) | |
tree | f435a8308119effd964b339f76abb83a57c29483 /third_party/python/python_slugify/slugify | |
parent | Initial commit. (diff) | |
download | firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip |
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/python/python_slugify/slugify')
5 files changed, 338 insertions, 0 deletions
```diff
diff --git a/third_party/python/python_slugify/slugify/__init__.py b/third_party/python/python_slugify/slugify/__init__.py
new file mode 100644
index 0000000000..6d3279fb1a
--- /dev/null
+++ b/third_party/python/python_slugify/slugify/__init__.py
@@ -0,0 +1,10 @@
+from .special import *
+from .slugify import *
+from .__version__ import __title__
+from .__version__ import __author__
+from .__version__ import __author_email__
+from .__version__ import __description__
+from .__version__ import __url__
+from .__version__ import __license__
+from .__version__ import __copyright__
+from .__version__ import __version__
diff --git a/third_party/python/python_slugify/slugify/__main__.py b/third_party/python/python_slugify/slugify/__main__.py
new file mode 100644
index 0000000000..7dd6b01a5e
--- /dev/null
+++ b/third_party/python/python_slugify/slugify/__main__.py
@@ -0,0 +1,96 @@
+from __future__ import print_function, absolute_import
+import argparse
+import sys
+
+from .slugify import slugify, DEFAULT_SEPARATOR
+
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description="Slug string")
+
+    input_group = parser.add_argument_group(description="Input")
+    input_group.add_argument("input_string", nargs='*',
+                             help='Text to slugify')
+    input_group.add_argument("--stdin", action='store_true',
+                             help="Take the text from STDIN")
+
+    parser.add_argument("--no-entities", action='store_false', dest='entities', default=True,
+                        help="Do not convert HTML entities to unicode")
+    parser.add_argument("--no-decimal", action='store_false', dest='decimal', default=True,
+                        help="Do not convert HTML decimal to unicode")
+    parser.add_argument("--no-hexadecimal", action='store_false', dest='hexadecimal', default=True,
+                        help="Do not convert HTML hexadecimal to unicode")
+    parser.add_argument("--max-length", type=int, default=0,
+                        help="Output string length, 0 for no limit")
+    parser.add_argument("--word-boundary", action='store_true', default=False,
+                        help="Truncate to complete word even if length ends up shorter than --max_length")
+    parser.add_argument("--save-order", action='store_true', default=False,
+                        help="When set and --max_length > 0 return whole words in the initial order")
+    parser.add_argument("--separator", type=str, default=DEFAULT_SEPARATOR,
+                        help="Separator between words. By default " + DEFAULT_SEPARATOR)
+    parser.add_argument("--stopwords", nargs='+',
+                        help="Words to discount")
+    parser.add_argument("--regex-pattern",
+                        help="Python regex pattern for disallowed characters")
+    parser.add_argument("--no-lowercase", action='store_false', dest='lowercase', default=True,
+                        help="Activate case sensitivity")
+    parser.add_argument("--replacements", nargs='+',
+                        help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
+    parser.add_argument("--allow-unicode", action='store_true', default=False,
+                        help="Allow unicode characters")
+
+    args = parser.parse_args(argv[1:])
+
+    if args.input_string and args.stdin:
+        parser.error("Input strings and --stdin cannot work together")
+
+    if args.replacements:
+        def split_check(repl):
+            SEP = '->'
+            if SEP not in repl:
+                parser.error("Replacements must be of the form: ORIGINAL{SEP}REPLACED".format(SEP=SEP))
+            return repl.split(SEP, 1)
+        args.replacements = [split_check(repl) for repl in args.replacements]
+
+    if args.input_string:
+        args.input_string = " ".join(args.input_string)
+    elif args.stdin:
+        args.input_string = sys.stdin.read()
+
+    if not args.input_string:
+        args.input_string = ''
+
+    return args
+
+
+def slugify_params(args):
+    return dict(
+        text=args.input_string,
+        entities=args.entities,
+        decimal=args.decimal,
+        hexadecimal=args.hexadecimal,
+        max_length=args.max_length,
+        word_boundary=args.word_boundary,
+        save_order=args.save_order,
+        separator=args.separator,
+        stopwords=args.stopwords,
+        lowercase=args.lowercase,
+        replacements=args.replacements,
+        allow_unicode=args.allow_unicode
+    )
+
+
+def main(argv=None):  # pragma: no cover
+    """ Run this program """
+    if argv is None:
+        argv = sys.argv
+    args = parse_args(argv)
+    params = slugify_params(args)
+    try:
+        print(slugify(**params))
+    except KeyboardInterrupt:
+        sys.exit(-1)
+
+
+if __name__ == '__main__':  # pragma: no cover
+    main()
diff --git a/third_party/python/python_slugify/slugify/__version__.py b/third_party/python/python_slugify/slugify/__version__.py
new file mode 100644
index 0000000000..a558d9bce4
--- /dev/null
+++ b/third_party/python/python_slugify/slugify/__version__.py
@@ -0,0 +1,8 @@
+__title__ = 'python-slugify'
+__author__ = 'Val Neekman'
+__author_email__ = 'info@neekware.com'
+__description__ = 'A Python slugify application that also handles Unicode'
+__url__ = 'https://github.com/un33k/python-slugify'
+__license__ = 'MIT'
+__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.'
+__version__ = '8.0.1'
diff --git a/third_party/python/python_slugify/slugify/slugify.py b/third_party/python/python_slugify/slugify/slugify.py
new file mode 100644
index 0000000000..5354fa5e44
--- /dev/null
+++ b/third_party/python/python_slugify/slugify/slugify.py
@@ -0,0 +1,177 @@
+import re
+import sys
+import unicodedata
+from html.entities import name2codepoint
+
+try:
+    import unidecode
+except ImportError:
+    import text_unidecode as unidecode
+
+__all__ = ['slugify', 'smart_truncate']
+
+
+CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
+DECIMAL_PATTERN = re.compile(r'&#(\d+);')
+HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
+QUOTE_PATTERN = re.compile(r'[\']+')
+DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
+DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
+DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
+NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
+DEFAULT_SEPARATOR = '-'
+
+
+def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False):
+    """
+    Truncate a string.
+    :param string (str): string for modification
+    :param max_length (int): output string length
+    :param word_boundary (bool):
+    :param save_order (bool): if True then word order of output string is like input string
+    :param separator (str): separator between words
+    :return:
+    """
+
+    string = string.strip(separator)
+
+    if not max_length:
+        return string
+
+    if len(string) < max_length:
+        return string
+
+    if not word_boundary:
+        return string[:max_length].strip(separator)
+
+    if separator not in string:
+        return string[:max_length]
+
+    truncated = ''
+    for word in string.split(separator):
+        if word:
+            next_len = len(truncated) + len(word)
+            if next_len < max_length:
+                truncated += '{}{}'.format(word, separator)
+            elif next_len == max_length:
+                truncated += '{}'.format(word)
+                break
+            else:
+                if save_order:
+                    break
+    if not truncated:  # pragma: no cover
+        truncated = string[:max_length]
+    return truncated.strip(separator)
+
+
+def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
+            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
+            replacements=(), allow_unicode=False):
+    """
+    Make a slug from the given text.
+    :param text (str): initial text
+    :param entities (bool): converts html entities to unicode
+    :param decimal (bool): converts html decimal to unicode
+    :param hexadecimal (bool): converts html hexadecimal to unicode
+    :param max_length (int): output string length
+    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
+    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
+    :param separator (str): separator between words
+    :param stopwords (iterable): words to discount
+    :param regex_pattern (str): regex pattern for disallowed characters
+    :param lowercase (bool): activate case sensitivity by setting it to False
+    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+    :param allow_unicode (bool): allow unicode characters
+    :return (str):
+    """
+
+    # user-specific replacements
+    if replacements:
+        for old, new in replacements:
+            text = text.replace(old, new)
+
+    # ensure text is unicode
+    if not isinstance(text, str):
+        text = str(text, 'utf-8', 'ignore')
+
+    # replace quotes with dashes - pre-process
+    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
+
+    # decode unicode
+    if not allow_unicode:
+        text = unidecode.unidecode(text)
+
+    # ensure text is still in unicode
+    if not isinstance(text, str):
+        text = str(text, 'utf-8', 'ignore')
+
+    # character entity reference
+    if entities:
+        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
+
+    # decimal character reference
+    if decimal:
+        try:
+            text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
+        except Exception:
+            pass
+
+    # hexadecimal character reference
+    if hexadecimal:
+        try:
+            text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
+        except Exception:
+            pass
+
+    # translate
+    if allow_unicode:
+        text = unicodedata.normalize('NFKC', text)
+    else:
+        text = unicodedata.normalize('NFKD', text)
+
+    if sys.version_info < (3,):
+        text = text.encode('ascii', 'ignore')
+
+    # make the text lowercase (optional)
+    if lowercase:
+        text = text.lower()
+
+    # remove generated quotes -- post-process
+    text = QUOTE_PATTERN.sub('', text)
+
+    # cleanup numbers
+    text = NUMBERS_PATTERN.sub('', text)
+
+    # replace all other unwanted characters
+    if allow_unicode:
+        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
+    else:
+        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+
+    text = re.sub(pattern, DEFAULT_SEPARATOR, text)
+
+    # remove redundant
+    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
+
+    # remove stopwords
+    if stopwords:
+        if lowercase:
+            stopwords_lower = [s.lower() for s in stopwords]
+            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
+        else:
+            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
+        text = DEFAULT_SEPARATOR.join(words)
+
+    # finalize user-specific replacements
+    if replacements:
+        for old, new in replacements:
+            text = text.replace(old, new)
+
+    # smart truncate if requested
+    if max_length > 0:
+        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
+
+    if separator != DEFAULT_SEPARATOR:
+        text = text.replace(DEFAULT_SEPARATOR, separator)
+
+    return text
diff --git a/third_party/python/python_slugify/slugify/special.py b/third_party/python/python_slugify/slugify/special.py
new file mode 100644
index 0000000000..54eb85c70e
--- /dev/null
+++ b/third_party/python/python_slugify/slugify/special.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+
+def add_uppercase_char(char_list):
+    """ Given a replacement char list, this adds uppercase chars to the list """
+
+    for item in char_list:
+        char, xlate = item
+        upper_dict = char.upper(), xlate.capitalize()
+        if upper_dict not in char_list and char != upper_dict[0]:
+            char_list.insert(0, upper_dict)
+    return char_list
+
+
+# Language specific pre translations
+# Source awesome-slugify
+
+_CYRILLIC = [  # package defaults:
+    (u'ё', u'e'),  # io / yo
+    (u'я', u'ya'),  # ia
+    (u'х', u'h'),  # kh
+    (u'у', u'y'),  # u
+    (u'щ', u'sch'),  # sch
+    (u'ю', u'u'),  # iu / yu
+]
+CYRILLIC = add_uppercase_char(_CYRILLIC)
+
+_GERMAN = [  # package defaults:
+    (u'ä', u'ae'),  # a
+    (u'ö', u'oe'),  # o
+    (u'ü', u'ue'),  # u
+]
+GERMAN = add_uppercase_char(_GERMAN)
+
+_GREEK = [  # package defaults:
+    (u'χ', u'ch'),  # kh
+    (u'Ξ', u'X'),  # Ks
+    (u'ϒ', u'Y'),  # U
+    (u'υ', u'y'),  # u
+    (u'ύ', u'y'),
+    (u'ϋ', u'y'),
+    (u'ΰ', u'y'),
+]
+GREEK = add_uppercase_char(_GREEK)
+
+# Pre translations
+PRE_TRANSLATIONS = CYRILLIC + GERMAN + GREEK
```