summaryrefslogtreecommitdiffstats
path: root/third_party/python/chardet/chardet/metadata/languages.py
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/python/chardet/chardet/metadata/languages.py')
-rw-r--r--third_party/python/chardet/chardet/metadata/languages.py310
1 files changed, 310 insertions, 0 deletions
diff --git a/third_party/python/chardet/chardet/metadata/languages.py b/third_party/python/chardet/chardet/metadata/languages.py
new file mode 100644
index 0000000000..3237d5abf6
--- /dev/null
+++ b/third_party/python/chardet/chardet/metadata/languages.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Metadata about languages used by our model training code for our
+SingleByteCharSetProbers. Could be used for other things in the future.
+
+This code is based on the language metadata from the uchardet project.
+"""
+from __future__ import absolute_import, print_function
+
+from string import ascii_letters
+
+
+# TODO: Add Ukrainian (KOI8-U)
+
+class Language(object):
+    """Metadata about a language useful for training models
+
+    :ivar name: The human name for the language, in English.
+    :type name: str
+    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
+                    or use another catalog as a last resort.
+    :type iso_code: str
+    :ivar use_ascii: Whether or not ASCII letters should be included in trained
+                     models.
+    :type use_ascii: bool
+    :ivar charsets: The charsets we want to support and create data for.
+    :type charsets: list of str
+    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
+                    `True`, you only need to add those not in the ASCII set.
+                    Stored de-duplicated and sorted.
+    :type alphabet: str
+    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
+                            Wikipedia for training data.
+    :type wiki_start_pages: list of str
+    """
+
+    def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
+                 alphabet=None, wiki_start_pages=None):
+        """Store the metadata and normalise the alphabet.
+
+        When `use_ascii` is true, `string.ascii_letters` is appended to any
+        supplied `alphabet` (or used on its own when none is supplied).
+
+        :raises ValueError: if `use_ascii` is false and `alphabet` is empty
+                            or missing.
+        """
+        super(Language, self).__init__()
+        self.name = name
+        self.iso_code = iso_code
+        self.use_ascii = use_ascii
+        self.charsets = charsets
+        if self.use_ascii:
+            if alphabet:
+                alphabet += ascii_letters
+            else:
+                alphabet = ascii_letters
+        elif not alphabet:
+            raise ValueError('Must supply alphabet if use_ascii is False')
+        # `alphabet` is always non-empty here (the empty case either became
+        # ascii_letters or raised above), so no None fallback is needed.
+        # Collapse duplicates and store the characters in sorted order.
+        self.alphabet = ''.join(sorted(set(alphabet)))
+        self.wiki_start_pages = wiki_start_pages
+
+    def __repr__(self):
+        """Return ``ClassName(attr=value, ...)`` for the public attributes.
+
+        Attributes whose names start with an underscore are omitted; the
+        order is whatever iterating ``self.__dict__`` yields.
+        """
+        return '{}({})'.format(self.__class__.__name__,
+                               ', '.join('{}={!r}'.format(k, v)
+                                         for k, v in self.__dict__.items()
+                                         if not k.startswith('_')))
+
+
+# Maps the English language name to its Language metadata record.
+LANGUAGES = {
+    'Arabic': Language(
+        name='Arabic',
+        iso_code='ar',
+        use_ascii=False,
+        # We only support encodings that use isolated
+        # forms, because the current recommendation is
+        # that the rendering system handles presentation
+        # forms. This means we purposefully skip IBM864.
+        # NOTE(review): 'CP864' below appears to be the same code page as
+        # IBM864 (Python lists IBM864 as an alias of cp864), which
+        # contradicts the comment above -- confirm whether it belongs here.
+        charsets=['ISO-8859-6', 'WINDOWS-1256',
+                  'CP720', 'CP864'],
+        alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
+        wiki_start_pages=[u'الصفحة_الرئيسية'],
+    ),
+    'Belarusian': Language(
+        name='Belarusian',
+        iso_code='be',
+        use_ascii=False,
+        charsets=['ISO-8859-5', 'WINDOWS-1251',
+                  'IBM866', 'MacCyrillic'],
+        alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
+                  u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
+        wiki_start_pages=[u'Галоўная_старонка'],
+    ),
+    'Bulgarian': Language(
+        name='Bulgarian',
+        iso_code='bg',
+        use_ascii=False,
+        charsets=['ISO-8859-5', 'WINDOWS-1251',
+                  'IBM855'],
+        alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
+                  u'абвгдежзийклмнопрстуфхцчшщъьюя'),
+        wiki_start_pages=[u'Начална_страница'],
+    ),
+    'Czech': Language(
+        name='Czech',
+        # ISO 639-1 language code for Czech is 'cs'; 'cz' is the country
+        # code, not a language code.
+        iso_code='cs',
+        use_ascii=True,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
+        wiki_start_pages=[u'Hlavní_strana'],
+    ),
+    'Danish': Language(
+        name='Danish',
+        iso_code='da',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'ISO-8859-15',
+                  'WINDOWS-1252'],
+        alphabet=u'æøåÆØÅ',
+        wiki_start_pages=[u'Forside'],
+    ),
+    'German': Language(
+        name='German',
+        iso_code='de',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'WINDOWS-1252'],
+        alphabet=u'äöüßÄÖÜ',
+        wiki_start_pages=[u'Wikipedia:Hauptseite'],
+    ),
+    'Greek': Language(
+        name='Greek',
+        iso_code='el',
+        use_ascii=False,
+        charsets=['ISO-8859-7', 'WINDOWS-1253'],
+        alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
+                  u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
+        wiki_start_pages=[u'Πύλη:Κύρια'],
+    ),
+    'English': Language(
+        name='English',
+        iso_code='en',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'WINDOWS-1252'],
+        wiki_start_pages=[u'Main_Page'],
+    ),
+    'Esperanto': Language(
+        name='Esperanto',
+        iso_code='eo',
+        # Q, W, X, and Y not used at all
+        use_ascii=False,
+        charsets=['ISO-8859-3'],
+        alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
+                  u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
+        wiki_start_pages=[u'Vikipedio:Ĉefpaĝo'],
+    ),
+    'Spanish': Language(
+        name='Spanish',
+        iso_code='es',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'ISO-8859-15',
+                  'WINDOWS-1252'],
+        alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
+        wiki_start_pages=[u'Wikipedia:Portada'],
+    ),
+    'Estonian': Language(
+        name='Estonian',
+        iso_code='et',
+        use_ascii=False,
+        charsets=['ISO-8859-4', 'ISO-8859-13',
+                  'WINDOWS-1257'],
+        # C, F, Š, Q, W, X, Y, Z, Ž are only for
+        # loanwords
+        alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
+                  u'abdeghijklmnoprstuvõäöü'),
+        wiki_start_pages=[u'Esileht'],
+    ),
+    'Finnish': Language(
+        name='Finnish',
+        iso_code='fi',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'ISO-8859-15',
+                  'WINDOWS-1252'],
+        alphabet=u'ÅÄÖŠŽåäöšž',
+        wiki_start_pages=[u'Wikipedia:Etusivu'],
+    ),
+    'French': Language(
+        name='French',
+        iso_code='fr',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'ISO-8859-15',
+                  'WINDOWS-1252'],
+        alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
+        wiki_start_pages=[u'Wikipédia:Accueil_principal',
+                          u'Bœuf (animal)'],
+    ),
+    'Hebrew': Language(
+        name='Hebrew',
+        iso_code='he',
+        use_ascii=False,
+        charsets=['ISO-8859-8', 'WINDOWS-1255'],
+        alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
+        wiki_start_pages=[u'עמוד_ראשי'],
+    ),
+    'Croatian': Language(
+        name='Croatian',
+        iso_code='hr',
+        # Q, W, X, Y are only used for foreign words.
+        use_ascii=False,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
+                  u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
+        wiki_start_pages=[u'Glavna_stranica'],
+    ),
+    'Hungarian': Language(
+        name='Hungarian',
+        iso_code='hu',
+        # Q, W, X, Y are only used for foreign words.
+        use_ascii=False,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
+                  u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
+        wiki_start_pages=[u'Kezdőlap'],
+    ),
+    'Italian': Language(
+        name='Italian',
+        iso_code='it',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'ISO-8859-15',
+                  'WINDOWS-1252'],
+        alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
+        wiki_start_pages=[u'Pagina_principale'],
+    ),
+    'Lithuanian': Language(
+        name='Lithuanian',
+        iso_code='lt',
+        use_ascii=False,
+        charsets=['ISO-8859-13', 'WINDOWS-1257',
+                  'ISO-8859-4'],
+        # Q, W, and X not used at all
+        alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
+                  u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
+        wiki_start_pages=[u'Pagrindinis_puslapis'],
+    ),
+    'Latvian': Language(
+        name='Latvian',
+        iso_code='lv',
+        use_ascii=False,
+        charsets=['ISO-8859-13', 'WINDOWS-1257',
+                  'ISO-8859-4'],
+        # Q, W, X, Y are only for loanwords
+        alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
+                  u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
+        wiki_start_pages=[u'Sākumlapa'],
+    ),
+    'Macedonian': Language(
+        name='Macedonian',
+        iso_code='mk',
+        use_ascii=False,
+        charsets=['ISO-8859-5', 'WINDOWS-1251',
+                  'MacCyrillic', 'IBM855'],
+        alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
+                  u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
+        wiki_start_pages=[u'Главна_страница'],
+    ),
+    'Dutch': Language(
+        name='Dutch',
+        iso_code='nl',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'WINDOWS-1252'],
+        wiki_start_pages=[u'Hoofdpagina'],
+    ),
+    'Polish': Language(
+        name='Polish',
+        iso_code='pl',
+        # Q and X are only used for foreign words.
+        use_ascii=False,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
+                  u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
+        wiki_start_pages=[u'Wikipedia:Strona_główna'],
+    ),
+    'Portuguese': Language(
+        name='Portuguese',
+        iso_code='pt',
+        use_ascii=True,
+        charsets=['ISO-8859-1', 'ISO-8859-15',
+                  'WINDOWS-1252'],
+        alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
+        wiki_start_pages=[u'Wikipédia:Página_principal'],
+    ),
+    'Romanian': Language(
+        name='Romanian',
+        iso_code='ro',
+        use_ascii=True,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=u'ăâîșțĂÂÎȘȚ',
+        wiki_start_pages=[u'Pagina_principală'],
+    ),
+    'Russian': Language(
+        name='Russian',
+        iso_code='ru',
+        use_ascii=False,
+        charsets=['ISO-8859-5', 'WINDOWS-1251',
+                  'KOI8-R', 'MacCyrillic', 'IBM866',
+                  'IBM855'],
+        alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
+                  u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
+        wiki_start_pages=[u'Заглавная_страница'],
+    ),
+    'Slovak': Language(
+        name='Slovak',
+        iso_code='sk',
+        use_ascii=True,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
+        wiki_start_pages=[u'Hlavná_stránka'],
+    ),
+    'Slovene': Language(
+        name='Slovene',
+        iso_code='sl',
+        # Q, W, X, Y are only used for foreign words.
+        use_ascii=False,
+        charsets=['ISO-8859-2', 'WINDOWS-1250'],
+        alphabet=(u'abcčdefghijklmnoprsštuvzž'
+                  u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
+        wiki_start_pages=[u'Glavna_stran'],
+    ),
+    # Serbian can be written in both Latin and Cyrillic, but there's no
+    # simple way to get the Latin alphabet pages from Wikipedia through
+    # the API, so for now we just support Cyrillic.
+    # NOTE(review): `use_ascii` is not passed here, so the Language default
+    # applies, unlike the other Cyrillic entries which pass
+    # use_ascii=False -- confirm this is intended.
+    'Serbian': Language(
+        name='Serbian',
+        iso_code='sr',
+        alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
+                  u'абвгдђежзијклљмнњопрстћуфхцчџш'),
+        charsets=['ISO-8859-5', 'WINDOWS-1251',
+                  'MacCyrillic', 'IBM855'],
+        wiki_start_pages=[u'Главна_страна'],
+    ),
+    'Thai': Language(
+        name='Thai',
+        iso_code='th',
+        use_ascii=False,
+        charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
+        alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
+        wiki_start_pages=[u'หน้าหลัก'],
+    ),
+    'Turkish': Language(
+        name='Turkish',
+        iso_code='tr',
+        # Q, W, and X are not used by Turkish
+        use_ascii=False,
+        charsets=['ISO-8859-3', 'ISO-8859-9',
+                  'WINDOWS-1254'],
+        alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
+                  u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
+        wiki_start_pages=[u'Ana_Sayfa'],
+    ),
+    'Vietnamese': Language(
+        name='Vietnamese',
+        iso_code='vi',
+        use_ascii=False,
+        # Windows-1258 is the only common 8-bit
+        # Vietnamese encoding supported by Python.
+        # From Wikipedia:
+        # For systems that lack support for Unicode,
+        # dozens of 8-bit Vietnamese code pages are
+        # available.[1] The most common are VISCII
+        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
+        # Where ASCII is required, such as when
+        # ensuring readability in plain text e-mail,
+        # Vietnamese letters are often encoded
+        # according to Vietnamese Quoted-Readable
+        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
+        # though usage of either variable-width
+        # scheme has declined dramatically following
+        # the adoption of Unicode on the World Wide
+        # Web.
+        charsets=['WINDOWS-1258'],
+        alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
+                  u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
+        wiki_start_pages=[u'Chữ_Quốc_ngữ'],
+    ),
+}