diff options
Diffstat (limited to 'intl/icu/source/data/translit/Arab_Latn.txt')
-rw-r--r-- | intl/icu/source/data/translit/Arab_Latn.txt | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/intl/icu/source/data/translit/Arab_Latn.txt b/intl/icu/source/data/translit/Arab_Latn.txt
new file mode 100644
index 0000000000..0c002516ba
--- /dev/null
+++ b/intl/icu/source/data/translit/Arab_Latn.txt
@@ -0,0 +1,186 @@
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+#
+# File: Arab_Latn.txt
+# Generated from CLDR
+#
+
+# Generally follows UNGEGN
+# http://www.eki.ee/wgrs/rom1_ar.pdf
+# Occasionally deviates in the direction of ISO 233
+# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
+# a) where required for disambiguation.
+# b) with underdot instead of cedilla for letters like SAD,
+# since those are explicitly in Unicode for transliteration.
+# c) with extra non-Arabic-language letters, like PEH
+#
+# Does *not* do assimilation of "al", nor hyphenation.
+# While it could be done, we need to determine whether a prefix "al" could
+# occur other than as the definite article (since no space is used).
+:: [[:Arabic:][:block=ARABIC:][ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ; # forward-direction global filter: only characters in this set are touched when going Arabic → Latin
+:: NFKD (NFC); # forward: normalize to NFKD before the rules run; reverse: normalize to NFC after the rules run
+$disambig = \u0331 ; # COMBINING MACRON BELOW: added on the Latin side when the plain spelling already belongs to another letter
+$disambig2 = \u0330 ; # COMBINING TILDE BELOW: second disambiguator (FARSI YEH, because y + $disambig is ALEF MAKSURA)
+$under = \u0323 ; # COMBINING DOT BELOW (the "underdot" mentioned in the header)
+$descender = ˌ; # MODIFIER LETTER LOW VERTICAL LINE
+$notAbove = [[:^ccc=0:] & [:^ccc=230:]]; # combining marks whose canonical combining class is neither 0 nor 230 (Above)
+# non-letters
+[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR between two decimal digits: plain ',' (the [:Nd:] outside the braces is context only)
+[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR between two decimal digits: plain '.' (the [:Nd:] outside the braces is context only)
+٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR elsewhere: marked so it stays distinct from ARABIC COMMA
+٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR elsewhere: marked so it stays distinct from ARABIC FULL STOP
+# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
+، ↔ ',' ; # ARABIC COMMA
+؛ ↔ ';' ; # ARABIC SEMICOLON
+؟ ↔ '?' ; # ARABIC QUESTION MARK
+٪ ↔ '%' ; # ARABIC PERCENT SIGN
+۰ ↔ 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
+۱ ↔ 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
+۲ ↔ 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
+۳ ↔ 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
+۴ ↔ 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
+۵ ↔ 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
+۶ ↔ 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
+۷ ↔ 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
+۸ ↔ 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
+۹ ↔ 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
+٠ ↔ 0 ; # ARABIC-INDIC DIGIT ZERO
+١ ↔ 1 ; # ARABIC-INDIC DIGIT ONE
+٢ ↔ 2 ; # ARABIC-INDIC DIGIT TWO
+٣ ↔ 3 ; # ARABIC-INDIC DIGIT THREE
+٤ ↔ 4 ; # ARABIC-INDIC DIGIT FOUR
+٥ ↔ 5 ; # ARABIC-INDIC DIGIT FIVE
+٦ ↔ 6 ; # ARABIC-INDIC DIGIT SIX
+٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
+٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
+٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
+؉ ↔ ‰ ; # U+0609 ARABIC-INDIC PER MILLE SIGN
+؊ ↔ ‱ ; # U+060A ARABIC-INDIC PER TEN THOUSAND SIGN
+۔ ↔ '.' ; # U+06D4 ARABIC FULL STOP
+# letters
+# long vowels
+\u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF
+\u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW
+\u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH
+# longer items moved here to prevent masking (e.g. t h $disambig must be tried before the plain t rule further down)
+ث ↔ t h $disambig ; # ARABIC LETTER THEH
+ذ ↔ d h $disambig ; # ARABIC LETTER THAL
+ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
+ص ↔ s $under ; # ARABIC LETTER SAD
+ض ↔ d $under ; # ARABIC LETTER DAD
+ط ↔ t $under ; # ARABIC LETTER TAH
+ظ ↔ z $under ; # ARABIC LETTER ZAH
+غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
+# WARNING: special case
+# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
+# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
+# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
+ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA
+ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA (reverse only: the marks captured as $1 are re-emitted after the cursor so the later rules still convert them)
+# non-Arabic language
+ژ ↔ z h $disambig ; # ARABIC LETTER JEH
+ڭ ↔ n $disambig g ; # ARABIC LETTER NG
+ۋ ↔ v $disambig ; # ARABIC LETTER VE
+ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
+ښ ↔ s $descender; # U+069A ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
+# Arabic language
+ء ↔ ʾ ; # ARABIC LETTER HAMZA
+ا ↔ a $under; # ARABIC LETTER ALEF
+ب ↔ b ; # ARABIC LETTER BEH
+ت ↔ t ; # ARABIC LETTER TEH
+ج ↔ j ; # ARABIC LETTER JEEM
+ح ↔ h $under ; # ARABIC LETTER HAH
+خ ↔ k h $disambig ; # ARABIC LETTER KHAH
+د ↔ d ; # ARABIC LETTER DAL
+ر ↔ r ; # ARABIC LETTER REH
+ز ↔ z ; # ARABIC LETTER ZAIN
+س ↔ s ; # ARABIC LETTER SEEN
+ع ↔ ʿ ; # ARABIC LETTER AIN
+ـ → ; # ARABIC TATWEEL (forward only: removed from the output)
+ف ↔ f ; # ARABIC LETTER FEH
+ق ↔ q ; # ARABIC LETTER QAF
+ک ↔ k $disambig ; # ARABIC LETTER KEHEH
+ك ↔ k ; # ARABIC LETTER KAF
+ل ↔ l ; # ARABIC LETTER LAM
+م ↔ m ; # ARABIC LETTER MEEM
+ن ↔ n ; # ARABIC LETTER NOON
+ه ↔ h ; # ARABIC LETTER HEH
+و ↔ w ; # ARABIC LETTER WAW
+ى ↔ y $disambig ; # ARABIC LETTER ALEF MAKSURA
+ي ↔ y ; # ARABIC LETTER YEH
+\u064B ↔ aⁿ ; # ARABIC FATHATAN
+\u064C ↔ uⁿ ; # ARABIC DAMMATAN
+\u064D ↔ iⁿ ; # ARABIC KASRATAN
+\u064E ↔ a ; # ARABIC FATHA
+\u064F ↔ u ; # ARABIC DAMMA
+\u0650 ↔ i ; # ARABIC KASRA
+\u0651 ↔ \u0303 ; # ARABIC SHADDA
+\u0652 ↔ \u030A ; # ARABIC SUKUN
+# special combining marks
+\u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE
+\u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE
+\u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW
+# Some non-Arabic-language letters (not in UNGEGN)
+پ ↔ p ; # ARABIC LETTER PEH
+چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
+ڤ ↔ v ; # ARABIC LETTER VEH
+# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
+# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
+گ ↔ g ; # ARABIC LETTER GAF
+# fallbacks TODO roundtrip where possible, using diacritics to distinguish (the → rules below apply in the forward direction only)
+#https://en.wikipedia.org/wiki/Sindhi_transliteration
+ٺ→ṭh;
+ٿ→th;
+ٽ→ṭ;
+ڙ→ṛ;
+ڦ→ph;
+ڻ→ṇ;
+ڱ→ṅ;
+ڃ→ñ;
+ڪ→k;
+ڄ→j\u0308;
+ۃ→ẖ;
+ڳ→g\u0324;
+ڍ→ḍh;
+ڌ→dh;
+ڏ→d\u0324;
+ڊ→ḍ;
+ڇ→ch;
+ڀ→bh;
+ٻ→ḇ;
+۽→'&';
+۾→'mn';
+#https://en.wiktionary.org/wiki/Wiktionary:Urdu_transliteration
+ھ → ʱ ;
+ں → ◌\u0303 ; # NOTE(review): the output appears to contain a literal DOTTED CIRCLE before the combining tilde - confirm this is intended
+ے → ai ;
+ڈ → ḍ ;
+ڑ → ṛ ;
+ٹ → ṭ ;
+#https://www.eki.ee/wgrs/rom2_ps.htm
+#https://en.wikipedia.org/wiki/Pashto_alphabet
+ټ → ṯ ;
+ځ → dz ;
+څ → ts ;
+ډ → ḏ ;
+ړ → ṟ ;
+ږ → z\u035Fh ;
+ګ → g ;
+ڼ → ṉ ;
+ۍ → ạy ;
+ې → e ;
+#https://www.eki.ee/wgrs/rom1_ug.pdf
+ہ → ḥ ;
+ە → ĥ ;
+# fallbacks (reverse direction only): Latin letters with no rule of their own are rewritten to ones that have; the leading | leaves the cursor before the replacement so it is then transliterated
+| s ← c } [eiy];
+| k ← c ;
+| i ← e ;
+| u ← o ;
+| ks ← x ;
+| n ← ⁿ;
+:: (lower) ; # reverse direction only: lowercase the Latin text before the rules run
+::NFC (NFD); # forward: normalize to NFC after the rules run; reverse: normalize to NFD before the rules run
+:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] ); # reverse-direction global filter: only Latin letters and the listed punctuation, digits and marks are touched (';' is listed twice)
+
|