summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/data/translit/Arab_Latn.txt
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/data/translit/Arab_Latn.txt')
-rw-r--r--intl/icu/source/data/translit/Arab_Latn.txt186
1 files changed, 186 insertions, 0 deletions
diff --git a/intl/icu/source/data/translit/Arab_Latn.txt b/intl/icu/source/data/translit/Arab_Latn.txt
new file mode 100644
index 0000000000..0c002516ba
--- /dev/null
+++ b/intl/icu/source/data/translit/Arab_Latn.txt
@@ -0,0 +1,186 @@
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+#
+# File: Arab_Latn.txt
+# Generated from CLDR
+#
+
+# Generally follows UNGEGN
+# http://www.eki.ee/wgrs/rom1_ar.pdf
+# Occasionally deviates in the direction of ISO 233
+# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
+# a) where required for disambiguation.
+# b) with underdot instead of cedilla for letters like SAD,
+# since those are explicitly in Unicode for transliteration.
+# c) with extra non-Arabic-language letters, like PEH
+#
+# Does *not* do assimilation of "al", nor hyphenation.
+# While it could be done, we need to determine whether a prefix "al" could
+# occur other than as the definite article (since no space is used).
+:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ;
+:: NFKD (NFC); # forward: decompose the input to NFKD before the rules; reverse: recompose to NFC
+$disambig = \u0331 ; # COMBINING MACRON BELOW, appended to Latin output that would otherwise collide with another rule
+$disambig2 = \u0330 ; # COMBINING TILDE BELOW, a second disambiguator (used for FARSI YEH)
+$under = \u0323 ; # COMBINING DOT BELOW (the underdot of note b in the header)
+$descender = ˌ; # U+02CC MODIFIER LETTER LOW VERTICAL LINE
+$notAbove = [[:^ccc=0:] & [:^ccc=230:]]; # marks whose canonical combining class is neither 0 nor 230 (Above)
+# non-letters; a separator between two digits maps to a bare ',' or '.', anywhere else it also carries $disambig
+[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR (digit on both sides)
+[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR (digit on both sides)
+٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR (any other position)
+٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR (any other position)
+# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
+، ↔ ',' ; # ARABIC COMMA
+؛ ↔ ';' ; # ARABIC SEMICOLON
+؟ ↔ '?' ; # ARABIC QUESTION MARK
+٪ ↔ '%' ; # ARABIC PERCENT SIGN
+۰ ↔ 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
+۱ ↔ 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
+۲ ↔ 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
+۳ ↔ 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
+۴ ↔ 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
+۵ ↔ 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
+۶ ↔ 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
+۷ ↔ 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
+۸ ↔ 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
+۹ ↔ 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
+٠ ↔ 0 ; # ARABIC-INDIC DIGIT ZERO
+١ ↔ 1 ; # ARABIC-INDIC DIGIT ONE
+٢ ↔ 2 ; # ARABIC-INDIC DIGIT TWO
+٣ ↔ 3 ; # ARABIC-INDIC DIGIT THREE
+٤ ↔ 4 ; # ARABIC-INDIC DIGIT FOUR
+٥ ↔ 5 ; # ARABIC-INDIC DIGIT FIVE
+٦ ↔ 6 ; # ARABIC-INDIC DIGIT SIX
+٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
+٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
+٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
+؉ ↔ ‰ ; # U+0609 ARABIC-INDIC PER MILLE SIGN
+؊ ↔ ‱ ; # U+060A ARABIC-INDIC PER TEN THOUSAND SIGN
+‎۔‎ ↔ '.' ; # U+06D4 ARABIC FULL STOP
+# letters
+# long vowels: short-vowel mark + letter ↔ Latin vowel + U+0304 COMBINING MACRON
+\u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF
+\u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW
+\u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH
+# rules with a longer Latin side (digraphs, base + underdot) come before the plain single-letter rules (e.g. TEH ↔ t, SEEN ↔ s) so the shorter rule cannot mask them going Latin → Arabic
+ث ↔ t h $disambig ; # ARABIC LETTER THEH
+ذ ↔ d h $disambig ; # ARABIC LETTER THAL
+ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
+ص ↔ s $under ; # ARABIC LETTER SAD
+ض ↔ d $under ; # ARABIC LETTER DAD
+ط ↔ t $under ; # ARABIC LETTER TAH
+ظ ↔ z $under ; # ARABIC LETTER ZAH
+غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
+# WARNING: special case
+# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
+# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
+# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
+ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA
+ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA (reverse only; the marks captured as $1 are kept after the cursor and reprocessed)
+# non-Arabic language
+ژ ↔ z h $disambig ; # ARABIC LETTER JEH
+ڭ ↔ n $disambig g ; # ARABIC LETTER NG
+ۋ ↔ v $disambig ; # ARABIC LETTER VE
+ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
+ښ ↔ s $descender; # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
+# Arabic language: one rule per letter; $under / $disambig keep letters that share a Latin base distinct
+ء ↔ ʾ ; # ARABIC LETTER HAMZA
+ا ↔ a $under; # ARABIC LETTER ALEF
+ب ↔ b ; # ARABIC LETTER BEH
+ت ↔ t ; # ARABIC LETTER TEH
+ج ↔ j ; # ARABIC LETTER JEEM
+ح ↔ h $under ; # ARABIC LETTER HAH
+خ ↔ k h $disambig ; # ARABIC LETTER KHAH
+د ↔ d ; # ARABIC LETTER DAL
+ر ↔ r ; # ARABIC LETTER REH
+ز ↔ z ; # ARABIC LETTER ZAIN
+س ↔ s ; # ARABIC LETTER SEEN
+ع ↔ ʿ ; # ARABIC LETTER AIN
+ـ → ; # ARABIC TATWEEL (forward only: removed from the output)
+ف ↔ f ; # ARABIC LETTER FEH
+ق ↔ q ; # ARABIC LETTER QAF
+ک ↔ k $disambig ; # ARABIC LETTER KEHEH
+ك ↔ k ; # ARABIC LETTER KAF
+ل ↔ l ; # ARABIC LETTER LAM
+م ↔ m ; # ARABIC LETTER MEEM
+ن ↔ n ; # ARABIC LETTER NOON
+ه ↔ h ; # ARABIC LETTER HEH
+و ↔ w ; # ARABIC LETTER WAW
+ى ↔ y $disambig ; # ARABIC LETTER ALEF MAKSURA
+ي ↔ y ; # ARABIC LETTER YEH
+\u064B ↔ aⁿ ; # ARABIC FATHATAN ↔ a + U+207F SUPERSCRIPT LATIN SMALL LETTER N
+\u064C ↔ uⁿ ; # ARABIC DAMMATAN ↔ u + U+207F SUPERSCRIPT LATIN SMALL LETTER N
+\u064D ↔ iⁿ ; # ARABIC KASRATAN ↔ i + U+207F SUPERSCRIPT LATIN SMALL LETTER N
+\u064E ↔ a ; # ARABIC FATHA
+\u064F ↔ u ; # ARABIC DAMMA
+\u0650 ↔ i ; # ARABIC KASRA
+\u0651 ↔ \u0303 ; # ARABIC SHADDA ↔ COMBINING TILDE
+\u0652 ↔ \u030A ; # ARABIC SUKUN ↔ COMBINING RING ABOVE
+# special combining marks
+\u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE ↔ COMBINING CIRCUMFLEX ACCENT
+\u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE ↔ COMBINING HOOK ABOVE
+\u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW ↔ COMBINING RIGHT HALF RING BELOW
+# Some letters used by languages other than Arabic (not in UNGEGN)
+پ ↔ p ; # ARABIC LETTER PEH
+چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
+ڤ ↔ v ; # ARABIC LETTER VEH
+# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
+# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
+گ ↔ g ; # ARABIC LETTER GAF
+# fallbacks (one-way, Arabic → Latin only). TODO: round-trip where possible, using diacritics to distinguish
+#https://en.wikipedia.org/wiki/Sindhi_transliteration
+‎ٺ‎→ṭh;
+‎ٿ‎→th;
+‎ٽ‎→ṭ;
+‎ڙ‎→ṛ;
+‎ڦ‎→ph;
+‎ڻ‎→ṇ;
+‎ڱ‎→ṅ;
+‎ڃ‎→ñ;
+‎ڪ‎→k;
+‎ڄ‎→j\u0308;
+‎ۃ‎→ẖ;
+‎ڳ‎→g\u0324;
+‎ڍ‎→ḍh;
+‎ڌ‎→dh;
+‎ڏ‎→d\u0324;
+‎ڊ‎→ḍ;
+‎ڇ‎→ch;
+‎ڀ‎→bh;
+‎ٻ‎→ḇ;
+‎۽‎→'&';
+‎۾‎→'mn';
+#https://en.wiktionary.org/wiki/Wiktionary:Urdu_transliteration
+‎ھ‎ → ʱ ;
+‎ں‎ → ◌\u0303 ;
+‎ے‎ → ai ;
+‎ڈ‎ → ḍ ;
+‎ڑ‎ → ṛ ;
+‎ٹ‎ → ṭ ;
+#https://www.eki.ee/wgrs/rom2_ps.htm
+#https://en.wikipedia.org/wiki/Pashto_alphabet
+‎ټ‎ → ṯ ;
+‎ځ‎ → dz ;
+‎څ‎ → ts ;
+‎ډ‎ → ḏ ;
+‎ړ‎ → ṟ ;
+‎ږ‎ → z\u035Fh ;
+‎ګ‎ → g ;
+‎ڼ‎ → ṉ ;
+‎ۍ‎ → ạy ;
+‎ې‎ → e ;
+#https://www.eki.ee/wgrs/rom1_ug.pdf
+‎ہ‎ → ḥ ;
+‎ە‎ → ĥ ;
+# fallbacks (Latin → Arabic only): the Latin letter is rewritten and the cursor is left before the replacement so it is reprocessed
+| s ← c } [eiy]; # c before e, i or y is rewritten to s
+| k ← c ; # any other c is rewritten to k
+| i ← e ; # e is rewritten to i
+| u ← o ; # o is rewritten to u
+| ks ← x ; # x is rewritten to k + s
+| n ← ‎ⁿ;
+:: (lower) ; # reverse only: lowercase the Latin input
+::NFC (NFD); # forward: recompose the output to NFC; reverse: decompose the Latin input to NFD
+:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] ); # reverse filter: only Latin letters and the listed punctuation, digits and marks are processed going Latin → Arabic
+