1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
|
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
#
# File: Arab_Latn.txt
# Generated from CLDR
#
# Generally follows UNGEGN
# http://www.eki.ee/wgrs/rom1_ar.pdf
# Occasionally deviates in the direction of ISO 233
# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
# a) where required for disambiguation.
# b) with underdot instead of cedilla for letters like SAD,
# since those combinations are explicitly in Unicode for transliteration.
# c) with extra non-Arabic-language letters, like PEH
#
# Does *not* do assimilation of "al", nor hyphenation.
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).
# Forward (Arabic → Latin) global filter: only characters in this set are
# candidates for the rules below; anything else is passed through unchanged.
:: [[:Arabic:][:block=ARABIC:][ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ;
# Forward direction: decompose with NFKD before the rules run.
# The parenthesised NFC is the step applied in the reverse (Latin → Arabic) direction.
:: NFKD (NFC);
# Marks appended to Latin output so that Arabic letters which would otherwise
# share a romanization stay distinct.
$disambig = \u0331 ; # U+0331 COMBINING MACRON BELOW
$disambig2 = \u0330 ; # U+0330 COMBINING TILDE BELOW (used below for FARSI YEH)
$under = \u0323 ; # U+0323 COMBINING DOT BELOW
# Spacing mark written after the base letter (used below for ښ).
$descender = ˌ;
# Characters whose canonical combining class is neither 0 nor 230; used by the
# reverse TEH MARBUTA rule to skip marks that sort between "t" and U+0308.
$notAbove = [[:^ccc=0:] & [:^ccc=230:]];
# non-letters
# Punctuation, digits and numeric signs.
# The first two rules use the context syntax  before{match}after : the separator
# is mapped to a plain ',' / '.' only when it stands between two decimal digits.
# They must stay ahead of the context-free separator rules that follow, which
# add $disambig to the Latin side.
[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR
[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR
٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR
٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR
# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
، ↔ ',' ; # ARABIC COMMA
؛ ↔ ';' ; # ARABIC SEMICOLON
؟ ↔ '?' ; # ARABIC QUESTION MARK
٪ ↔ '%' ; # ARABIC PERCENT SIGN
# Extended Arabic-Indic digits carry $disambig on the Latin side so they stay
# distinct from the Arabic-Indic digits below, which map to bare ASCII digits.
۰ ↔ 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
۱ ↔ 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
۲ ↔ 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
۳ ↔ 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
۴ ↔ 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
۵ ↔ 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
۶ ↔ 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
۷ ↔ 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
۸ ↔ 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
۹ ↔ 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
٠ ↔ 0 ; # ARABIC-INDIC DIGIT ZERO
١ ↔ 1 ; # ARABIC-INDIC DIGIT ONE
٢ ↔ 2 ; # ARABIC-INDIC DIGIT TWO
٣ ↔ 3 ; # ARABIC-INDIC DIGIT THREE
٤ ↔ 4 ; # ARABIC-INDIC DIGIT FOUR
٥ ↔ 5 ; # ARABIC-INDIC DIGIT FIVE
٦ ↔ 6 ; # ARABIC-INDIC DIGIT SIX
٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
؉ ↔ ‰ ; # U+0609 ARABIC-INDIC PER MILLE SIGN
؊ ↔ ‱ ; # U+060A ARABIC-INDIC PER TEN THOUSAND SIGN
۔ ↔ '.' ; # U+06D4 ARABIC FULL STOP
# letters
# long vowels
# A short-vowel mark followed by its matching letter becomes a single Latin
# vowel with a macron (U+0304). These two-character sources are listed before
# the single-character rules for the same marks and letters further down.
\u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF
\u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW
\u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH
# longer items moved here to prevent masking
# (multi-character Latin forms are listed ahead of the plain t / d / s / z / g
# rules that appear later in the file, so the shorter rules cannot match first)
ث ↔ t h $disambig ; # ARABIC LETTER THEH
ذ ↔ d h $disambig ; # ARABIC LETTER THAL
ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
ص ↔ s $under ; # ARABIC LETTER SAD
ض ↔ d $under ; # ARABIC LETTER DAD
ط ↔ t $under ; # ARABIC LETTER TAH
ظ ↔ z $under ; # ARABIC LETTER ZAH
غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
# WARNING: special case
# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA
# Reverse-only: "t", one or more $notAbove marks (captured as $1), then U+0308.
# The result is TEH MARBUTA followed by the captured marks; the "|" places the
# cursor before $1 so those marks are then processed by the remaining rules.
ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA
# non-Arabic language
# Letters used by other languages written in Arabic script. Each Latin form
# carries a disambiguating mark so it does not collide with the rules for
# z / n g / v / y / s elsewhere in this file.
ژ ↔ z h $disambig ; # ARABIC LETTER JEH
ڭ ↔ n $disambig g ; # ARABIC LETTER NG
ۋ ↔ v $disambig ; # ARABIC LETTER VE
ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
ښ ↔ s $descender; # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
# Arabic language
# Single-letter and single-mark mappings. Letters whose multi-character or
# dotted Latin forms could be masked by these rules are handled earlier in the file.
ء ↔ ʾ ; # ARABIC LETTER HAMZA
ا ↔ a $under; # ARABIC LETTER ALEF
ب ↔ b ; # ARABIC LETTER BEH
ت ↔ t ; # ARABIC LETTER TEH
ج ↔ j ; # ARABIC LETTER JEEM
ح ↔ h $under ; # ARABIC LETTER HAH
خ ↔ k h $disambig ; # ARABIC LETTER KHAH
د ↔ d ; # ARABIC LETTER DAL
ر ↔ r ; # ARABIC LETTER REH
ز ↔ z ; # ARABIC LETTER ZAIN
س ↔ s ; # ARABIC LETTER SEEN
ع ↔ ʿ ; # ARABIC LETTER AIN
# Forward-only rule with an empty result: tatweel is dropped from the Latin output.
ـ → ; # ARABIC TATWEEL
ف ↔ f ; # ARABIC LETTER FEH
ق ↔ q ; # ARABIC LETTER QAF
ک ↔ k $disambig ; # ARABIC LETTER KEHEH
ك ↔ k ; # ARABIC LETTER KAF
ل ↔ l ; # ARABIC LETTER LAM
م ↔ m ; # ARABIC LETTER MEEM
ن ↔ n ; # ARABIC LETTER NOON
ه ↔ h ; # ARABIC LETTER HEH
و ↔ w ; # ARABIC LETTER WAW
ى ↔ y $disambig ; # ARABIC LETTER ALEF MAKSURA
ي ↔ y ; # ARABIC LETTER YEH
# Tanween marks map to a vowel plus superscript n; the plain short-vowel rules follow.
\u064B ↔ aⁿ ; # ARABIC FATHATAN
\u064C ↔ uⁿ ; # ARABIC DAMMATAN
\u064D ↔ iⁿ ; # ARABIC KASRATAN
\u064E ↔ a ; # ARABIC FATHA
\u064F ↔ u ; # ARABIC DAMMA
\u0650 ↔ i ; # ARABIC KASRA
# Shadda and sukun map to Latin combining marks (U+0303, U+030A) rather than letters.
\u0651 ↔ \u0303 ; # ARABIC SHADDA
\u0652 ↔ \u030A ; # ARABIC SUKUN
# special combining marks
# Each Arabic combining mark maps one-to-one to a Latin combining mark.
\u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE
\u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE
\u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW
# Some non-Arabic language (not in UNGEGN)
پ ↔ p ; # ARABIC LETTER PEH
چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
ڤ ↔ v ; # ARABIC LETTER VEH
# The two FEH variants below are disabled (commented out) in this rule set.
# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
گ ↔ g ; # ARABIC LETTER GAF
# fallbacks TODO roundtrip where possible, using diacritics to distinguish
# All rules in this section use "→": they apply in the Arabic → Latin direction
# only and define no reverse mapping, so their output does not round-trip.
#https://en.wikipedia.org/wiki/Sindhi_transliteration
ٺ→ṭh;
ٿ→th;
ٽ→ṭ;
ڙ→ṛ;
ڦ→ph;
ڻ→ṇ;
ڱ→ṅ;
ڃ→ñ;
ڪ→k;
ڄ→j\u0308;
ۃ→ẖ;
ڳ→g\u0324;
ڍ→ḍh;
ڌ→dh;
ڏ→d\u0324;
ڊ→ḍ;
ڇ→ch;
ڀ→bh;
ٻ→ḇ;
۽→'&';
۾→'mn';
#https://en.wiktionary.org/wiki/Wiktionary:Urdu_transliteration
ھ → ʱ ;
# NOTE(review): the output below contains a literal dotted circle (◌) before the
# combining tilde — confirm that the placeholder character is intended.
ں → ◌\u0303 ;
ے → ai ;
ڈ → ḍ ;
ڑ → ṛ ;
ٹ → ṭ ;
#https://www.eki.ee/wgrs/rom2_ps.htm
#https://en.wikipedia.org/wiki/Pashto_alphabet
ټ → ṯ ;
ځ → dz ;
څ → ts ;
ډ → ḏ ;
ړ → ṟ ;
ږ → z\u035Fh ;
ګ → g ;
ڼ → ṉ ;
ۍ → ạy ;
ې → e ;
#https://www.eki.ee/wgrs/rom1_ug.pdf
ہ → ḥ ;
ە → ĥ ;
# fallbacks
# Reverse-only (Latin → Arabic) rewrites for Latin letters that have no rule of
# their own above. Each one replaces the letter with another Latin string and,
# via the leading "|", leaves the cursor before the replacement so the regular
# rules then convert it.
# "c" becomes "s" only when followed by e, i or y ("}" introduces the trailing
# context); this rule must stay ahead of the general c → k rule.
| s ← c } [eiy];
| k ← c ;
| i ← e ;
| u ← o ;
| ks ← x ;
| n ← ⁿ;
# Reverse direction only: lowercase the Latin input (no forward step is given).
:: (lower) ;
# Forward: compose to NFC after the rules. Reverse: decompose to NFD.
::NFC (NFD);
# Reverse global filter: only characters in this set are candidates for
# Latin → Arabic conversion.
# NOTE(review): ‰ and ‱ (the Latin sides of the per-mille / per-ten-thousand
# rules) are not listed explicitly here; unless [:Latin:] covers them, the
# reverse of those rules cannot fire — confirm. Also, ';' appears twice in the
# set, which looks redundant.
:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] );
|