summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/data/translit/Zawgyi_my.txt
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/data/translit/Zawgyi_my.txt')
-rw-r--r--intl/icu/source/data/translit/Zawgyi_my.txt238
1 files changed, 238 insertions, 0 deletions
diff --git a/intl/icu/source/data/translit/Zawgyi_my.txt b/intl/icu/source/data/translit/Zawgyi_my.txt
new file mode 100644
index 0000000000..665d83da4e
--- /dev/null
+++ b/intl/icu/source/data/translit/Zawgyi_my.txt
@@ -0,0 +1,238 @@
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+#
+# File: Zawgyi_my.txt
+# Generated from CLDR
+#
+
+# This transform converts Zawgyi "encoded" Burmese into proper
+# unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
+# the Myanmar unicode range but assigns different characters or
+# glyphs to some codepoints. In addition to the character mapping,
+# there is reordering of codepoints needed to match the expected
+# unicode order. This reordering is context-based.
+#
+# This transform is done in two main stages:
+# (1) Map all Zawgyi codepoints to their Unicode counterpart.
+# (2) Perform reordering.
+# Modern Burmese digits & Unicode code points.
+$nondigits = [^\u1040-\u1049];
+$consonant = [\u1000-\u1021];
+$vowelsign = [\u102B-\u1030\u1032]; # Unicode vowel signs except E (1031)
+$vowelsAndConsonants = [\u1000-\u102a];
+$umedial = [\u103B-\u103E]; # Medial codepoints in Unicode
+$vowelmedial = [\u102B-\u1030\u1032\1u36\u1037\u103A-\u103F]; # Union of vowel signs and medials
+$ukinzi = \u1004\u103A\u1039; # Codepoints representing kinzi in Unicode
+# Zawgyi medial ra has multiple representations
+$zmedialra = [\u103B\u107E-\u1084];
+$wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff];
+####
+#### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
+####
+# Kinzi (predefined ligatures)
+# Move base character to the right
+($consonant) \u103A \u1064 → $ukinzi $1 \u103B;
+($consonant) \u1064 → $ukinzi $1;
+\u1064 → $ukinzi;
+# Special cases moving base character to right before vowel signs
+($consonant) \u108B → $ukinzi $1 \u102D;
+($consonant) \u108C → $ukinzi $1 \u102E;
+($consonant) \u108D → $ukinzi $1 \u1036;
+# Special cases moving Kinzi block to left
+($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
+($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
+($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
+($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
+($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;
+\u108B → $ukinzi \u102D ;
+\u108C → $ukinzi \u102E ;
+\u108D → $ukinzi \u1036 ;
+# Consonants (only the ones that have to change)
+\u106A → \u1009 ; # NYA
+\u106B → \u100A ;
+\u108F → \u1014 ;
+\u1090 → \u101B ;
+\u1086 → \u103F ;
+# yapin
+[\u103A\u107d] → \u103B ;
+# yayit
+($zmedialra)+ → \u103C ;
+# wasway
+\u103C* \u108A → \u103D \u103E; # To avoid duplicate medials
+\u103C → \u103D ;
+# hatoh
+[\u103D\u1087] → \u103E ;
+\u1088 → \u103E \u102F ;
+\u1089 → \u103E \u1030 ;
+# Vowels
+\u1033 → \u102F ;
+\u1034 → \u1030 ;
+# asat
+\u1039 → \u103A ;
+# lower dot
+[\u1094\u1095] → \u1037 ;
+# Special cases for 1025 vs 1009;
+\u1025 \u1039 → \u1009 \u103a;
+\u1025 \u1061 → \u1009 \u1039 \u1001;
+\u1025 \u1062 → \u1009 \u1039 \u1002;
+\u1025 \u1065 → \u1009 \u1039 \u1005;
+\u1025 \u1068 → \u1009 \u1039 \u1007;
+\u1025 \u1076 → \u1009 \u1039 \u1013;
+\u1025 \u1078 → \u1009 \u1039 \u1015;
+\u1025 \u107A → \u1009 \u1039 \u1017;
+\u1025 \u1079 → \u1009 \u1039 \u1016;
+# Stacked Consonants
+\u105A → \u102B \u103A ;
+\u1060 → \u1039 \u1000 ;
+\u1061 → \u1039 \u1001 ;
+\u1062 → \u1039 \u1002 ;
+\u1063 → \u1039 \u1003 ;
+\u1065 → \u1039 \u1005 ;
+[\u1066\u1067] → \u1039 \u1006 ;
+\u1068 → \u1039 \u1007 ;
+\u1069 → \u1039 \u1008 ;
+\u106C → \u1039 \u100B ;
+\u106D → \u1039 \u100C ;
+\u1070 → \u1039 \u100F ;
+[\u1071\u1072] → \u1039 \u1010 ;
+\u1096 → \u1039 \u1010 \u103D;
+[\u1073\u1074] → \u1039 \u1011 ;
+\u1075 → \u1039 \u1012 ;
+\u1076 → \u1039 \u1013 ;
+\u1077 → \u1039 \u1014 ;
+\u1078 → \u1039 \u1015 ;
+\u1079 → \u1039 \u1016 ;
+\u107A → \u1039 \u1017 ;
+[\u107B\u1093] → \u1039 \u1018 ;
+\u107C → \u1039 \u1019 ;
+\u1085 → \u1039 \u101C ;
+\u108E → \u102D \u1036 ;
+# Pre-defined ligatures
+\u106E → \u100D\u1039\u100D ;
+\u106F → \u100D\u1039\u100E ;
+\u1091 → \u100F\u1039\u100D ;
+\u1092 → \u100B\u1039\u100C ;
+\u1097 → \u100B\u1039\u100B ;
+\u104E → \u104E\u1004\u103A\u1038 ;
+####
+#### STAGE 1.01: Digits 0 and 4 used instead of letters
+# Case of MYANMAR digit being used instead of a letter
+# Lone digit zero and four at start
+::Null;
+^ \u1040 ($nondigits) → \u101D $1;
+^ \u1044 ($nondigits) → | \u104E $1 ;
+# Lone digit zero or four at end
+($nondigits) \u1040 $ → $1 \u101D;
+($nondigits) \u1044 $ → $1 \u104e;
+# Evowel and dependent vowel signs before 0 or 4 only
+# -> convert to the consonant.
+([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2;
+([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2;
+####
+#### STAGE 1.1: Strip spaces immediately before combining characters.
+#### Move e-vowel after consonants and medials
+#### Now every codepoint is Unicode. This starts conversion
+#### from semi-visual order to logical order.
+####
+::Null;
+# Don't remove spaces before E vowel or medial Ra at this stage
+($wspace) \u1037 > \u1037 $1;
+($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2;
+# Remove a duplicate early
+\u1037+ → \u1037;
+# Move e-vowel after medials and consonants.
+\u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031;
+\u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ;
+\u1031+ \u103c ($consonant) > $1 \u103c \u1031;
+# Move medials other than 103c before the 1031. Leave 103c for
+# the next consonant.
+\u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031;
+\u1031+ ($vowelsAndConsonants) > $1 \u1031;
+####
+#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
+####
+::Null;
+\u103b \u103a > \u103a \u103b;
+# Simpler replacements for Zawgyi 1025
+\u1025 \u102E → \u1026;
+# Asat and dot below reordering, to Unicode NFC.
+\u103A\u1037 → \u1037\u103A;
+# Reorder some vowel signs
+\u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;
+([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;
+# Move ra medial which precedes consonant, but not other medials.
+\u103C ($consonant) → $1 \u103C;
+####
+#### Stage 3
+#### Move \u1036, and \u103C after consonants.
+::Null;
+($umedial) \u1039 ($consonant) > \u1039 $2 $1;
+\u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;
+\u1036 ($umedial+) → $1 \u1036;
+####
+#### Stage 4
+#### Reordering medials, dot below, contractions, E sign, and asat.
+::Null;
+# Reorder the medials
+([\u103C\u103D\u103E]+) \u103B → \u103B $1;
+([\u103D\u103E]+) \u103C → \u103C $1;
+\u103E\u103D → \u103D\u103E ;
+# Contractions with vowel signs
+([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
+($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
+# Move vowel sign E \u1031 after medials, but not across consonants
+($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;
+# Reorder dot below after medials and vowel diacritics
+\u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037;
+# Move vowel signs after medials
+($vowelsign+) ($umedial+) → $2 $1;
+# Reorder modifiers and asat
+($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;
+####
+#### Stage 5. More reorderings
+#### Vowel signs after medials, sort medials,
+####
+::Null;
+# Replace CA + YA with JHA after moving other things beyond the medials.
+\u1005 \u103b → \u1008;
+# More moving vowel signs after medials
+([\u102b-\u1032]) ($umedial) → $2 $1;
+# Sort the medials
+([\u103C\u103D\u103E]) \u103B → \u103B $1;
+([\u103D\u103E]) \u103C → \u103C $1;
+\u103E\u103D → \u103D\u103E ;
+# Move visarga after other signs
+\u1038 ($vowelmedial) → $1 \u1038;
+# Reorder
+\u1036 \u102f → \u102f \u1036;
+###
+### Stage 6
+### Finish conflicting and extra diacritics. Remove some white space
+###
+::Null;
+# Fix duplicate combiners
+\u102D \u102D+ → \u102D;
+\u102E \u102E+ → \u102E;
+\u102F \u102F+ → \u102F;
+\u1030 \u1030+ → \u1030;
+\u1032 \u1032+ → \u1032;
+\u1036 \u1036+ → \u1036;
+\u1037 \u1037+ → \u1037;
+\u1039 \u1039+ → \u1039;
+\u103a \u103a+ → \u103a;
+\u103b \u103b+ → \u103b;
+\u103c \u103c+ → \u103c;
+\u103d \u103d+ → \u103d;
+\u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386
+# Fix overlapping signs
+\u102F [\u1030\u103a] → \u102F;
+\u102D \u102E → \u102E;
+# Remove space directly before diacritics.
+($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2;
+# Remove ZWSP at start and end
+^ \u200b+ → ;
+\u200b+ $ → ;
+# Fix multiple spaces around ZWSP to single ZWSP.
+$wspace* \u200b $wspace* → \u200b;
+