summaryrefslogtreecommitdiffstats
path: root/intl/icu/source/data/translit/Han_Spacedhan.txt
diff options
context:
space:
mode:
Diffstat (limited to 'intl/icu/source/data/translit/Han_Spacedhan.txt')
-rw-r--r--intl/icu/source/data/translit/Han_Spacedhan.txt42
1 files changed, 42 insertions, 0 deletions
diff --git a/intl/icu/source/data/translit/Han_Spacedhan.txt b/intl/icu/source/data/translit/Han_Spacedhan.txt
new file mode 100644
index 0000000000..b88c1dd14c
--- /dev/null
+++ b/intl/icu/source/data/translit/Han_Spacedhan.txt
@@ -0,0 +1,42 @@
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html
+# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
+#
+# File: Han_Spacedhan.txt
+# Generated from CLDR
+#
+
+# Only intended for internal use
+# Make sure Han characters are normalized, including characters whose NFKD decompositions contain them.
+# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
+# Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!
+:: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc;  # NFKC, restricted to the characters in this filter set
+:: fullwidth-halfwidth;  # next step: run the fullwidth-halfwidth transform before the conversion rules below
+。 → '.';        # ideographic full stop
+\uFF61 → '.';    # halfwidth ideographic full stop (escaped so it stays distinct from the rule above)
+、 → ',';        # ideographic comma
+\uFF64 → ',';    # halfwidth ideographic comma (escaped so it stays distinct from the rule above)
+《 → '«';        # double angle brackets become guillemets
+》 → '»';
+〈 → '‹';        # single angle brackets become single guillemets
+〉 → '›';
+「 → '‘';        # corner brackets become single curly quotes
+」 → '’';
+\uFF62 → '‘';    # halfwidth left corner bracket (escaped so it stays distinct from the 「 rule)
+\uFF63 → '’';    # halfwidth right corner bracket (escaped so it stays distinct from the 」 rule)
+『 → '“';        # white corner brackets become double curly quotes
+』 → '”';
+・ → '‧';        # katakana middle dot
+\uFF65 → '‧';    # halfwidth katakana middle dot (escaped so it stays distinct from the rule above)
+々 → '⓶';        # iteration mark is replaced by the marker character ⓶
+〜 → '~';        # wave dash becomes ASCII tilde
+$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];  # clause/sentence-ending punctuation plus close (Pe) and final-quote (Pf) punctuation. NOTE(review): the unescaped .,:?!; repeat the escaped ASCII forms; possibly fullwidth originals that were flattened -- confirm against upstream
+$initialPunct = [:Ps:][:Pi:];  # open (Ps) and initial-quote (Pi) punctuation; in the lines shown it is only referenced inside a [...] set
+# Forward rules: insert a space between any Han or terminal punctuation and a following letter,
+# and between a letter (with any trailing marks) and a following Han or initial punct.
+[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;  # empty {} match: the space is inserted, no text is consumed
+[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;  # [:Mark:]* keeps combining marks attached to their letter
+# Reverse rules (←): remove the space between an ideograph and an adjacent letter, in either order
+← [:Ideographic:] { ' ' } [:Letter:] ;
+← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;
+