diff options
Diffstat (limited to 'intl/icu/source/data/translit/si_si_FONIPA.txt')
-rw-r--r-- | intl/icu/source/data/translit/si_si_FONIPA.txt | 164 |
1 files changed, 164 insertions, 0 deletions
diff --git a/intl/icu/source/data/translit/si_si_FONIPA.txt b/intl/icu/source/data/translit/si_si_FONIPA.txt new file mode 100644 index 0000000000..581de75f0e --- /dev/null +++ b/intl/icu/source/data/translit/si_si_FONIPA.txt @@ -0,0 +1,164 @@ +# © 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html +# Generated using tools/cldr/cldr-to-icu/build-icu-data.xml +# +# File: si_si_FONIPA.txt +# Generated from CLDR +# + +# Sinhala pronunciation rules +# +# Output +# k ɡ ŋ ᵑɡ c ɟ ɲ ʈ ɖ ⁿɖ t d n ⁿd p b m ᵐb j r l w ʃ s h f +# ə əː a aː æ æː i iː u uː e eː o oː +# +# References +# [1] Asanka Wasala, Ruvan Weerasinghe, and Kumudu Gamage: +# Sinhala Grapheme-to-Phoneme Conversion and Rules for Schwa Epenthesis. +# Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions, +# pages 890–897. http://www.aclweb.org/anthology/P06-2114 +# Simplify ya + yansaya to plain ya after a consonant. +[\u0D9A-\u0DC6] \u0DCA (\u200D)? { ය\u0DCAය → ය; +# Delete ZWNJ and ZWJ to simplify further processing. +\u200C → ; +\u200D → ; +# Insert a schwa after every consonant that is not followed by a dependent vowel +# or virama. +::Null; +([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF \u0DF2\u0DF3] → $1 ə; +# Pronunciation rules proper. +::Null; +# fප is an alternative spelling of ෆ. +# This occurs e.g. in ඩේව\u0DD2ඩ\u0DCA කොපර\u0DCAfප\u0DD3ල\u0DCAඩ\u0DCA (David Copperfield) +# [see http://bradshawofthefuture.blogspot.com/2013/02/f.html]. +[Ff]ප → f; +# zස is seemingly the only way to unambiguously indicate a voiced /z/ sound. +# This occurs in e.g. ඇල\u0DCAzසය\u0DD2ම' රෝගය (Alzheimer's disease) +# [see https://si.wikipedia.org/wiki/ඇල\u0DCAzසය\u0DD2ම%27_රෝගය] +# or in zස\u0DD3බ\u0DCAරා (zebra) [see https://si.wikipedia.org/wiki/zස\u0DD3බ\u0DCAරා]. +[Zz]ස → z; +ං → ŋ; +o → ŋ; # common substitution for anusvaraya +ඃ ([\u0D9A-\u0DC6]) → | $1 \u0DCA $1; # TODO: check which consonants geminate +ඃ → h; +අ → a; +ආ → aː; +ඇ → æ; +ඈ → æː; +ඉ → i; +ඊ → iː; +උ → u; +ඌ → uː; +ඍ → ri; +ඎ → ruː; +ඏ → ilu; +ඐ → iluː; +එ → e; +ඒ → eː; +ඓ → aj; +ඔ → o; +ඕ → oː; +ඖ → aw; # TODO: check if this is correct +ක → k; +ඛ → k; +ග → ɡ; +ඝ → ɡ; +ඞ → ŋ; +ඟ → ᵑɡ; +ච → c; +ඡ → c; +ජ → ɟ; +ඣ → ɟ; +ඤ → ɲ; +ඥ → kɲ; # TODO: double-check +ඦ → ɟ; +ට → ʈ; +ඨ → ʈ; +ඩ → ɖ; +ඪ → ɖ; +ණ → n; +ඬ → ⁿɖ; +ත → t; +ථ → t; +ද → d; +ධ → d; +න → n; +ඳ → ⁿd; +ප → p; +ඵ → p; +බ → b; +භ → b; +ම → m; +ඹ → ᵐb; +ය → j; +ර → r; +ල → l; +ව → w; +ශ → ʃ; +ෂ → ʃ; +ස → s; +හ → h; +ළ → l; +ෆ → f; +\u0DCA → ; # delete virama +ා → aː; +ැ → æ; +ෑ → æː; +\u0DD2 → i; +\u0DD3 → iː; +\u0DD4 → u; +\u0DD6 → uː; +ෘ → ru; +ෙ → e; +ේ → eː; +ෛ → aj; +ො → o; +ෝ → oː; +ෞ → aw; # TODO: check if this is correct +ෟ → lu; +ෲ → ruː; +ෳ → luː; +# Heuristics for turning /ə/ into /a/. Based on [1]. +$c=[k ɡ ŋ {ᵑɡ} c ɟ ɲ ʈ ɖ {ⁿɖ} t d n {ⁿd} p b m {ᵐb} j r l w ʃ s z h f]; +$s=[:^L:]; +# Rule #1 +::Null; +$s sv { ə → ə; # exception (a) +$s k { ə } r → ə; # exception (b) +$s $c { ə } $s → ə; # exception (c) +$s $c $c { ə → a; +$s $c { ə → a; +# Rule #2 +::Null; +$c r { ə } $c → a; # clause (a) and (b) +$c r { a } h → a; # clause (d), exception +$c r { a } $c → ə; # clause (c) +# Rule #3 +# The paper is unclear about what this rule means. The interpretation here +# assumes that "preceded" in the paper is a typo and should be read "followed". +::Null; +[a e æ o ə] h { ə → a; +# Rules #4 through #7 +::Null; +ə } $c $c → a; # Rule #4 +ə } [rbɖʈ] $s → ə; # Rule #5 exception +ə } $c $s → a; # Rule #5 +ə } ji $s → a; # Rule #6 +k { ə } [rl] u → a; # Rule #7 +# Rule #8 +# Note that the paper doesn't say explicitly that this rule should be +# anchored at the beginning of a word, but the remarks before the rules +# seem to imply this. +::Null; +$s k { a } l[aeo]ːj → ə; # Typo in paper: /j/ was /y/. +$s k { a } le[mh][ui] → ə; +$s k { alə } h[ui] → əle; +$s k { a } lə → ə; +# Diphthongs +::Null; +www+ → ww; # යෞව\u0DCAවන +[i {iː} e {eː} æ {æː} o {oː} a {aː}] { wu → w; +əji → aj; +iji → iː; # perhaps: ij +[u {uː} e {eː} æ {æː} o {oː} a {aː}] { ji → j; + |