From ed5640d8b587fbcfed7dd7967f3de04b37a76f26 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 11:06:44 +0200 Subject: Adding upstream version 4:7.4.7. Signed-off-by: Daniel Baumann --- .../source/breakiterator/data/edit_word_hu.txt | 159 +++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 i18npool/source/breakiterator/data/edit_word_hu.txt (limited to 'i18npool/source/breakiterator/data/edit_word_hu.txt') diff --git a/i18npool/source/breakiterator/data/edit_word_hu.txt b/i18npool/source/breakiterator/data/edit_word_hu.txt new file mode 100644 index 000000000..4a08acab0 --- /dev/null +++ b/i18npool/source/breakiterator/data/edit_word_hu.txt @@ -0,0 +1,159 @@ +# +# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# All Rights Reserved. +# +# file: edit_word.txt +# +# ICU Word Break Rules +# See Unicode Standard Annex #29. +# These rules are based on Version 4.0.0, dated 2003-04-17 +# + + + +#################################################################################### +# +# Character class definitions from TR 29 +# +#################################################################################### +$Katakana = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:] + [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:] + [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]]; + +$Ideographic = [:Ideographic:]; +$Hangul = [:Script = HANGUL:]; + +$ALetter = [[:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:] + [:name = PERCENT SIGN:] [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] + [:name = SECTION SIGN:] [:name = DEGREE SIGN:] [:name = EURO SIGN:] + [:name = HYPHEN-MINUS:] [:name = EN DASH:] [:name = EM DASH:] + [:name = DIGIT ZERO:] + [:name = DIGIT ONE:] + [:name = DIGIT TWO:] + [:name = DIGIT THREE:] + [:name = DIGIT FOUR:] + [:name = DIGIT FIVE:] + [:name = DIGIT SIX:] + [:name = DIGIT SEVEN:] + [:name = DIGIT EIGHT:] + [:name = DIGIT NINE:] + - $Ideographic + - $Katakana + - $Hangul + - [:Script = Thai:] + - [:Script = Lao:] + - [:Script = Hiragana:]]; + +$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:] + [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] + [:name = HYPHEN-MINUS:] [:name = EURO SIGN:] [:name = PERCENT SIGN:] + [:name = PER MILLE SIGN:] [:name = PER TEN THOUSAND SIGN:] + [:name = EN DASH:] [:name = EM DASH:] + [:name = PERCENT SIGN:] [:name = SECTION SIGN:] [:name = DEGREE SIGN:]]; + +$MidNum = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]]; +$Numeric = [:LineBreak = Numeric:]; + + +$TheZWSP = \u200b; + +# +# Character Class Definitions. +# The names are those from TR29. +# +$CR = \u000d; +$LF = \u000a; +$Control = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP]; +$Extend = [[:Grapheme_Extend = TRUE:]]; + + + + +#################################################################################### +# +# Word Break Rules. Definitions and Rules specific to word break begin Here. +# +#################################################################################### + +$Format = [[:Cf:] - $TheZWSP]; + + + +# Rule 3: Treat a grapheme cluster as if it were a single character. +# Hangul Syllables are easier to deal with here than they are in Grapheme Clusters +# because we don't need to find the boundaries between adjacent syllables - +# they won't be word boundaries. +# + + +# +# "Extended" definitions. Grapheme Cluster + Format Chars, treated like the base char. +# +$ALetterEx = $ALetter $Extend*; +$NumericEx = $Numeric $Extend*; +$MidNumEx = $MidNum $Extend*; +$MidLetterEx = $MidLetter $Extend*; +$KatakanaEx = $Katakana $Extend*; +$IdeographicEx= $Ideographic $Extend*; +$HangulEx = $Hangul $Extend*; +$FormatEx = $Format $Extend*; + + +# +# Numbers. Rules 8, 11, 12 form the TR. +# +$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*; +$NumberSequence {100}; + +# +# Words. Alpha-numerics. Rule 5, 6, 7, 9, 10 +# - must include at least one letter. +# - may include both letters and numbers. +# - may include MideLetter, MidNumber punctuation. +# +$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*; # rules #6, #7 +($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200}; + +# Punctuations by themselves +[[:P:][:S:]-[:name = FULL STOP:]]*; +[[:name = FULL STOP:]]*; + +# +# Do not break between Katakana. Rule #13. +# +$KatakanaEx ($FormatEx* $KatakanaEx)* {300}; +[:Hiragana:] $Extend* {300}; + +# +# Ideographic Characters. Stand by themselves as words. +# Separated from the "Everything Else" rule, below, only so that they +# can be tagged with a return value. TODO: is this what we want? +# +$IdeographicEx ($FormatEx* $IdeographicEx)* {400}; +$HangulEx ($FormatEx* $HangulEx)* {400}; + +# +# Everything Else, with no tag. +# Non-Control chars combine with $Extend (combining) chars. +# Controls are do not. +# +[^$Control [:Ideographic:]] $Extend*; +$CR $LF; + +# +# Reverse Rules. Back up over any of the chars that can group together. +# (Reverse rules do not need to be exact; they can back up too far, +# but must back up at least enough, and must stop on a boundary.) +# + +# NonStarters are the set of all characters that can appear at the 2nd - nth position of +# a word. (They may also be the first.) The reverse rule skips over these, until it +# reaches something that can only be the start (and probably only) char in a "word". +# A space or punctuation meets the test. +# +$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format]; + +#!.*; +! ($NonStarters* | \n \r) .; + -- cgit v1.2.3