From 26a029d407be480d791972afb5975cf62c9360a6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 02:47:55 +0200 Subject: Adding upstream version 124.0.1. Signed-off-by: Daniel Baumann --- intl/lwbrk/LineBreaker.cpp | 1344 ++++++++++++++++++++ intl/lwbrk/LineBreaker.h | 82 ++ intl/lwbrk/Segmenter.cpp | 545 ++++++++ intl/lwbrk/Segmenter.h | 231 ++++ intl/lwbrk/WordBreaker.cpp | 249 ++++ intl/lwbrk/WordBreaker.h | 72 ++ intl/lwbrk/crashtests/416721.html | 11 + .../Lo_test_page_no_uniscribe_breaks.html | 12 + .../UDHR_Thai_test_page_long_sequences.html | 184 +++ intl/lwbrk/crashtests/crashtests.list | 1 + intl/lwbrk/crashtests/crashtests_manual.list | 6 + intl/lwbrk/gtest/TestBreak.cpp | 376 ++++++ intl/lwbrk/gtest/TestSegmenter.cpp | 209 +++ intl/lwbrk/gtest/TestSegmenterPerf.cpp | 276 ++++ intl/lwbrk/gtest/moz.build | 13 + intl/lwbrk/jisx4051class.h | 217 ++++ intl/lwbrk/jisx4051pairtable.txt | 286 +++++ intl/lwbrk/moz.build | 55 + intl/lwbrk/nsCarbonBreaker.cpp | 43 + intl/lwbrk/nsComplexBreaker.cpp | 174 +++ intl/lwbrk/nsComplexBreaker.h | 36 + intl/lwbrk/nsLWBrkCIID.h | 28 + intl/lwbrk/nsPangoBreaker.cpp | 61 + intl/lwbrk/nsRuleBreaker.cpp | 18 + intl/lwbrk/nsUniscribeBreaker.cpp | 146 +++ intl/lwbrk/rulebrk.c | 388 ++++++ intl/lwbrk/rulebrk.h | 26 + intl/lwbrk/th_char.h | 133 ++ intl/lwbrk/tools/anzx4051.html | 709 +++++++++++ intl/lwbrk/tools/anzx4051.pl | 356 ++++++ intl/lwbrk/tools/jisx4051class.txt | 159 +++ intl/lwbrk/tools/jisx4051simp.txt | 24 + intl/lwbrk/tools/spec_table.html | 664 ++++++++++ 33 files changed, 7134 insertions(+) create mode 100644 intl/lwbrk/LineBreaker.cpp create mode 100644 intl/lwbrk/LineBreaker.h create mode 100644 intl/lwbrk/Segmenter.cpp create mode 100644 intl/lwbrk/Segmenter.h create mode 100644 intl/lwbrk/WordBreaker.cpp create mode 100644 intl/lwbrk/WordBreaker.h create mode 100644 intl/lwbrk/crashtests/416721.html create mode 100644 intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html create mode 
100644 intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html create mode 100644 intl/lwbrk/crashtests/crashtests.list create mode 100644 intl/lwbrk/crashtests/crashtests_manual.list create mode 100644 intl/lwbrk/gtest/TestBreak.cpp create mode 100644 intl/lwbrk/gtest/TestSegmenter.cpp create mode 100644 intl/lwbrk/gtest/TestSegmenterPerf.cpp create mode 100644 intl/lwbrk/gtest/moz.build create mode 100644 intl/lwbrk/jisx4051class.h create mode 100644 intl/lwbrk/jisx4051pairtable.txt create mode 100644 intl/lwbrk/moz.build create mode 100644 intl/lwbrk/nsCarbonBreaker.cpp create mode 100644 intl/lwbrk/nsComplexBreaker.cpp create mode 100644 intl/lwbrk/nsComplexBreaker.h create mode 100644 intl/lwbrk/nsLWBrkCIID.h create mode 100644 intl/lwbrk/nsPangoBreaker.cpp create mode 100644 intl/lwbrk/nsRuleBreaker.cpp create mode 100644 intl/lwbrk/nsUniscribeBreaker.cpp create mode 100644 intl/lwbrk/rulebrk.c create mode 100644 intl/lwbrk/rulebrk.h create mode 100644 intl/lwbrk/th_char.h create mode 100644 intl/lwbrk/tools/anzx4051.html create mode 100644 intl/lwbrk/tools/anzx4051.pl create mode 100644 intl/lwbrk/tools/jisx4051class.txt create mode 100644 intl/lwbrk/tools/jisx4051simp.txt create mode 100644 intl/lwbrk/tools/spec_table.html (limited to 'intl/lwbrk') diff --git a/intl/lwbrk/LineBreaker.cpp b/intl/lwbrk/LineBreaker.cpp new file mode 100644 index 0000000000..6f73035f42 --- /dev/null +++ b/intl/lwbrk/LineBreaker.cpp @@ -0,0 +1,1344 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "mozilla/intl/LineBreaker.h" + +#include "jisx4051class.h" +#include "nsComplexBreaker.h" +#include "nsTArray.h" +#include "nsUnicodeProperties.h" +#include "mozilla/ArrayUtils.h" +#include "mozilla/intl/Segmenter.h" +#include "mozilla/intl/UnicodeProperties.h" + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XLineBreakIteratorLatin1.hpp" +# include "ICU4XLineBreakIteratorUtf16.hpp" +# include "ICU4XLineSegmenter.h" +# include "mozilla/CheckedInt.h" +# include "mozilla/ClearOnShutdown.h" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +# include "mozilla/StaticPrefs_intl.h" +# include "nsThreadUtils.h" + +# include +#endif + +using namespace mozilla::unicode; +using namespace mozilla::intl; + +/* + + Simplification of Pair Table in JIS X 4051 + + 1. The Original Table - in 4.1.3 + + In JIS X 4051, the pair table is defined as below + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 + * # * # + 1 X X X X X X X X X X X X X X X X X X X X X E + 2 X X X X X X + 3 X X X X X X + 4 X X X X X X + 5 X X X X X X + 6 X X X X X X + 7 X X X X X X X + 8 X X X X X X E + 9 X X X X X X + 10 X X X X X X + 11 X X X X X X + 12 X X X X X X + 13 X X X X X X X + 14 X X X X X X X + 15 X X X X X X X X X + 16 X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X + 19 X E E E E E X X X X X X X X X X X X E X E E + 20 X X X X X E + + * Same Char + # Other Char + + X Cannot Break + + The classes mean: + 1: Open parenthesis + 2: Close parenthesis + 3: Prohibit a line break before + 4: Punctuation for sentence end (except Full stop, e.g., "!" 
and "?") + 5: Middle dot (e.g., U+30FB KATAKANA MIDDLE DOT) + 6: Full stop + 7: Non-breakable between same characters + 8: Prefix (e.g., "$", "NO.") + 9: Postfix (e.g., "%") + 10: Ideographic space + 11: Hiragana + 12: Japanese characters (except class 11) + 13: Subscript + 14: Ruby + 15: Numeric + 16: Alphabet + 17: Space for Western language + 18: Western characters (except class 17) + 19: Split line note (Warichu) begin quote + 20: Split line note (Warichu) end quote + + 2. Simplified by removing the classes which we do not care about + + However, since we do not care about class 13(Subscript), 14(Ruby), + 16 (Alphabet), 19(split line note begin quote), and 20(split line note end + quote) we can simplify this pair table into the following + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 15 17 18 + + 1 X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X + 4 X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 17 X X X X X + 18 X X X X X X X + + 3. Simplified by merging classes + + After the step-2 simplification, the pair table has some duplication + a. class 2, 3, 4, 5, 6, are the same- we can merge them + b. class 10, 11, 12, 17 are the same- we can merge them + + We introduce an extra non-breaking pair at [b]/7 to better match + the expectations of CSS line-breaking as tested by WPT tests. + This added entry is marked as * in the tables below. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 + + 1 X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X * + 15 X X X X + 18 X X X + + + 4. 
We add COMPLEX characters and make them breakable with all other classes + except after class 1 and before class [a] + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X * + 15 X X X X + 18 X X X + COMPLEX X T + + T : need special handling + + + 5. However, we need two special classes for some punctuation/parentheses + whose breaking rules are like character class (18); see bug 389056. + We also need punctuation-like characters that behave the same as class 18 + even though they are not letters of any language (e.g., '_'). + [c]. Based on open parenthesis class (1), but it is not breakable after + character class (18) or numeric class (15). + [d]. Based on close parenthesis (or punctuation) class (2), but it is not + breakable before character class (18) or numeric class (15). + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] + + 1 X X X X X X X X X X X + [a] X X X + 7 X X + 8 X X + 9 X + [b] X * X + 15 X X X X X X + 18 X X X X X + COMPLEX X T + [c] X X X X X X X X X X X + [d] X X X X + + + 6. And Unicode has "NON-BREAK" characters. Lines should not be broken around + them. JIS X 4051 has no such class, therefore we create [e]. + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 18 COMPLEX [c] [d] [e] + + 1 X X X X X X X X X X X X + [a] X X X + 7 X X X + 8 X X X + 9 X X + [b] X * X X + 15 X X X X X X X + 18 X X X X X X + COMPLEX X T X + [c] X X X X X X X X X X X X + [d] X X X X X + [e] X X X X X X X X X X X X + + + 7. 
Now we use one bit to encode whether it is breakable, and use 2 bytes
      for one row, then the bit table will look like:

      Each row is the leading class; bit N of the row (counting from the
      rightmost bit, N = 0) is the trailing class in the column order
      1, [a], 7, 8, 9, [b], 15, 18, COMPLEX, [c], [d], [e].
      A set bit means "cannot break" (an X, * or T in the tables above).

                 18    <-   1

       1  0000 1111 1111 1111  = 0x0FFF
      [a] 0000 1100 0000 0010  = 0x0C02
       7  0000 1000 0000 0110  = 0x0806
       8  0000 1000 0100 0010  = 0x0842
       9  0000 1000 0000 0010  = 0x0802
      [b] 0000 1100 0000 0110  = 0x0C06
      15  0000 1110 1101 0010  = 0x0ED2
      18  0000 1110 1100 0010  = 0x0EC2
 COMPLEX  0000 1001 0000 0010  = 0x0902
      [c] 0000 1111 1111 1111  = 0x0FFF
      [d] 0000 1100 1100 0010  = 0x0CC2
      [e] 0000 1111 1111 1111  = 0x0FFF
*/

// Number of simplified character classes, i.e. rows (and used bits per row)
// in the pair tables below.
#define MAX_CLASSES 12

// Default pair table: one row per leading class, in the row order of the bit
// table above.
static const uint16_t gPair[MAX_CLASSES] = {0x0FFF, 0x0C02, 0x0806, 0x0842,
                                            0x0802, 0x0C06, 0x0ED2, 0x0EC2,
                                            0x0902, 0x0FFF, 0x0CC2, 0x0FFF};

/*

   8. And if the character is not enough far from word start, word end and
      another break point, we should not break in non-CJK languages.
      I.e., Don't break around 15, 18, [c] and [d], but don't change
      that if they are related to [b].

      Legend for the table below (rebuilt from the bit patterns that follow):
        X  cannot break        .  may break
        *  the extra non-breaking pair added at [b]/7
        T  need special handling
        CX is the COMPLEX class

     Class of
     Leading  Class of Trailing Char Class
     Char

              1    [a]  7    8    9    [b]  15   18   CX   [c]  [d]  [e]

     1        X    X    X    X    X    X    X    X    X    X    X    X
     [a]      .    X    .    .    .    .    X    X    .    X    X    X
     7        .    X    X    .    .    .    X    X    .    X    X    X
     8        .    X    .    .    .    .    X    X    .    X    X    X
     9        .    X    .    .    .    .    X    X    .    X    X    X
     [b]      .    X    *    .    .    .    .    .    .    .    X    X
     15       X    X    X    X    X    .    X    X    X    X    X    X
     18       X    X    X    X    X    .    X    X    X    X    X    X
     CX       .    X    .    .    .    .    X    X    T    X    X    X
     [c]      X    X    X    X    X    X    X    X    X    X    X    X
     [d]      X    X    X    X    X    .    X    X    X    X    X    X
     [e]      X    X    X    X    X    X    X    X    X    X    X    X

                 18    <-   1

       1  0000 1111 1111 1111  = 0x0FFF
      [a] 0000 1110 1100 0010  = 0x0EC2
       7  0000 1110 1100 0110  = 0x0EC6
       8  0000 1110 1100 0010  = 0x0EC2
       9  0000 1110 1100 0010  = 0x0EC2
      [b] 0000 1100 0000 0110  = 0x0C06
      15  0000 1111 1101 1111  = 0x0FDF
      18  0000 1111 1101 1111  = 0x0FDF
 COMPLEX  0000 1111 1100 0010  = 0x0FC2
      [c] 0000 1111 1111 1111  = 0x0FFF
      [d] 0000 1111 1101 1111  = 0x0FDF
      [e] 0000 1111 1111 1111  = 0x0FFF
*/

// Conservative pair table described in step 8; same layout as gPair.
static const uint16_t gPairConservative[MAX_CLASSES] = {
    0x0FFF, 0x0EC2, 0x0EC6, 0x0EC2, 0x0EC2, 0x0C06,
    0x0FDF, 0x0FDF, 0x0FC2, 0x0FFF, 0x0FDF, 0x0FFF};

/*

   9.
Now we map the class to number + + 0: 1 + 1: [a]- 2, 3, 4, 5, 6 + 2: 7 + 3: 8 + 4: 9 + 5: [b]- 10, 11, 12, 17 + 6: 15 + 7: 18 + 8: COMPLEX + 9: [c] + A: [d] + B: [e] + + and they mean: + 0: Open parenthesis + 1: Punctuation that prohibits break before + 2: Non-breakable between same classes + 3: Prefix + 4: Postfix + 5: Breakable character (Spaces and Most Japanese characters) + 6: Numeric + 7: Characters + 8: Need special handling characters (E.g., Thai) + 9: Open parentheses like Character (See bug 389056) + A: Close parenthese (or punctuations) like Character (See bug 389056) + B: Non breakable (See bug 390920) + +*/ + +#define CLASS_NONE INT8_MAX + +#define CLASS_OPEN 0x00 +#define CLASS_CLOSE 0x01 +#define CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS 0x02 +#define CLASS_PREFIX 0x03 +#define CLASS_POSTFFIX 0x04 +#define CLASS_BREAKABLE 0x05 +#define CLASS_NUMERIC 0x06 +#define CLASS_CHARACTER 0x07 +#define CLASS_COMPLEX 0x08 +#define CLASS_OPEN_LIKE_CHARACTER 0x09 +#define CLASS_CLOSE_LIKE_CHARACTER 0x0A +#define CLASS_NON_BREAKABLE 0x0B + +#define U_NULL char16_t(0x0000) +#define U_SLASH char16_t('/') +#define U_SPACE char16_t(' ') +#define U_HYPHEN char16_t('-') +#define U_EQUAL char16_t('=') +#define U_PERCENT char16_t('%') +#define U_AMPERSAND char16_t('&') +#define U_SEMICOLON char16_t(';') +#define U_BACKSLASH char16_t('\\') +#define U_OPEN_SINGLE_QUOTE char16_t(0x2018) +#define U_OPEN_DOUBLE_QUOTE char16_t(0x201C) +#define U_OPEN_GUILLEMET char16_t(0x00AB) + +#define NEED_CONTEXTUAL_ANALYSIS(c) \ + (IS_HYPHEN(c) || (c) == U_SLASH || (c) == U_PERCENT || (c) == U_AMPERSAND || \ + (c) == U_SEMICOLON || (c) == U_BACKSLASH || (c) == U_OPEN_SINGLE_QUOTE || \ + (c) == U_OPEN_DOUBLE_QUOTE || (c) == U_OPEN_GUILLEMET) + +#define IS_ASCII_DIGIT(u) (0x0030 <= (u) && (u) <= 0x0039) + +static inline int GETCLASSFROMTABLE(const uint32_t* t, uint16_t l) { + return ((((t)[(l >> 3)]) >> ((l & 0x0007) << 2)) & 0x000f); +} + +static inline int 
IS_HALFWIDTH_IN_JISx4051_CLASS3(char16_t u) { + return ((0xff66 <= (u)) && ((u) <= 0xff70)); +} + +static inline int IS_CJK_CHAR(char32_t u) { + return ( + (0x1100 <= (u) && (u) <= 0x11ff) || (0x2e80 <= (u) && (u) <= 0xd7ff) || + (0xf900 <= (u) && (u) <= 0xfaff) || (0xff00 <= (u) && (u) <= 0xffef) || + (0x20000 <= (u) && (u) <= 0x2fffd)); +} + +static inline bool IS_NONBREAKABLE_SPACE(char16_t u) { + return u == 0x00A0 || u == 0x2007; // NO-BREAK SPACE, FIGURE SPACE +} + +static inline bool IS_HYPHEN(char16_t u) { + return (u == U_HYPHEN || u == 0x2010 || // HYPHEN + u == 0x2012 || // FIGURE DASH + u == 0x2013 || // EN DASH +#if ANDROID || XP_WIN + /* Bug 1647377: On Android and Windows, we don't have a "platform" + * backend that supports Tibetan (nsRuleBreaker.cpp only knows about + * Thai, and ScriptBreak doesn't handle Tibetan well either), so + * instead we just treat the TSHEG like a hyphen to provide basic + * line-breaking possibilities. + */ + u == 0x0F0B || // TIBETAN MARK INTERSYLLABIC TSHEG +#endif + u == 0x058A); // ARMENIAN HYPHEN +} + +static int8_t GetClass(uint32_t u, LineBreakRule aLevel, + bool aIsChineseOrJapanese) { + // Mapping for Unicode LineBreak.txt classes to the (simplified) set of + // character classes used here. + // XXX The mappings here were derived by comparing the Unicode LineBreak + // values of BMP characters to the classes our existing GetClass returns + // for the same codepoints; in cases where characters with the same + // LineBreak class mapped to various classes here, I picked what seemed + // the most prevalent equivalence. + // Some of these are unclear to me, but currently they are ONLY used + // for characters not handled by the old code below, so all the JISx405 + // special cases should already be accounted for. 
+ static const int8_t sUnicodeLineBreakToClass[] = { + /* UNKNOWN = 0, [XX] */ CLASS_CHARACTER, + /* AMBIGUOUS = 1, [AI] */ CLASS_CHARACTER, + /* ALPHABETIC = 2, [AL] */ CLASS_CHARACTER, + /* BREAK_BOTH = 3, [B2] */ CLASS_CHARACTER, + /* BREAK_AFTER = 4, [BA] */ CLASS_BREAKABLE, + /* BREAK_BEFORE = 5, [BB] */ CLASS_OPEN_LIKE_CHARACTER, + /* MANDATORY_BREAK = 6, [BK] */ CLASS_CHARACTER, + /* CONTINGENT_BREAK = 7, [CB] */ CLASS_CHARACTER, + /* CLOSE_PUNCTUATION = 8, [CL] */ CLASS_CLOSE_LIKE_CHARACTER, + /* COMBINING_MARK = 9, [CM] */ CLASS_CHARACTER, + /* CARRIAGE_RETURN = 10, [CR] */ CLASS_BREAKABLE, + /* EXCLAMATION = 11, [EX] */ CLASS_CLOSE_LIKE_CHARACTER, + /* GLUE = 12, [GL] */ CLASS_NON_BREAKABLE, + /* HYPHEN = 13, [HY] */ CLASS_CHARACTER, + /* IDEOGRAPHIC = 14, [ID] */ CLASS_BREAKABLE, + /* INSEPARABLE = 15, [IN] */ CLASS_CLOSE_LIKE_CHARACTER, + /* INFIX_NUMERIC = 16, [IS] */ CLASS_CHARACTER, + /* LINE_FEED = 17, [LF] */ CLASS_BREAKABLE, + /* NONSTARTER = 18, [NS] */ CLASS_CLOSE_LIKE_CHARACTER, + /* NUMERIC = 19, [NU] */ CLASS_NUMERIC, + /* OPEN_PUNCTUATION = 20, [OP] */ CLASS_OPEN_LIKE_CHARACTER, + /* POSTFIX_NUMERIC = 21, [PO] */ CLASS_CLOSE_LIKE_CHARACTER, + /* PREFIX_NUMERIC = 22, [PR] */ CLASS_CHARACTER, + /* QUOTATION = 23, [QU] */ CLASS_CHARACTER, + /* COMPLEX_CONTEXT = 24, [SA] */ CLASS_CHARACTER, + /* SURROGATE = 25, [SG] */ CLASS_CHARACTER, + /* SPACE = 26, [SP] */ CLASS_BREAKABLE, + /* BREAK_SYMBOLS = 27, [SY] */ CLASS_CHARACTER, + /* ZWSPACE = 28, [ZW] */ CLASS_BREAKABLE, + /* NEXT_LINE = 29, [NL] */ CLASS_CHARACTER, + /* WORD_JOINER = 30, [WJ] */ CLASS_NON_BREAKABLE, + /* H2 = 31, [H2] */ CLASS_BREAKABLE, + /* H3 = 32, [H3] */ CLASS_BREAKABLE, + /* JL = 33, [JL] */ CLASS_CHARACTER, + /* JT = 34, [JT] */ CLASS_CHARACTER, + /* JV = 35, [JV] */ CLASS_CHARACTER, + /* CLOSE_PARENTHESIS = 36, [CP] */ CLASS_CLOSE_LIKE_CHARACTER, + /* CONDITIONAL_JAPANESE_STARTER = 37, [CJ] */ CLASS_CLOSE, + /* HEBREW_LETTER = 38, [HL] */ CLASS_CHARACTER, + /* 
REGIONAL_INDICATOR = 39, [RI] */ CLASS_CHARACTER, + /* E_BASE = 40, [EB] */ CLASS_BREAKABLE, + /* E_MODIFIER = 41, [EM] */ CLASS_CHARACTER, + /* ZWJ = 42, [ZWJ]*/ CLASS_CHARACTER}; + + static_assert(U_LB_COUNT == mozilla::ArrayLength(sUnicodeLineBreakToClass), + "Gecko vs ICU LineBreak class mismatch"); + + auto cls = GetLineBreakClass(u); + MOZ_ASSERT(cls < mozilla::ArrayLength(sUnicodeLineBreakToClass)); + + // Overrides based on rules for the different line-break values given in + // https://drafts.csswg.org/css-text-3/#line-break-property + switch (aLevel) { + case LineBreakRule::Auto: + // For now, just use legacy Gecko behavior. + // XXX Possible enhancement - vary strictness according to line width + // or other criteria. + break; + case LineBreakRule::Strict: + if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER || + (u == 0x3095 || u == 0x3096 || u == 0x30f5 || u == 0x30f6)) { + return CLASS_CLOSE; + } + if (cls == U_LB_INSEPARABLE) { + return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS; + } + if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || + u == 0x30FD || u == 0x30FE) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (aIsChineseOrJapanese) { + if (cls == U_LB_POSTFIX_NUMERIC && + UnicodeProperties::IsEastAsianWidthAFW(u)) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (cls == U_LB_PREFIX_NUMERIC && + UnicodeProperties::IsEastAsianWidthAFW(u)) { + return CLASS_OPEN_LIKE_CHARACTER; + } + if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + } + break; + case LineBreakRule::Normal: + if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_INSEPARABLE) { + return CLASS_NON_BREAKABLE_BETWEEN_SAME_CLASS; + } + if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || + u == 0x30FD || u == 0x30FE) { + return CLASS_CLOSE_LIKE_CHARACTER; + } + if (aIsChineseOrJapanese) { + if (cls == U_LB_POSTFIX_NUMERIC && + UnicodeProperties::IsEastAsianWidthAFW(u)) { + return 
CLASS_CLOSE_LIKE_CHARACTER; + } + if (cls == U_LB_PREFIX_NUMERIC && + UnicodeProperties::IsEastAsianWidthAFW(u)) { + return CLASS_OPEN_LIKE_CHARACTER; + } + if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { + return CLASS_BREAKABLE; + } + } + break; + case LineBreakRule::Loose: + if (cls == U_LB_CONDITIONAL_JAPANESE_STARTER) { + return CLASS_BREAKABLE; + } + if (u == 0x3005 || u == 0x303B || u == 0x309D || u == 0x309E || + u == 0x30FD || u == 0x30FE) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_INSEPARABLE) { + return CLASS_BREAKABLE; + } + if (aIsChineseOrJapanese) { + if (u == 0x30FB || u == 0xFF1A || u == 0xFF1B || u == 0xFF65 || + u == 0x203C || u == 0x2047 || u == 0x2048 || u == 0x2049 || + u == 0xFF01 || u == 0xFF1F) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_POSTFIX_NUMERIC && + UnicodeProperties::IsEastAsianWidthAFW(u)) { + return CLASS_BREAKABLE; + } + if (cls == U_LB_PREFIX_NUMERIC && + UnicodeProperties::IsEastAsianWidthAFW(u)) { + return CLASS_BREAKABLE; + } + if (u == 0x2010 || u == 0x2013 || u == 0x301C || u == 0x30A0) { + return CLASS_BREAKABLE; + } + } + break; + case LineBreakRule::Anywhere: + MOZ_ASSERT_UNREACHABLE("should have been handled already"); + break; + } + + if (u < 0x10000) { + uint16_t h = u & 0xFF00; + uint16_t l = u & 0x00ff; + + // Handle 3 range table first + if (0x0000 == h) { + return GETCLASSFROMTABLE(gLBClass00, l); + } + if (0x1700 == h) { + return GETCLASSFROMTABLE(gLBClass17, l); + } + if (NS_NeedsPlatformNativeHandling(u)) { + return CLASS_COMPLEX; + } + if (0x0E00 == h) { + return GETCLASSFROMTABLE(gLBClass0E, l); + } + if (0x2000 == h) { + return GETCLASSFROMTABLE(gLBClass20, l); + } + if (0x2100 == h) { + return GETCLASSFROMTABLE(gLBClass21, l); + } + if (0x3000 == h) { + return GETCLASSFROMTABLE(gLBClass30, l); + } + if (0xff00 == h) { + if (l <= 0x0060) { // Fullwidth ASCII variant + // Previously, we treated Fullwidth chars the same as their ASCII + // counterparts, but UAX#14 (LineBreak.txt) 
disagrees with this and + // treats many of them as ideograph-like. + return sUnicodeLineBreakToClass[cls]; + } + if (l < 0x00a0) { // Halfwidth Katakana variants + switch (l) { + case 0x61: + return GetClass(0x3002, aLevel, aIsChineseOrJapanese); + case 0x62: + return GetClass(0x300c, aLevel, aIsChineseOrJapanese); + case 0x63: + return GetClass(0x300d, aLevel, aIsChineseOrJapanese); + case 0x64: + return GetClass(0x3001, aLevel, aIsChineseOrJapanese); + case 0x65: + return GetClass(0x30fb, aLevel, aIsChineseOrJapanese); + case 0x9e: + return GetClass(0x309b, aLevel, aIsChineseOrJapanese); + case 0x9f: + return GetClass(0x309c, aLevel, aIsChineseOrJapanese); + default: + if (IS_HALFWIDTH_IN_JISx4051_CLASS3(u)) { + return CLASS_CLOSE; // jis x4051 class 3 + } + return CLASS_BREAKABLE; // jis x4051 class 11 + } + } + if (l < 0x00e0) { + return CLASS_CHARACTER; // Halfwidth Hangul variants + } + if (l < 0x00f0) { + static char16_t NarrowFFEx[16] = { + 0x00A2, 0x00A3, 0x00AC, 0x00AF, 0x00A6, 0x00A5, 0x20A9, 0x0000, + 0x2502, 0x2190, 0x2191, 0x2192, 0x2193, 0x25A0, 0x25CB, 0x0000}; + return GetClass(NarrowFFEx[l - 0x00e0], aLevel, aIsChineseOrJapanese); + } + } else if (0x3100 == h) { + if (l <= 0xbf) { // Hangul Compatibility Jamo, Bopomofo, Kanbun + // XXX: This is per UAX #14, but UAX #14 may change + // the line breaking rules about Kanbun and Bopomofo. + return CLASS_BREAKABLE; + } + if (l >= 0xf0) { // Katakana small letters for Ainu + return CLASS_CLOSE; + } + } else if (0x0300 == h) { + if (0x4F == l || (0x5C <= l && l <= 0x62)) { + return CLASS_NON_BREAKABLE; + } + } else if (0x0500 == h) { + // ARMENIAN HYPHEN (for "Breaking Hyphens" of UAX#14) + if (l == 0x8A) { + return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); + } + } else if (0x0F00 == h) { + // We treat Tibetan TSHEG as a hyphen (when not using platform breaker); + // other Tibetan chars with LineBreak class=BA will be handled by the + // default sUnicodeLineBreakToClass mapping below. 
+ if (l == 0x0B) { + return GETCLASSFROMTABLE(gLBClass00, uint16_t(U_HYPHEN)); + } + } else if (0x1800 == h) { + if (0x0E == l) { + return CLASS_NON_BREAKABLE; + } + } else if (0x1600 == h) { + if (0x80 == l) { // U+1680 OGHAM SPACE MARK + return CLASS_BREAKABLE; + } + } else if (u == 0xfeff) { + return CLASS_NON_BREAKABLE; + } + } + + return sUnicodeLineBreakToClass[cls]; +} + +static bool GetPair(int8_t c1, int8_t c2) { + NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2"); + + return (0 == ((gPair[c1] >> c2) & 0x0001)); +} + +static bool GetPairConservative(int8_t c1, int8_t c2) { + NS_ASSERTION(c1 < MAX_CLASSES, "illegal classes 1"); + NS_ASSERTION(c2 < MAX_CLASSES, "illegal classes 2"); + + return (0 == ((gPairConservative[c1] >> c2) & 0x0001)); +} + +class ContextState { + public: + ContextState(const char16_t* aText, uint32_t aLength) + : mUniText(aText), mText(nullptr), mLength(aLength) { + Init(); + } + + ContextState(const uint8_t* aText, uint32_t aLength) + : mUniText(nullptr), mText(aText), mLength(aLength) { + Init(); + } + + uint32_t Length() const { return mLength; } + uint32_t Index() const { return mIndex; } + + // This gets a single code unit of the text, without checking for surrogates + // (in the case of a 16-bit text buffer). That's OK if we're only checking for + // specific characters that are known to be BMP values. + char16_t GetCodeUnitAt(uint32_t aIndex) const { + MOZ_ASSERT(aIndex < mLength, "Out of range!"); + return mUniText ? mUniText[aIndex] : char16_t(mText[aIndex]); + } + + // This gets a 32-bit Unicode character (codepoint), handling surrogate pairs + // as necessary. It must ONLY be called for 16-bit text, not 8-bit. 
+ char32_t GetUnicodeCharAt(uint32_t aIndex) const { + MOZ_ASSERT(mUniText, "Only for 16-bit text!"); + MOZ_ASSERT(aIndex < mLength, "Out of range!"); + char32_t c = mUniText[aIndex]; + if (aIndex + 1 < mLength && NS_IS_SURROGATE_PAIR(c, mUniText[aIndex + 1])) { + c = SURROGATE_TO_UCS4(c, mUniText[aIndex + 1]); + } + return c; + } + + void AdvanceIndex() { ++mIndex; } + + void NotifyBreakBefore() { mLastBreakIndex = mIndex; } + + // A word of western language should not be broken. But even if the word has + // only ASCII characters, non-natural context words should be broken, e.g., + // URL and file path. For protecting the natural words, we should use + // conservative breaking rules at following conditions: + // 1. at near the start of word + // 2. at near the end of word + // 3. at near the latest broken point + // CONSERVATIVE_RANGE_{LETTER,OTHER} define the 'near' in characters, + // which varies depending whether we are looking at a letter or a non-letter + // character: for non-letters, we use an extended "conservative" range. + +#define CONSERVATIVE_RANGE_LETTER 2 +#define CONSERVATIVE_RANGE_OTHER 6 + + bool UseConservativeBreaking(uint32_t aOffset = 0) const { + if (mHasCJKChar) return false; + uint32_t index = mIndex + aOffset; + + // If the character at index is a letter (rather than various punctuation + // characters, etc) then we want a shorter "conservative" range + uint32_t conservativeRangeStart, conservativeRangeEnd; + if (index < mLength && + nsUGenCategory::kLetter == + (mText ? GetGenCategory(mText[index]) + : GetGenCategory(GetUnicodeCharAt(index)))) { + // Primarily for hyphenated word prefixes/suffixes; we add 1 to Start + // to get more balanced behavior (if we break off a 2-letter prefix, + // that means the break will actually be three letters from start of + // word, to include the hyphen; whereas a 2-letter suffix will be + // broken only two letters from end of word). 
+ conservativeRangeEnd = CONSERVATIVE_RANGE_LETTER; + conservativeRangeStart = CONSERVATIVE_RANGE_LETTER + 1; + } else { + conservativeRangeEnd = conservativeRangeStart = CONSERVATIVE_RANGE_OTHER; + } + + bool result = (index < conservativeRangeStart || + mLength - index < conservativeRangeEnd || + index - mLastBreakIndex < conservativeRangeStart); + if (result || !mHasNonbreakableSpace) return result; + + // This text has no-breakable space, we need to check whether the index + // is near it. + + // Note that index is always larger than conservativeRange here. + for (uint32_t i = index; index - conservativeRangeStart < i; --i) { + if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i - 1))) return true; + } + // Note that index is always less than mLength - conservativeRange. + for (uint32_t i = index + 1; i < index + conservativeRangeEnd; ++i) { + if (IS_NONBREAKABLE_SPACE(GetCodeUnitAt(i))) return true; + } + return false; + } + + bool HasPreviousEqualsSign() const { return mHasPreviousEqualsSign; } + void NotifySeenEqualsSign() { mHasPreviousEqualsSign = true; } + + bool HasPreviousSlash() const { return mHasPreviousSlash; } + void NotifySeenSlash() { mHasPreviousSlash = true; } + + bool HasPreviousBackslash() const { return mHasPreviousBackslash; } + void NotifySeenBackslash() { mHasPreviousBackslash = true; } + + uint32_t GetPreviousNonHyphenCharacter() const { + return mPreviousNonHyphenCharacter; + } + void NotifyNonHyphenCharacter(uint32_t ch) { + mPreviousNonHyphenCharacter = ch; + } + + private: + void Init() { + mIndex = 0; + mLastBreakIndex = 0; + mPreviousNonHyphenCharacter = U_NULL; + mHasCJKChar = false; + mHasNonbreakableSpace = false; + mHasPreviousEqualsSign = false; + mHasPreviousSlash = false; + mHasPreviousBackslash = false; + + if (mText) { + // 8-bit text: we only need to check for   + for (uint32_t i = 0; i < mLength; ++i) { + if (IS_NONBREAKABLE_SPACE(mText[i])) { + mHasNonbreakableSpace = true; + break; + } + } + } else { + // 16-bit text: handle 
surrogates and check for CJK as well as   + for (uint32_t i = 0; i < mLength; ++i) { + char32_t u = GetUnicodeCharAt(i); + if (!mHasNonbreakableSpace && IS_NONBREAKABLE_SPACE(u)) { + mHasNonbreakableSpace = true; + if (mHasCJKChar) { + break; + } + } else if (!mHasCJKChar && IS_CJK_CHAR(u)) { + mHasCJKChar = true; + if (mHasNonbreakableSpace) { + break; + } + } + if (u > 0xFFFFu) { + ++i; // step over trailing low surrogate + } + } + } + } + + const char16_t* const mUniText; + const uint8_t* const mText; + + uint32_t mIndex; + const uint32_t mLength; // length of text + uint32_t mLastBreakIndex; + char32_t mPreviousNonHyphenCharacter; // The last character we have seen + // which is not U_HYPHEN + bool mHasCJKChar; // if the text has CJK character, this is true. + bool mHasNonbreakableSpace; // if the text has no-breakable space, + // this is true. + bool mHasPreviousEqualsSign; // True if we have seen a U_EQUAL + bool mHasPreviousSlash; // True if we have seen a U_SLASH + bool mHasPreviousBackslash; // True if we have seen a U_BACKSLASH +}; + +static int8_t ContextualAnalysis(char32_t prev, char32_t cur, char32_t next, + ContextState& aState, LineBreakRule aLevel, + bool aIsChineseOrJapanese) { + // Don't return CLASS_OPEN/CLASS_CLOSE if aState.UseJISX4051 is FALSE. + + if (IS_HYPHEN(cur)) { + // If next character is hyphen, we don't need to break between them. + if (IS_HYPHEN(next)) return CLASS_CHARACTER; + // If prev and next characters are numeric, it may be in Math context. + // So, we should not break here. + bool prevIsNum = IS_ASCII_DIGIT(prev); + bool nextIsNum = IS_ASCII_DIGIT(next); + if (prevIsNum && nextIsNum) return CLASS_NUMERIC; + // If one side is numeric and the other is a character, or if both sides are + // characters, the hyphen should be breakable. 
+ if (!aState.UseConservativeBreaking(1)) { + char32_t prevOfHyphen = aState.GetPreviousNonHyphenCharacter(); + if (prevOfHyphen && next) { + int8_t prevClass = GetClass(prevOfHyphen, aLevel, aIsChineseOrJapanese); + int8_t nextClass = GetClass(next, aLevel, aIsChineseOrJapanese); + bool prevIsNumOrCharOrClose = + prevIsNum || + (prevClass == CLASS_CHARACTER && + !NEED_CONTEXTUAL_ANALYSIS(prevOfHyphen)) || + prevClass == CLASS_CLOSE || prevClass == CLASS_CLOSE_LIKE_CHARACTER; + bool nextIsNumOrCharOrOpen = + nextIsNum || + (nextClass == CLASS_CHARACTER && !NEED_CONTEXTUAL_ANALYSIS(next)) || + nextClass == CLASS_OPEN || nextClass == CLASS_OPEN_LIKE_CHARACTER || + next == U_OPEN_SINGLE_QUOTE || next == U_OPEN_DOUBLE_QUOTE || + next == U_OPEN_GUILLEMET; + if (prevIsNumOrCharOrClose && nextIsNumOrCharOrOpen) { + return CLASS_CLOSE; + } + } + } + } else { + aState.NotifyNonHyphenCharacter(cur); + if (cur == U_SLASH || cur == U_BACKSLASH) { + // If this is immediately after same char, we should not break here. + if (prev == cur) return CLASS_CHARACTER; + // If this text has two or more (BACK)SLASHs, this may be file path or + // URL. Make sure to compute shouldReturn before we notify on this slash. + bool shouldReturn = !aState.UseConservativeBreaking() && + (cur == U_SLASH ? aState.HasPreviousSlash() + : aState.HasPreviousBackslash()); + + if (cur == U_SLASH) { + aState.NotifySeenSlash(); + } else { + aState.NotifySeenBackslash(); + } + + if (shouldReturn) return CLASS_OPEN; + } else if (cur == U_PERCENT) { + // If this is a part of the param of URL, we should break before. 
+ if (!aState.UseConservativeBreaking()) { + if (aState.Index() >= 3 && + aState.GetCodeUnitAt(aState.Index() - 3) == U_PERCENT) + return CLASS_OPEN; + if (aState.Index() + 3 < aState.Length() && + aState.GetCodeUnitAt(aState.Index() + 3) == U_PERCENT) + return CLASS_OPEN; + } + } else if (cur == U_AMPERSAND || cur == U_SEMICOLON) { + // If this may be a separator of params of URL, we should break after. + if (!aState.UseConservativeBreaking(1) && aState.HasPreviousEqualsSign()) + return CLASS_CLOSE; + } else if (cur == U_OPEN_SINGLE_QUOTE || cur == U_OPEN_DOUBLE_QUOTE || + cur == U_OPEN_GUILLEMET) { + // for CJK usage, we treat these as openers to allow a break before them, + // but otherwise treat them as normal characters because quote mark usage + // in various Western languages varies too much; see bug #450088 + // discussion. + if (!aState.UseConservativeBreaking() && IS_CJK_CHAR(next)) + return CLASS_OPEN; + } else { + NS_ERROR("Forgot to handle the current character!"); + } + } + return GetClass(cur, aLevel, aIsChineseOrJapanese); +} + +int32_t LineBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) { + MOZ_ASSERT(aText); + + if (aPos >= aLen) { + return NS_LINEBREAKER_NEED_MORE_TEXT; + } + + bool textNeedsComplexLineBreak = false; + int32_t begin, end; + + for (begin = aPos; begin > 0 && !NS_IsSpace(aText[begin - 1]); --begin) { + if (IS_CJK_CHAR(aText[begin]) || + NS_NeedsPlatformNativeHandling(aText[begin])) { + textNeedsComplexLineBreak = true; + } + } + for (end = aPos + 1; end < int32_t(aLen) && !NS_IsSpace(aText[end]); ++end) { + if (IS_CJK_CHAR(aText[end]) || NS_NeedsPlatformNativeHandling(aText[end])) { + textNeedsComplexLineBreak = true; + } + } + + int32_t ret; + if (!textNeedsComplexLineBreak) { + // No complex text character, do not try to do complex line break. + // (This is required for serializers. See Bug #344816.) 
+ ret = end; + } else { + AutoTArray breakState; + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + breakState.AppendElements(end - begin); + ComputeBreakPositions(aText + begin, end - begin, WordBreakRule::Normal, + LineBreakRule::Auto, false, breakState.Elements()); + + ret = aPos; + do { + ++ret; + } while (begin < ret && ret < end && !breakState[ret - begin]); + } + + return ret; +} + +static bool SuppressBreakForKeepAll(uint32_t aPrev, uint32_t aCh) { + auto affectedByKeepAll = [](uint8_t aLBClass) { + switch (aLBClass) { + // Per https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all: + // "implicit soft wrap opportunities between typographic letter units + // (or other typographic character units belonging to the NU, AL, AI, + // or ID Unicode line breaking classes [UAX14]) are suppressed..." + case U_LB_ALPHABETIC: + case U_LB_AMBIGUOUS: + case U_LB_NUMERIC: + case U_LB_IDEOGRAPHIC: + // Additional classes that should be treated similarly, but have been + // broken out as separate classes in newer Unicode versions: + case U_LB_H2: + case U_LB_H3: + case U_LB_JL: + case U_LB_JV: + case U_LB_JT: + case U_LB_CONDITIONAL_JAPANESE_STARTER: + return true; + default: + return false; + } + }; + return affectedByKeepAll(GetLineBreakClass(aPrev)) && + affectedByKeepAll(GetLineBreakClass(aCh)); +} + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +static capi::ICU4XLineBreakStrictness ConvertLineBreakRuleToICU4X( + LineBreakRule aLevel) { + switch (aLevel) { + case LineBreakRule::Auto: + return capi::ICU4XLineBreakStrictness_Strict; + case LineBreakRule::Strict: + return capi::ICU4XLineBreakStrictness_Strict; + case LineBreakRule::Loose: + return capi::ICU4XLineBreakStrictness_Loose; + case LineBreakRule::Normal: + return capi::ICU4XLineBreakStrictness_Normal; + case LineBreakRule::Anywhere: + return capi::ICU4XLineBreakStrictness_Anywhere; + } + MOZ_ASSERT_UNREACHABLE("should have been handled already"); + 
return capi::ICU4XLineBreakStrictness_Normal; +} + +static capi::ICU4XLineBreakWordOption ConvertWordBreakRuleToICU4X( + WordBreakRule aWordBreak) { + switch (aWordBreak) { + case WordBreakRule::Normal: + return capi::ICU4XLineBreakWordOption_Normal; + case WordBreakRule::BreakAll: + return capi::ICU4XLineBreakWordOption_BreakAll; + case WordBreakRule::KeepAll: + return capi::ICU4XLineBreakWordOption_KeepAll; + } + MOZ_ASSERT_UNREACHABLE("should have been handled already"); + return capi::ICU4XLineBreakWordOption_Normal; +} + +static capi::ICU4XLineSegmenter* sLineSegmenter = nullptr; + +static capi::ICU4XLineSegmenter* GetDefaultLineSegmenter() { + static std::once_flag sOnce; + + std::call_once(sOnce, [] { + auto result = capi::ICU4XLineSegmenter_create_auto(GetDataProvider()); + MOZ_ASSERT(result.is_ok); + sLineSegmenter = result.ok; + + if (NS_IsMainThread()) { + mozilla::RunOnShutdown([] { + if (sLineSegmenter) { + capi::ICU4XLineSegmenter_destroy(sLineSegmenter); + } + sLineSegmenter = nullptr; + }); + return; + } + NS_DispatchToMainThread( + NS_NewRunnableFunction("GetDefaultLineSegmenter", [] { + mozilla::RunOnShutdown([] { + if (sLineSegmenter) { + capi::ICU4XLineSegmenter_destroy(sLineSegmenter); + } + sLineSegmenter = nullptr; + }); + })); + }); + + return sLineSegmenter; +} + +static bool UseDefaultLineSegmenter(WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese) { + return aWordBreak == WordBreakRule::Normal && + (aLevel == LineBreakRule::Strict || aLevel == LineBreakRule::Auto) && + !aIsChineseOrJapanese; +} + +static capi::ICU4XLineSegmenter* GetLineSegmenter(bool aUseDefault, + WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese) { + if (aUseDefault) { + MOZ_ASSERT( + UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese)); + return GetDefaultLineSegmenter(); + } + + capi::ICU4XLineBreakOptionsV1 options; + options.word_option = ConvertWordBreakRuleToICU4X(aWordBreak); + 
options.strictness = ConvertLineBreakRuleToICU4X(aLevel); + options.ja_zh = aIsChineseOrJapanese; + + auto result = capi::ICU4XLineSegmenter_create_lstm_with_options_v1( + GetDataProvider(), options); + MOZ_ASSERT(result.is_ok); + return result.ok; +} +#endif + +void LineBreaker::ComputeBreakPositions( + const char16_t* aChars, uint32_t aLength, WordBreakRule aWordBreak, + LineBreakRule aLevel, bool aIsChineseOrJapanese, uint8_t* aBreakBefore) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + if (aLength == 1) { + // Although UAX#14 LB2 rule requires never breaking at the start of text + // (SOT), ICU4X line segmenter API is designed to match other segmenter in + // UAX#29 to always break at the start of text. Hence the optimization + // here to avoid calling into ICU4X line segmenter. + aBreakBefore[0] = 1; + return; + } + + memset(aBreakBefore, 0, aLength); + + CheckedInt length = aLength; + if (!length.isValid()) { + return; + } + + const bool useDefault = + UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); + capi::ICU4XLineSegmenter* lineSegmenter = + GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); + ICU4XLineBreakIteratorUtf16 iterator(capi::ICU4XLineSegmenter_segment_utf16( + lineSegmenter, (const uint16_t*)aChars, aLength)); + + while (true) { + const int32_t nextPos = iterator.next(); + if (nextPos < 0 || nextPos >= length.value()) { + break; + } + aBreakBefore[nextPos] = 1; + } + + if (!useDefault) { + capi::ICU4XLineSegmenter_destroy(lineSegmenter); + } + return; + } +#endif + + uint32_t cur; + int8_t lastClass = CLASS_NONE; + ContextState state(aChars, aLength); + + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { + char32_t ch = state.GetUnicodeCharAt(cur); + uint32_t chLen = ch > 0xFFFFu ? 
2 : 1; + int8_t cl; + + auto prev = [=]() -> char32_t { + if (!cur) { + return 0; + } + char32_t c = aChars[cur - 1]; + if (cur > 1 && NS_IS_SURROGATE_PAIR(aChars[cur - 2], c)) { + c = SURROGATE_TO_UCS4(aChars[cur - 2], c); + } + return c; + }; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + char32_t next; + if (cur + chLen < aLength) { + next = state.GetUnicodeCharAt(cur + chLen); + } else { + next = 0; + } + cl = ContextualAnalysis(prev(), ch, next, state, aLevel, + aIsChineseOrJapanese); + } else { + if (ch == U_EQUAL) state.NotifySeenEqualsSign(); + state.NotifyNonHyphenCharacter(ch); + cl = GetClass(ch, aLevel, aIsChineseOrJapanese); + } + + // To implement word-break:break-all, we overwrite the line-break class of + // alphanumeric characters so they are treated the same as ideographic. + // The relevant characters will have been assigned CLASS_CHARACTER, _CLOSE, + // _CLOSE_LIKE_CHARACTER, or _NUMERIC by GetClass(), but those classes also + // include others that we don't want to touch here, so we re-check the + // Unicode line-break class to determine which ones to modify. + if (aWordBreak == WordBreakRule::BreakAll && + (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || + cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) { + auto cls = GetLineBreakClass(ch); + if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC || + cls == U_LB_AMBIGUOUS || cls == U_LB_COMPLEX_CONTEXT || + /* Additional Japanese and Korean LB classes; CSS Text spec doesn't + explicitly mention these, but this appears to give expected + behavior (spec issue?) */ + cls == U_LB_CONDITIONAL_JAPANESE_STARTER || + (cls >= U_LB_H2 && cls <= U_LB_JV)) { + cl = CLASS_BREAKABLE; + } + } + + bool allowBreak = false; + if (cur > 0) { + NS_ASSERTION(CLASS_COMPLEX != lastClass || CLASS_COMPLEX != cl, + "Loop should have prevented adjacent complex chars here"); + allowBreak = + (state.UseConservativeBreaking() ? 
GetPairConservative(lastClass, cl) + : GetPair(lastClass, cl)); + // Special cases where a normally-allowed break is suppressed: + if (allowBreak) { + // word-break:keep-all suppresses breaks between certain line-break + // classes. + if (aWordBreak == WordBreakRule::KeepAll && + SuppressBreakForKeepAll(prev(), ch)) { + allowBreak = false; + } + // We also don't allow a break within a run of U+3000 chars unless + // word-break:break-all is in effect. + if (ch == 0x3000 && prev() == 0x3000 && + aWordBreak != WordBreakRule::BreakAll) { + allowBreak = false; + } + } + } + aBreakBefore[cur] = allowBreak; + if (allowBreak) state.NotifyBreakBefore(); + lastClass = cl; + if (CLASS_COMPLEX == cl) { + uint32_t end = cur + chLen; + + while (end < aLength) { + char32_t c = state.GetUnicodeCharAt(end); + if (CLASS_COMPLEX != GetClass(c, aLevel, false)) { + break; + } + ++end; + if (c > 0xFFFFU) { // it was a surrogate pair + ++end; + } + } + + if (aWordBreak == WordBreakRule::BreakAll) { + // For break-all, we don't need to run a dictionary-based breaking + // algorithm, we just allow breaks between all grapheme clusters. + GraphemeClusterBreakIteratorUtf16 ci( + Span(aChars + cur, end - cur)); + while (Maybe pos = ci.Next()) { + aBreakBefore[cur + *pos] = true; + } + } else { + ComplexBreaker::GetBreaks(aChars + cur, end - cur, aBreakBefore + cur); + // restore breakability at chunk begin, which was always set to false + // by the complex line breaker + aBreakBefore[cur] = allowBreak; + } + + cur = end - 1; + } + + if (chLen == 2) { + // Supplementary-plane character: mark that we cannot break before the + // trailing low surrogate, and advance past it. 
+ ++cur; + aBreakBefore[cur] = false; + state.AdvanceIndex(); + } + } +} + +void LineBreaker::ComputeBreakPositions(const uint8_t* aChars, uint32_t aLength, + WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese, + uint8_t* aBreakBefore) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + if (aLength == 1) { + // Although UAX#14 LB2 rule requires never breaking at the start of text + // (SOT), ICU4X line segmenter API is designed to match other segmenter in + // UAX#29 to always break at the start of text. Hence the optimization + // here to avoid calling into ICU4X line segmenter. + aBreakBefore[0] = 1; + return; + } + + memset(aBreakBefore, 0, aLength); + + CheckedInt length = aLength; + if (!length.isValid()) { + return; + } + + const bool useDefault = + UseDefaultLineSegmenter(aWordBreak, aLevel, aIsChineseOrJapanese); + capi::ICU4XLineSegmenter* lineSegmenter = + GetLineSegmenter(useDefault, aWordBreak, aLevel, aIsChineseOrJapanese); + ICU4XLineBreakIteratorLatin1 iterator( + capi::ICU4XLineSegmenter_segment_latin1( + lineSegmenter, (const uint8_t*)aChars, aLength)); + + while (true) { + const int32_t nextPos = iterator.next(); + if (nextPos < 0 || nextPos >= length.value()) { + break; + } + aBreakBefore[nextPos] = 1; + } + + if (!useDefault) { + capi::ICU4XLineSegmenter_destroy(lineSegmenter); + } + return; + } +#endif + + uint32_t cur; + int8_t lastClass = CLASS_NONE; + ContextState state(aChars, aLength); + + for (cur = 0; cur < aLength; ++cur, state.AdvanceIndex()) { + char32_t ch = aChars[cur]; + int8_t cl; + + if (NEED_CONTEXTUAL_ANALYSIS(ch)) { + cl = ContextualAnalysis(cur > 0 ? aChars[cur - 1] : U_NULL, ch, + cur + 1 < aLength ? 
aChars[cur + 1] : U_NULL, + state, aLevel, aIsChineseOrJapanese); + } else { + if (ch == U_EQUAL) state.NotifySeenEqualsSign(); + state.NotifyNonHyphenCharacter(ch); + cl = GetClass(ch, aLevel, aIsChineseOrJapanese); + } + if (aWordBreak == WordBreakRule::BreakAll && + (cl == CLASS_CHARACTER || cl == CLASS_CLOSE || + cl == CLASS_CLOSE_LIKE_CHARACTER || cl == CLASS_NUMERIC)) { + auto cls = GetLineBreakClass(ch); + // Don't need to check additional Japanese/Korean classes in 8-bit + if (cls == U_LB_ALPHABETIC || cls == U_LB_NUMERIC || + cls == U_LB_COMPLEX_CONTEXT) { + cl = CLASS_BREAKABLE; + } + } + + bool allowBreak = false; + if (cur > 0) { + allowBreak = + (state.UseConservativeBreaking() ? GetPairConservative(lastClass, cl) + : GetPair(lastClass, cl)) && + (aWordBreak != WordBreakRule::KeepAll || + !SuppressBreakForKeepAll(aChars[cur - 1], ch)); + } + aBreakBefore[cur] = allowBreak; + if (allowBreak) state.NotifyBreakBefore(); + lastClass = cl; + } +} diff --git a/intl/lwbrk/LineBreaker.h b/intl/lwbrk/LineBreaker.h new file mode 100644 index 0000000000..a2d7377474 --- /dev/null +++ b/intl/lwbrk/LineBreaker.h @@ -0,0 +1,82 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef mozilla_intl_LineBreaker_h__ +#define mozilla_intl_LineBreaker_h__ + +#include + +#define NS_LINEBREAKER_NEED_MORE_TEXT -1 + +namespace mozilla { +namespace intl { +enum class LineBreakRule : uint8_t; +enum class WordBreakRule : uint8_t; + +class LineBreaker final { + public: + // LineBreaker is a utility class with only static methods. No need to + // instantiate it. + LineBreaker() = delete; + ~LineBreaker() = delete; + + // Find the next line break opportunity starting from aPos + 1. 
It can return + // aLen if there's no break opportunity between [aPos + 1, aLen - 1]. + // + // If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return + // NS_LINEBREAKER_NEED_MORE_TEXT. + // + // DEPRECATED: Use LineBreakIteratorUtf16 instead. + static int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos); + + // Call this on a word with whitespace at either end. We will apply JISx4051 + // rules to find breaks inside the word. aBreakBefore is set to the break- + // before status of each character; aBreakBefore[0] will always be false + // because we never return a break before the first character. + // aLength is the length of the aText array and also the length of the + // aBreakBefore output array. + static void ComputeBreakPositions(const char16_t* aText, uint32_t aLength, + WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese, + uint8_t* aBreakBefore); + static void ComputeBreakPositions(const uint8_t* aText, uint32_t aLength, + WordBreakRule aWordBreak, + LineBreakRule aLevel, + bool aIsChineseOrJapanese, + uint8_t* aBreakBefore); +}; + +static inline bool NS_IsSpace(char16_t u) { + return u == 0x0020 || // SPACE + u == 0x0009 || // CHARACTER TABULATION + u == 0x000D || // CARRIAGE RETURN + (0x2000 <= u && u <= 0x2006) || // EN QUAD, EM QUAD, EN SPACE, + // EM SPACE, THREE-PER-EM SPACE, + // FOUR-PER-SPACE, SIX-PER-EM SPACE, + (0x2008 <= u && u <= 0x200B) || // PUNCTUATION SPACE, THIN SPACE, + // HAIR SPACE, ZERO WIDTH SPACE + u == 0x1361 || // ETHIOPIC WORDSPACE + u == 0x1680 || // OGHAM SPACE MARK + u == 0x205F; // MEDIUM MATHEMATICAL SPACE +} + +static inline bool NS_NeedsPlatformNativeHandling(char16_t aChar) { + return +#if ANDROID || XP_WIN // Bug 1647377/1736393: no "platform native" support for + // Tibetan; better to just use our class-based breaker. 
+ (0x0e01 <= aChar && aChar <= 0x0eff) || // Thai, Lao +#else + // Routing Tibetan to the platform-native breaker currently results in + // WPT failures in a few css3-text-line-break-opclns-* testcases that mix + // a Tibetan character with other-script context. + (0x0e01 <= aChar && aChar <= 0x0fff) || // Thai, Lao, Tibetan +#endif + (0x1780 <= aChar && aChar <= 0x17ff); // Khmer +} + +} // namespace intl +} // namespace mozilla + +#endif /* mozilla_intl_LineBreaker_h__ */ diff --git a/intl/lwbrk/Segmenter.cpp b/intl/lwbrk/Segmenter.cpp new file mode 100644 index 0000000000..8cfd179366 --- /dev/null +++ b/intl/lwbrk/Segmenter.cpp @@ -0,0 +1,545 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Classes to iterate over grapheme, word, sentence, or line. 
*/ + +#include "mozilla/intl/Segmenter.h" + +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/WordBreaker.h" +#include "mozilla/intl/UnicodeProperties.h" +#include "mozilla/StaticPrefs_intl.h" +#include "nsUnicodeProperties.h" +#include "nsCharTraits.h" + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XGraphemeClusterSegmenter.h" +# include "ICU4XLineSegmenter.h" +# include "ICU4XSentenceSegmenter.h" +# include "ICU4XWordSegmenter.h" +# include "mozilla/ClearOnShutdown.h" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +# include "nsThreadUtils.h" + +# include +#endif + +using namespace mozilla::unicode; + +namespace mozilla::intl { + +SegmentIteratorUtf16::SegmentIteratorUtf16(Span aText) + : mText(aText) {} + +Maybe SegmentIteratorUtf16::Seek(uint32_t aPos) { + if (mPos < aPos) { + mPos = aPos; + } + return Next(); +} + +LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span aText, + const LineBreakOptions& aOptions) + : SegmentIteratorUtf16(aText), mOptions(aOptions) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = + capi::ICU4XLineSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XLineSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +LineBreakIteratorUtf16::~LineBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XLineBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XLineSegmenter_destroy(mSegmenter); + } +#endif +} + +Maybe LineBreakIteratorUtf16::Next() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } 
+ mPos = nextPos; + return Some(mPos); + } +#endif + const int32_t nextPos = + LineBreaker::Next(mText.Elements(), mText.Length(), mPos); + if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { + return Nothing(); + } + mPos = nextPos; + return Some(mPos); +} + +Maybe LineBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = capi::ICU4XLineBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + +WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span aText) + : SegmentIteratorUtf16(aText) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + auto result = + capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XWordSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +WordBreakIteratorUtf16::~WordBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XWordBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XWordSegmenter_destroy(mSegmenter); + } +#endif +} + +Maybe WordBreakIteratorUtf16::Next() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif + const int32_t nextPos = + WordBreaker::Next(mText.Elements(), mText.Length(), mPos); + if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { + return Nothing(); + } + mPos = 
nextPos; + return Some(mPos); +} + +Maybe WordBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = capi::ICU4XWordBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +capi::ICU4XGraphemeClusterSegmenter* + GraphemeClusterBreakIteratorUtf16::sSegmenter = nullptr; +#endif + +GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( + Span aText) + : SegmentIteratorUtf16(aText) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (!StaticPrefs::intl_icu4x_segmenter_enabled()) { + return; + } + static std::once_flag sOnce; + + std::call_once(sOnce, [] { + auto result = capi::ICU4XGraphemeClusterSegmenter_create( + mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + sSegmenter = result.ok; + + NS_DispatchToMainThread( + NS_NewRunnableFunction("GraphemeClusterBreakIteratorUtf16", [] { + RunOnShutdown([] { + capi::ICU4XGraphemeClusterSegmenter_destroy(sSegmenter); + sSegmenter = nullptr; + }); + })); + }); + + MOZ_RELEASE_ASSERT(sSegmenter); + mIterator = capi::ICU4XGraphemeClusterSegmenter_segment_utf16( + sSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +#endif +} + +GraphemeClusterBreakIteratorUtf16::~GraphemeClusterBreakIteratorUtf16() { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + capi::ICU4XGraphemeClusterBreakIteratorUtf16_destroy(mIterator); + } +#endif +} + +enum HSType { + HST_NONE = U_HST_NOT_APPLICABLE, + HST_L = U_HST_LEADING_JAMO, + HST_V = U_HST_VOWEL_JAMO, + HST_T = U_HST_TRAILING_JAMO, + HST_LV = U_HST_LV_SYLLABLE, + HST_LVT = U_HST_LVT_SYLLABLE +}; + +static HSType 
GetHangulSyllableType(uint32_t aCh) { + return HSType(UnicodeProperties::GetIntPropertyValue( + aCh, UnicodeProperties::IntProperty::HangulSyllableType)); +} + +Maybe GraphemeClusterBreakIteratorUtf16::Next() { + const auto len = mText.Length(); +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + const int32_t nextPos = + capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); + } +#endif + if (mPos >= len) { + // The iterator has already reached the end. + return Nothing(); + } + + uint32_t ch = mText[mPos++]; + + if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) { + ch = SURROGATE_TO_UCS4(ch, mText[mPos++]); + } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || + (ch >= 0xac00 && ch <= 0xd7ff)) { + // Handle conjoining Jamo that make Hangul syllables + HSType hangulState = GetHangulSyllableType(ch); + while (mPos < len) { + ch = mText[mPos]; + HSType hangulType = GetHangulSyllableType(ch); + switch (hangulType) { + case HST_L: + case HST_LV: + case HST_LVT: + if (hangulState == HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_V: + if ((hangulState != HST_NONE) && (hangulState != HST_T) && + (hangulState != HST_LVT)) { + hangulState = hangulType; + mPos++; + continue; + } + break; + case HST_T: + if (hangulState != HST_NONE && hangulState != HST_L) { + hangulState = hangulType; + mPos++; + continue; + } + break; + default: + break; + } + break; + } + } + + const uint32_t kVS16 = 0xfe0f; + const uint32_t kZWJ = 0x200d; + // UTF-16 surrogate values for Fitzpatrick type modifiers + const uint32_t kFitzpatrickHigh = 0xD83C; + const uint32_t kFitzpatrickLowFirst = 0xDFFB; + const uint32_t kFitzpatrickLowLast = 0xDFFF; + + // Checking the emoji-presentation property of the base character is a bit + // expensive, so we do it lazily. 
+ enum class EmojiStatus : uint8_t { + No, + Yes, + Unknown, + } baseIsEmojiStatus = EmojiStatus::Unknown; + + // Remember the base character and the position of the next, in case we need + // to evaluate its emoji status. + uint32_t baseCh = ch; + uint32_t afterBase = mPos; + + auto isFitzpatrickModifierAt = [&](uint32_t aPos) -> bool { + return aPos + 1 < len && mText[aPos] == kFitzpatrickHigh && + mText[aPos + 1] >= kFitzpatrickLowFirst && + mText[aPos + 1] <= kFitzpatrickLowLast; + }; + + auto baseIsEmoji = [&]() -> bool { + if (baseIsEmojiStatus == EmojiStatus::Unknown) { + auto basePresentation = GetEmojiPresentation(baseCh); + baseIsEmojiStatus = + basePresentation == EmojiDefault || + (basePresentation == TextDefault && + ((afterBase < len && mText[afterBase] == kVS16) || + isFitzpatrickModifierAt(afterBase))) + ? EmojiStatus::Yes + : EmojiStatus::No; + } + return baseIsEmojiStatus == EmojiStatus::Yes; + }; + + bool prevWasZwj = false; + + while (mPos < len) { + ch = mText[mPos]; + size_t chLen = 1; + + // Check for surrogate pairs; note that isolated surrogates will just + // be treated as generic (non-cluster-extending) characters here, + // which is fine for cluster-iterating purposes + if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) { + ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]); + chLen = 2; + } + + bool extendCluster = + IsClusterExtender(ch) || + (prevWasZwj && baseIsEmoji() && + ((GetEmojiPresentation(ch) == EmojiDefault) || + (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len && + mText[mPos + chLen] == kVS16))); + if (!extendCluster) { + break; + } + + prevWasZwj = (ch == kZWJ); + mPos += chLen; + } + + MOZ_ASSERT(mPos <= len, "Next() has overshot the string!"); + return Some(mPos); +} + +Maybe GraphemeClusterBreakIteratorUtf16::Seek(uint32_t aPos) { +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (mIterator) { + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = + 
capi::ICU4XGraphemeClusterBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return Some(mPos); + } + + return Next(); + } +#endif + return SegmentIteratorUtf16::Seek(aPos); +} + +GraphemeClusterBreakReverseIteratorUtf16:: + GraphemeClusterBreakReverseIteratorUtf16(Span aText) + : SegmentIteratorUtf16(aText) { + mPos = mText.Length(); +} + +Maybe GraphemeClusterBreakReverseIteratorUtf16::Next() { + if (mPos == 0) { + return Nothing(); + } + + uint32_t ch; + do { + ch = mText[--mPos]; + + if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) { + ch = SURROGATE_TO_UCS4(mText[--mPos], ch); + } + + if (!IsClusterExtender(ch)) { + break; + } + } while (mPos > 0); + + // XXX May need to handle conjoining Jamo + + return Some(mPos); +} + +Maybe GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) { + if (mPos > aPos) { + mPos = aPos; + } + return Next(); +} + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +SentenceBreakIteratorUtf16::SentenceBreakIteratorUtf16( + Span aText) + : SegmentIteratorUtf16(aText) { + auto result = + capi::ICU4XSentenceSegmenter_create(mozilla::intl::GetDataProvider()); + MOZ_RELEASE_ASSERT(result.is_ok); + mSegmenter = result.ok; + mIterator = capi::ICU4XSentenceSegmenter_segment_utf16( + mSegmenter, (const uint16_t*)mText.Elements(), mText.Length()); +} + +SentenceBreakIteratorUtf16::~SentenceBreakIteratorUtf16() { + if (mIterator) { + capi::ICU4XSentenceBreakIteratorUtf16_destroy(mIterator); + } + if (mSegmenter) { + capi::ICU4XSentenceSegmenter_destroy(mSegmenter); + } +} + +Maybe SentenceBreakIteratorUtf16::Seek(uint32_t aPos) { + if (!mIterator) { + return Nothing(); + } + + if (mPos >= aPos) { + return Next(); + } + + while (mPos < aPos) { + const int32_t nextPos = + capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + mPos = static_cast(nextPos); + } + + if (aPos < mPos) { + return 
Some(mPos); + } + + return Next(); +} + +Maybe SentenceBreakIteratorUtf16::Next() { + if (!mIterator) { + return Nothing(); + } + + const int32_t nextPos = capi::ICU4XSentenceBreakIteratorUtf16_next(mIterator); + if (nextPos < 0) { + return Nothing(); + } + if (!nextPos) { + return Next(); + } + mPos = nextPos; + return Some(mPos); +} +#endif + +Result, ICUError> Segmenter::TryCreate( + Span aLocale, const SegmenterOptions& aOptions) { +#if !defined(MOZ_ICU4X) || !defined(JS_HAS_INTL_API) + if (aOptions.mGranularity == SegmenterGranularity::Sentence) { + // The sentence iterator is not implemented without ICU4X. + return Err(ICUError::InternalError); + } +#endif + return MakeUnique(aLocale, aOptions); +} + +UniquePtr Segmenter::Segment( + Span aText) const { + switch (mOptions.mGranularity) { + case SegmenterGranularity::Grapheme: + return MakeUnique(aText); + case SegmenterGranularity::Sentence: +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + return MakeUnique(aText); + } +#endif + MOZ_ASSERT_UNREACHABLE("Unimplemented yet!"); + return nullptr; + case SegmenterGranularity::Word: + return MakeUnique(aText); + case SegmenterGranularity::Line: + return MakeUnique(aText); + } + MOZ_ASSERT_UNREACHABLE("All granularities must be handled!"); + return nullptr; +} + +} // namespace mozilla::intl diff --git a/intl/lwbrk/Segmenter.h b/intl/lwbrk/Segmenter.h new file mode 100644 index 0000000000..a3233dc8ed --- /dev/null +++ b/intl/lwbrk/Segmenter.h @@ -0,0 +1,231 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/* Classes to iterate over grapheme, word, sentence, or line.
*/ + +#ifndef intl_components_Segmenter_h_ +#define intl_components_Segmenter_h_ + +#include "mozilla/intl/ICUError.h" +#include "mozilla/Maybe.h" +#include "mozilla/Result.h" +#include "mozilla/Span.h" +#include "mozilla/UniquePtr.h" + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +namespace capi { +struct ICU4XLineSegmenter; +struct ICU4XLineBreakIteratorUtf16; +struct ICU4XWordSegmenter; +struct ICU4XWordBreakIteratorUtf16; +struct ICU4XGraphemeClusterSegmenter; +struct ICU4XGraphemeClusterBreakIteratorUtf16; +struct ICU4XSentenceSegmenter; +struct ICU4XSentenceBreakIteratorUtf16; +} // namespace capi +#endif + +namespace mozilla::intl { + +enum class SegmenterGranularity : uint8_t { + Grapheme, + Word, + Sentence, + Line, +}; + +struct SegmenterOptions final { + SegmenterGranularity mGranularity = SegmenterGranularity::Grapheme; +}; + +/** + * Interface of segment iterators. Subclass this class to implement iterator for + * UTF-16 text. + */ +class SegmentIteratorUtf16 { + public: + virtual ~SegmentIteratorUtf16() = default; + + // Disable copy or move semantics. Move semantic could be enabled in the + // future if needed. + SegmentIteratorUtf16(SegmentIteratorUtf16&&) = delete; + SegmentIteratorUtf16& operator=(SegmentIteratorUtf16&&) = delete; + SegmentIteratorUtf16(const SegmentIteratorUtf16&) = delete; + SegmentIteratorUtf16& operator=(const SegmentIteratorUtf16&) = delete; + + /** + * Advance the iterator to the next break position. + * + * @return the break position. If there's no further break position, return + * Nothing(). + */ + virtual Maybe Next() = 0; + + /** + * Advance the iterator to the first break position following the specified + * position aPos. + * + * Note: if this iterator's current position is already >= aPos, this method + * behaves the same as Next(). + */ + virtual Maybe Seek(uint32_t aPos); + + protected: + explicit SegmentIteratorUtf16(Span aText); + + // The text to iterate over. 
+ Span mText; + + // The current break position within mText. + uint32_t mPos = 0; +}; + +// Each enum value has the same meaning with respect to the `word-break` +// property values in the CSS Text spec. See the details in +// https://drafts.csswg.org/css-text-3/#word-break-property +enum class WordBreakRule : uint8_t { + Normal = 0, + BreakAll, + KeepAll, +}; + +// Each enum value has the same meaning with respect to the `line-break` +// property values in the CSS Text spec. See the details in +// https://drafts.csswg.org/css-text-3/#line-break-property. +enum class LineBreakRule : uint8_t { + Auto = 0, + Loose, + Normal, + Strict, + Anywhere, +}; + +// Extra options for line break iterator. +struct LineBreakOptions final { + WordBreakRule mWordBreakRule = WordBreakRule::Normal; + LineBreakRule mLineBreakRule = LineBreakRule::Auto; + bool mScriptIsChineseOrJapanese = false; +}; + +/** + * Line break iterator for UTF-16 text. + */ +class LineBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit LineBreakIteratorUtf16(Span aText, + const LineBreakOptions& aOptions = {}); + ~LineBreakIteratorUtf16() override; + + Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + + private: + LineBreakOptions mOptions; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + capi::ICU4XLineSegmenter* mSegmenter = nullptr; + capi::ICU4XLineBreakIteratorUtf16* mIterator = nullptr; +#endif +}; + +/** + * Word break iterator for UTF-16 text. + */ +class WordBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit WordBreakIteratorUtf16(Span aText); + ~WordBreakIteratorUtf16() override; + + Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + private: + capi::ICU4XWordSegmenter* mSegmenter = nullptr; + capi::ICU4XWordBreakIteratorUtf16* mIterator = nullptr; +#endif +}; + +/** + * Grapheme cluster break iterator for UTF-16 text.
+ */ +class GraphemeClusterBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit GraphemeClusterBreakIteratorUtf16(Span aText); + ~GraphemeClusterBreakIteratorUtf16() override; + + Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + private: + static capi::ICU4XGraphemeClusterSegmenter* sSegmenter; + capi::ICU4XGraphemeClusterBreakIteratorUtf16* mIterator = nullptr; +#endif +}; + +/** + * Grapheme cluster break reverse iterator for UTF-16 text. + * + * Note: The reverse iterator doesn't handle conjoining Jamo and emoji. Use it + * at your own risk. + */ +class GraphemeClusterBreakReverseIteratorUtf16 final + : public SegmentIteratorUtf16 { + public: + explicit GraphemeClusterBreakReverseIteratorUtf16(Span aText); + + Maybe Next() override; + Maybe Seek(uint32_t aPos) override; +}; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +/** + * Sentence break iterator for UTF-16 text. + */ +class SentenceBreakIteratorUtf16 final : public SegmentIteratorUtf16 { + public: + explicit SentenceBreakIteratorUtf16(Span aText); + ~SentenceBreakIteratorUtf16() override; + + Maybe Next() override; + Maybe Seek(uint32_t aPos) override; + + private: + capi::ICU4XSentenceSegmenter* mSegmenter = nullptr; + capi::ICU4XSentenceBreakIteratorUtf16* mIterator = nullptr; +}; +#endif + +/** + * This component is a Mozilla-focused API for working with segmenters in + * internationalization code. + * + * This is a factory class. Call Segment() to create an iterator over a text + * of the given granularity. + */ +class Segmenter final { + public: + // NOTE: aLocale is a no-op currently. + static Result, ICUError> TryCreate( + Span aLocale, const SegmenterOptions& aOptions); + + explicit Segmenter(Span aLocale, const SegmenterOptions& aOptions) + : mOptions(aOptions) {} + + // Creates an iterator over aText of a given granularity in mOptions. 
+ UniquePtr Segment(Span aText) const; + + // TODO: Implement an iterator for Latin1 text. + // UniquePtr Segment(Span aText) const; + + private: + SegmenterOptions mOptions; +}; + +} // namespace mozilla::intl + +#endif diff --git a/intl/lwbrk/WordBreaker.cpp b/intl/lwbrk/WordBreaker.cpp new file mode 100644 index 0000000000..024bdbbb1c --- /dev/null +++ b/intl/lwbrk/WordBreaker.cpp @@ -0,0 +1,249 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "mozilla/CheckedInt.h" +#include "mozilla/intl/UnicodeProperties.h" +#include "mozilla/intl/WordBreaker.h" +#include "mozilla/StaticPrefs_layout.h" +#include "nsComplexBreaker.h" +#include "nsTArray.h" +#include "nsUnicodeProperties.h" + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) +# include "ICU4XDataProvider.h" +# include "ICU4XWordBreakIteratorUtf16.hpp" +# include "ICU4XWordSegmenter.hpp" +# include "mozilla/intl/ICU4XGeckoDataProvider.h" +# include "mozilla/StaticPrefs_intl.h" +# include "nsUnicharUtils.h" +#endif + +using mozilla::intl::Script; +using mozilla::intl::UnicodeProperties; +using mozilla::intl::WordBreaker; +using mozilla::intl::WordRange; +using mozilla::unicode::GetGenCategory; + +#define ASCII_IS_ALPHA(c) \ + ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z'))) +#define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9')) +#define ASCII_IS_SPACE(c) \ + ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c))) +#define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80) + +// we change the beginning of IS_HAN from 0x4e00 to 0x3400 to reflect +// Unicode 3.0 +#define IS_HAN(c) \ + (((0x3400 <= (c)) && ((c) <= 0x9fff)) || ((0xf900 <= (c)) && ((c) <= 0xfaff))) +#define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF)) +#define IS_HIRAGANA(c) 
((0x3040 <= (c)) && ((c) <= 0x309F)) +#define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F)) + +// Return true if aChar belongs to a SEAsian script that is written without +// word spaces, so we need to use the "complex breaker" to find possible word +// boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua) +// (How well this works depends on the level of platform support for finding +// possible line breaks - or possible word boundaries - in the particular +// script. Thai, at least, works pretty well on the major desktop OSes. If +// the script is not supported by the platform, we just won't find any useful +// boundaries.) +static bool IsScriptioContinua(char16_t aChar) { + Script sc = UnicodeProperties::GetScriptCode(aChar); + return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER || + sc == Script::JAVANESE || sc == Script::BALINESE || + sc == Script::SUNDANESE || sc == Script::LAO; +} + +/* static */ +WordBreaker::WordBreakClass WordBreaker::GetClass(char16_t c) { + // begin of the hack + + if (IS_ALPHABETICAL_SCRIPT(c)) { + if (IS_ASCII(c)) { + if (ASCII_IS_SPACE(c)) { + return kWbClassSpace; + } + if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) || + (c == '_' && !StaticPrefs::layout_word_select_stop_at_underscore())) { + return kWbClassAlphaLetter; + } + return kWbClassPunct; + } + if (c == 0x00A0 /*NBSP*/) { + return kWbClassSpace; + } + if (GetGenCategory(c) == nsUGenCategory::kPunctuation) { + return kWbClassPunct; + } + if (IsScriptioContinua(c)) { + return kWbClassScriptioContinua; + } + return kWbClassAlphaLetter; + } + if (IS_HAN(c)) { + return kWbClassHanLetter; + } + if (IS_KATAKANA(c)) { + return kWbClassKatakanaLetter; + } + if (IS_HIRAGANA(c)) { + return kWbClassHiraganaLetter; + } + if (IS_HALFWIDTHKATAKANA(c)) { + return kWbClassHWKatakanaLetter; + } + if (GetGenCategory(c) == nsUGenCategory::kPunctuation) { + return kWbClassPunct; + } + if (IsScriptioContinua(c)) { + return kWbClassScriptioContinua; + } + 
return kWbClassAlphaLetter; +} + +WordRange WordBreaker::FindWord(const nsAString& aText, uint32_t aPos, + const FindWordOptions aOptions) { + const CheckedInt len = aText.Length(); + MOZ_RELEASE_ASSERT(len.isValid()); + + if (aPos >= len.value()) { + return {len.value(), len.value()}; + } + + WordRange range{0, len.value()}; + +#if defined(MOZ_ICU4X) && defined(JS_HAS_INTL_API) + if (StaticPrefs::intl_icu4x_segmenter_enabled()) { + auto result = + capi::ICU4XWordSegmenter_create_auto(mozilla::intl::GetDataProvider()); + MOZ_ASSERT(result.is_ok); + ICU4XWordSegmenter segmenter(result.ok); + ICU4XWordBreakIteratorUtf16 iterator = + segmenter.segment_utf16(diplomat::span( + (const uint16_t*)aText.BeginReading(), aText.Length())); + + uint32_t previousPos = 0; + while (true) { + const int32_t nextPos = iterator.next(); + if (nextPos < 0) { + range.mBegin = previousPos; + range.mEnd = len.value(); + break; + } + if ((uint32_t)nextPos > aPos) { + range.mBegin = previousPos; + range.mEnd = (uint32_t)nextPos; + break; + } + + previousPos = nextPos; + } + + if (aOptions != FindWordOptions::StopAtPunctuation) { + return range; + } + + for (uint32_t i = range.mBegin; i < range.mEnd; i++) { + if (mozilla::IsPunctuationForWordSelect(aText[i])) { + if (i > aPos) { + range.mEnd = i; + break; + } + if (i == aPos) { + range.mBegin = i; + range.mEnd = i + 1; + break; + } + if (i < aPos) { + range.mBegin = i + 1; + } + } + } + + return range; + } +#endif + + WordBreakClass c = GetClass(aText[aPos]); + + // Scan forward + for (uint32_t i = aPos + 1; i < len.value(); i++) { + if (c != GetClass(aText[i])) { + range.mEnd = i; + break; + } + } + + // Scan backward + for (uint32_t i = aPos; i > 0; i--) { + if (c != GetClass(aText[i - 1])) { + range.mBegin = i; + break; + } + } + + if (kWbClassScriptioContinua == c) { + // we pass the whole text segment to the complex word breaker to find a + // shorter answer + AutoTArray breakBefore; + breakBefore.SetLength(range.mEnd - range.mBegin); + 
ComplexBreaker::GetBreaks(aText.BeginReading() + range.mBegin, + range.mEnd - range.mBegin, + breakBefore.Elements()); + + // Scan forward + for (uint32_t i = aPos + 1; i < range.mEnd; i++) { + if (breakBefore[i - range.mBegin]) { + range.mEnd = i; + break; + } + } + + // Scan backward + for (uint32_t i = aPos; i > range.mBegin; i--) { + if (breakBefore[i - range.mBegin]) { + range.mBegin = i; + break; + } + } + } + return range; +} + +int32_t WordBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) { + MOZ_ASSERT(aText); + + if (aPos >= aLen) { + return NS_WORDBREAKER_NEED_MORE_TEXT; + } + + const WordBreakClass posClass = GetClass(aText[aPos]); + uint32_t nextBreakPos; + for (nextBreakPos = aPos + 1; nextBreakPos < aLen; ++nextBreakPos) { + if (posClass != GetClass(aText[nextBreakPos])) { + break; + } + } + + if (kWbClassScriptioContinua == posClass) { + // We pass the same-class run [aPos, nextBreakPos) to the complex word + // breaker to find a shorter answer; it must not extend past aLen. + const char16_t* segStart = aText + aPos; + const uint32_t segLen = nextBreakPos - aPos; + AutoTArray breakBefore; + breakBefore.SetLength(segLen); + ComplexBreaker::GetBreaks(segStart, segLen, breakBefore.Elements()); + + for (uint32_t i = aPos + 1; i < nextBreakPos; ++i) { + if (breakBefore[i - aPos]) { + nextBreakPos = i; + break; + } + } + } + + MOZ_ASSERT(nextBreakPos != aPos); + return nextBreakPos; +} diff --git a/intl/lwbrk/WordBreaker.h b/intl/lwbrk/WordBreaker.h new file mode 100644 index 0000000000..88ccf1a380 --- /dev/null +++ b/intl/lwbrk/WordBreaker.h @@ -0,0 +1,72 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef mozilla_intl_WordBreaker_h__ +#define mozilla_intl_WordBreaker_h__ + +#include "nsStringFwd.h" +#include + +#define NS_WORDBREAKER_NEED_MORE_TEXT -1 + +namespace mozilla { +namespace intl { + +struct WordRange { + uint32_t mBegin; + uint32_t mEnd; +}; + +class WordBreaker final { + public: + // WordBreaker is a utility class with only static methods. No need to + // instantiate it. + WordBreaker() = delete; + ~WordBreaker() = delete; + + // Find the word boundary by scanning forward and backward from aPos. + // + // @return WordRange where mBegin is the offset of the first character in + // the word and mEnd is the offset of the last character plus 1. mEnd + // can be aText.Length() if the desired word is at the end of aText. + // + // If aPos is already at the end of aText or beyond, both mBegin and mEnd + // equal aText.Length(). + // + // If setting StopAtPunctuation, even if using UAX#29 word segmenter rule, + // there will be break opportunities on characters with punctuation class. + enum class FindWordOptions { None, StopAtPunctuation }; + + static WordRange FindWord( + const nsAString& aText, uint32_t aPos, + const FindWordOptions aOptions = FindWordOptions::None); + + // Find the next word break opportunity starting from aPos + 1. It can return + // aLen if there's no break opportunity between [aPos + 1, aLen - 1]. + // + // If aPos is already at the end of aText or beyond, i.e. aPos >= aLen, return + // NS_WORDBREAKER_NEED_MORE_TEXT. + // + // DEPRECATED: Use WordBreakIteratorUtf16 instead. 
+ static int32_t Next(const char16_t* aText, uint32_t aLen, uint32_t aPos); + + private: + enum WordBreakClass : uint8_t { + kWbClassSpace = 0, + kWbClassAlphaLetter, + kWbClassPunct, + kWbClassHanLetter, + kWbClassKatakanaLetter, + kWbClassHiraganaLetter, + kWbClassHWKatakanaLetter, + kWbClassScriptioContinua + }; + + static WordBreakClass GetClass(char16_t aChar); +}; + +} // namespace intl +} // namespace mozilla + +#endif /* mozilla_intl_WordBreaker_h__ */ diff --git a/intl/lwbrk/crashtests/416721.html b/intl/lwbrk/crashtests/416721.html new file mode 100644 index 0000000000..0a6625ba8a --- /dev/null +++ b/intl/lwbrk/crashtests/416721.html @@ -0,0 +1,11 @@ + + + + Testcase for bug 416721 + + + +

กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛

+ + + diff --git a/intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html b/intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html new file mode 100644 index 0000000000..b8958e474e --- /dev/null +++ b/intl/lwbrk/crashtests/Lo_test_page_no_uniscribe_breaks.html @@ -0,0 +1,12 @@ + + +Lao - test page no breaks + + +

The text below does not produce any breaks with the Uniscribe breaker and is longer than the test buffer length used for brokering.

+ +
ການຮັບຮູ້ກຽດຕິສັກອັນມີປະຈຳຢູ່ຕົວບຸກຄົນໃນວົງສະກຸນຂອງມະນຸດທຸກໆຄົນການຮັບຮູ້ກຽດຕິສັກອັນມີປະຈຳຢູ່ຕົວບຸກຄົນໃນວົງສະກຸນຂອງມະນຸດທຸກໆຄົນ
diff --git a/intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html b/intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html new file mode 100644 index 0000000000..bd483d93c5 --- /dev/null +++ b/intl/lwbrk/crashtests/UDHR_Thai_test_page_long_sequences.html @@ -0,0 +1,184 @@ + + + +UDHR - Thai - test page for bug 1713973 + + + +

Universal Declaration of Human Rights - Thai

+

© 1996 – 2009 The Office of the High Commissioner for Human Rights

+

This HTML version prepared by the UDHR in Unicode project, http://www.unicode.org/udhr.

+
+

NOTE: Spaces within Thai content were removed for testing purposes.

+
+

ปฏิญญาสากลว่าด้วยสิทธิมนุษยชน

+

ได้รับการรับรองและประกาศโดยข้อมติสมัชชาสหประชาชาติที่ 217 เอ (III) วันที่ 10 ธันวาคม พ.ศ. 2491

+

อารัมภบท

+

โดยที่การยอมรับศักดิ์ศรีแต่กำเนิดและสิทธิที่เท่าเทียมกันและที่ไม่อาจ +เพิกถอนได้ของสมาชิกทั้งมวลแห่งครอบครัวมนุษยชาติเป็นพื้นฐานแห่งอิสรภาพ +ความยุติธรรมและสันติภาพในโลก

+

โดยที่การไม่นำพาและการหมิ่นในคุณค่าของสิทธิมนุษยชนยังผลให้มีการกระทำ +อันป่าเถื่อนซึ่งเป็นการขัดอย่างร้ายแรงต่อมโนธรรมของมนุษยชาติและการมาถึง +ของโลกที่ได้มีการประกาศให้ความมีอิสรภาพในการพูดและความเชื่อและอิสรภาพจาก +ความหวาดกลัวและความต้องการของมนุษย์เป็นความปรารถนาสูงสุดของประชาชนทั่วไป

+

โดยที่เป็นการจำเป็นที่สิทธิมนุษยชนควรได้รับความคุ้มครองโดยหลัก +นิติธรรมถ้าจะไม่บังคับให้คนต้องหันเข้าหาการลุกขึ้นต่อต้านทรราชและการกด +ขี่เป็นวิถีทางสุดท้าย

+

โดยที่เป็นการจำเป็นที่จะส่งเสริมพัฒนาการแห่งความสัมพันธ์ฉันมิตรระหว่างชาติต่างๆ

+

โดยที่ประชาชนแห่งสหประชาชาติได้ยืนยันอีกครั้งไว้ในกฎบัตรถึงศรัทธาใน +สิทธิมนุษยชนขั้นพื้นฐานในศักดิ์ศรีและค่าของมนุษย์และในสิทธิที่เท่าเทียม +กันของบรรดาชายและหญิงและได้มุ่งมั่นที่จะส่งเสริมความก้าวหน้าทางสังคมและ +มาตรฐานแห่งชีวิตที่ดีขึ้นในอิสรภาพอันกว้างขวางยิ่งขึ้น

+

โดยที่รัฐสมาชิกต่างปฏิญาณที่จะบรรลุถึงซึ่งการส่งเสริมการเคารพและการ +ยึดถือสิทธิมนุษยชนและอิสรภาพขั้นพื้นฐานโดยสากลโดยความร่วมมือกับสหประชา +ชาติ

+

โดยที่ความเข้าใจร่วมกันในสิทธิและอิสรภาพเหล่านี้เป็นสิ่งสำคัญที่สุดเพื่อให้ปฏิญาณนี้สำเร็จผลเต็มบริบูรณ์

+

ฉะนั้นบัดนี้สมัชชาจึงประกาศปฏิญญาสากลว่าด้วยสิทธิมนุษยชนนี้ให้เป็น +มาตรฐานร่วมกันแห่งความสำเร็จสำหรับประชาชนทั้งมวลและประชาชาติทั้งหลาย +เพื่อจุดมุ่งหมายที่ว่าปัจเจกบุคคลทุกคนและทุกส่วนของสังคมโดยการคำนึงถึง +ปฏิญญานี้เป็นเนืองนิตย์จะมุ่งมั่นส่งเสริมการเคารพสิทธิและอิสรภาพเหล่านี้ +ด้วยการสอนและการศึกษาและให้มีการยอมรับและยึดถือโดยสากลอย่างมีประสิทธิผล +ด้วยมาตรการแห่งชาติและระหว่างประเทศอันก้าวหน้าตามลำดับทั้งในบรรดาประชาชน +ของรัฐสมาชิกด้วยกันเองและในบรรดาประชาชนของดินแดนที่อยู่ใต้เขตอำนาจแห่ง +รัฐนั้น

+

ข้อ1

+

มนุษย์ทั้งปวงเกิดมามีอิสระและเสมอภาคกันในศักดิ์ศรีและสิทธิต่างในตนมีเหตุผลและมโนธรรมและควรปฏิบัติต่อกันด้วยจิตวิญญาณแห่งภราดรภาพ

+

ข้อ2

+

ทุกคนย่อมมีสิทธิและอิสรภาพทั้งปวงตามที่กำหนดไว้ในปฏิญญานี้โดยปราศจาก +การแบ่งแยกไม่ว่าชนิดใดอาทิเชื้อชาติผิวเพศภาษาศาสนาความคิดเห็นทางการเมือง +หรือทางอื่นพื้นเพทางชาติหรือสังคมทรัพย์สินการเกิดหรือสถานะอื่นนอกเหนือ +จากนี้จะไม่มีการแบ่งแยกใดบนพื้นฐานของสถานะทางการเมืองทางกฎหมายหรือทางการ +ระหว่างประเทศของประเทศหรือดินแดนที่บุคคลสังกัดไม่ว่าดินแดนนี้จะเป็น +เอกราชอยู่ในความพิทักษ์มิได้ปกครองตนเองหรืออยู่ภายใต้การจำกัดอธิปไตยอื่น +ใด

+

ข้อ3

+

ทุกคนมีสิทธิในการมีชีวิตเสรีภาพและความมั่นคงแห่งบุคคล

+

ข้อ4

+

บุคคลใดจะตกอยู่ในความเป็นทาสหรือสภาวะจำยอมไม่ได้ทั้งนี้ห้ามความเป็นทาสและการค้าทาสทุกรูปแบบ

+

ข้อ5

+

บุคคลใดจะถูกกระทำการทรมานหรือการปฏิบัติหรือการลงโทษที่โหดร้ายไร้มนุษยธรรมหรือย่ำยีศักดิ์ศรีไม่ได้

+

ข้อ6

+

ทุกคนมีสิทธิที่จะได้รับการยอมรับทุกแห่งหนว่าเป็นบุคคลตามกฎหมาย

+

ข้อ7

+

ทุกคนเสมอภาคกันตามกฎหมายและมีสิทธิที่จะได้รับความคุ้มครองของกฎหมาย +เท่าเทียมกันโดยปราศจากการเลือกปฏิบัติใดทุกคนมีสิทธิที่จะได้รับความคุ้ม +ครองเท่าเทียมกันจากการเลือกปฏิบัติใดอันเป็นการล่วงละเมิดปฏิญญานี้และจาก +การยุยงให้มีการเลือกปฏิบัติดังกล่าว

+

ข้อ8

+

ทุกคนมีสิทธิที่จะได้รับการเยียวยาอันมีประสิทธิผลจากศาลที่มีอำนาจแห่ง +รัฐต่อการกระทำอันล่วงละเมิดสิทธิขั้นพื้นฐานซึ่งตนได้รับตามรัฐธรรมนูญหรือ +กฎหมาย

+

ข้อ9

+

บุคคลใดจะถูกจับกุมกักขังหรือเนรเทศตามอำเภอใจไม่ได้

+

ข้อ10

+

ทุกคนย่อมมีสิทธิในความเสมอภาคอย่างเต็มที่ในการได้รับการพิจารณาคดีที่ +เป็นธรรมและเปิดเผยจากศาลที่อิสระและไม่ลำเอียงในการพิจารณากำหนดสิทธิและ +หน้าที่ของตนและข้อกล่าวหาอาญาใดต่อตน

+

ข้อ11

+

1.ทุกคนที่ถูกกล่าวหาว่ากระทำผิดทางอาญามีสิทธิที่จะได้รับการสันนิษฐาน +ไว้ก่อนว่าบริสุทธิ์จนกว่าจะพิสูจน์ได้ว่ามีความผิดตามกฎหมายในการพิจารณา +คดีที่เปิดเผยซึ่งตนได้รับหลักประกันที่จำเป็นทั้งปวงสำหรับการต่อสู้คดี

+

2.บุคคลใดจะถูกตัดสินว่ามีความผิดทางอาญาใดอันเนื่องจากการกระทำหรือ +ละเว้นใดอันมิได้ถือว่าเป็นความผิดทางอาญาตามกฎหมายแห่งชาติหรือกฎหมาย +ระหว่างประเทศในขณะที่ได้กระทำการนั้นไม่ได้และจะกำหนดโทษที่หนักกว่าที่ +บังคับใช้ในขณะที่ได้กระทำความผิดทางอาญานั้นไม่ได้

+

ข้อ12

+

บุคคลใดจะถูกแทรกแซงตามอำเภอใจในความเป็นส่วนตัวครอบครัวที่อยู่อาศัย +หรือการสื่อสารหรือจะถูกลบหลู่เกียรติยศและชื่อเสียงไม่ได้ทุกคนมีสิทธิที่ +จะได้รับความคุ้มครองของกฎหมายต่อการแทรกแซงสิทธิหรือการลบหลู่ดังกล่าวนั้น

+

ข13

+

1.ทุกคนมีสิทธิในอิสรภาพแห่งการเคลื่อนย้ายและการอยู่อาศัยภายในพรมแดนของแต่ละรัฐ

+

2.ทุกคนมีสิทธิที่จะออกนอกประเทศใดรวมทั้งประเทศของตนเองและสิทธิที่จะกลับสู่ประเทศตน

+

ข้อ14

+

1.ทุกคนมีสิทธิที่จะแสวงหาและที่จะได้ที่ลี้ภัยในประเทศอื่นจากการประหัตประหาร

+

2.สิทธินี้จะยกขึ้นกล่าวอ้างกับกรณีที่การดำเนินคดีที่เกิดขึ้นโดยแท้จาก +ความผิดที่มิใช่ทางการเมืองหรือจากการกระทำอันขัดต่อวัตถุประสงค์และหลักการ +ของสหประชาชาติไม่ได้

+

ข้อ15

+

1.ทุกคนมีสิทธิในสัญชาติหนึ่ง

+

2.บุคคลใดจะถูกเพิกถอนสัญชาติของตนตามอำเภอใจหรือถูกปฏิเสธสิทธิที่จะเปลี่ยนสัญชาติของตนไม่ได้

+

ข้อ16

+

1.บรรดาชายและหญิงที่มีอายุครบบริบูรณ์แล้วมีสิทธิที่จะสมรสและก่อร่าง +สร้างครอบครัวโดยปราศจากการจำกัดใดอันเนื่องจากเชื้อชาติสัญชาติหรือศาสนา +ต่างย่อมมีสิทธิเท่าเทียมกันในการสมรสระหว่างการสมรสและในการขาดจากการสมรส

+

2.การสมรสจะกระทำโดยความยินยอมอย่างอิสระและเต็มที่ของผู้ที่จะเป็นคู่สมรสเท่านั้น

+

3.ครอบครัวเป็นหน่วยธรรมชาติและพื้นฐานของสังคมและย่อมมีสิทธิที่จะได้รับความคุ้มครองจากสังคมและรัฐ

+

ข้อ17

+

1.ทุกคนมีสิทธิที่จะเป็นเจ้าของทรัพย์สินโดยตนเองและโดยร่วมกับผู้อื่น

+

2.บุคคลใดจะถูกเอาทรัพย์สินไปจากตนตามอำเภอใจไม่ได้

+

ข้อ18

+

ทุกคนมีสิทธิในอิสรภาพแห่งความคิดมโนธรรมและศาสนาทั้งนี้สิทธินี้รวมถึง +อิสรภาพในการเปลี่ยนศาสนาหรือความเชื่อและอิสรภาพในการแสดงออกทางศาสนาหรือ +ความเชื่อถือของตนในการสอนการปฏิบัติการสักการะบูชาและการประกอบพิธีกรรมไม่ +ว่าจะโดยลำพังหรือในชุมชนร่วมกับผู้อื่นและในที่สาธารณะหรือส่วนบุคคล

+

ข้อ19

+

ทุกคนมีสิทธิในอิสรภาพแห่งความเห็นและการแสดงออกทั้งนี้สิทธินี้รวมถึง +อิสรภาพที่จะถือเอาความเห็นโดยปราศจากการแทรกแซงและที่จะแสวงหารับและส่ง +ข้อมูลข่าวสารและข้อคิดผ่านสื่อใดและโดยไม่คำนึงถึงพรมแดน

+

ข้อ20

+

1.ทุกคนมีสิทธิในอิสรภาพแห่งการชุมนุมและการสมาคมโดยสันติ

+

2.บุคคลใดไม่อาจถูกบังคับให้สังกัดสมาคมหนึ่งได้

+

ข้อ21

+

1.ทุกคนมีสิทธิที่จะมีส่วนร่วมในการปกครองประเทศตนโดยตรงหรือผ่านผู้แทนซึ่งได้รับเลือกตั้งโดยอิสระ

+

2.ทุกคนมีสิทธิที่จะเข้าถึงบริการสาธารณะในประเทศตนโดยเสมอภาค

+

3.เจตจำนงของประชาชนจะต้องเป็นพื้นฐานแห่งอำนาจการปกครองทั้งนี้เจตจำนง +นี้จะต้องแสดงออกทางการเลือกตั้งตามกำหนดเวลาและอย่างแท้จริงซึ่งต้องเป็น +การออกเสียงอย่างทั่วถึงและเสมอภาคและต้องเป็นการลงคะแนนลับหรือวิธีการลง +คะแนนโดยอิสระในทำนองเดียวกัน

+

ข้อ22

+

ทุกคนในฐานะสมาชิกของสังคมมีสิทธิในหลักประกันทางสังคมและย่อมมีสิทธิใน +การบรรลุสิทธิทางเศรษฐกิจสังคมและวัฒนธรรมอันจำเป็นยิ่งสำหรับศักดิ์ศรีของ +ตนและการพัฒนาบุคลิกภาพของตนอย่างอิสระผ่านความพยายามของรัฐและความร่วมมือ +ระหว่างประเทศและตามการจัดการและทรัพยากรของแต่ละรัฐ

+

ข้อ23

+

1.ทุกคนมีสิทธิในการทำงานในการเลือกงานโดยอิสระในเงื่อนไขที่ยุติธรรมและเอื้ออำนวยต่อการทำงานและในการคุ้มครองต่อการว่างงาน

+

2.ทุกคนมีสิทธิที่จะได้รับค่าจ้างที่เท่าเทียมกันสำหรับงานที่เท่าเทียมกันโดยปราศจากการเลือกปฏิบัติใด

+

3.ทุกคนที่ทำงานมีสิทธิที่จะได้รับค่าตอบแทนที่ยุติธรรมและเอื้ออำนวยต่อ +การประกันความเป็นอยู่อันควรค่าแก่ศักดิ์ศรีของมนุษย์สำหรับตนเองและครอบ +ครัวและหากจำเป็นก็จะได้รับการคุ้มครองทางสังคมในรูปแบบอื่นเพิ่มเติมด้วย

+

4.ทุกคนมีสิทธิที่จะจัดตั้งและที่จะเข้าร่วมสหภาพแรงงานเพื่อความคุ้มครองผลประโยชน์ของตน

+

ข้อ24

+

ทุกคนมีสิทธิในการพักผ่อนและการผ่อนคลายยามว่างรวมทั้งจำกัดเวลาทำงานตามสมควรและวันหยุดเป็นครั้งคราวโดยได้รับค่าจ้าง

+

ข้อ25

+

1.ทุกคนมีสิทธิในมาตรฐานการครองชีพอันเพียงพอสำหรับสุขภาพและความอยู่ดี +ของตนและของครอบครัวรวมทั้งอาหารเครื่องนุ่งห่มที่อยู่อาศัยและการดูแลรักษา +ทางการแพทย์และบริการสังคมที่จำเป็นและมีสิทธิในหลักประกันยามว่างงานเจ็บ +ป่วยพิการหม้ายวัยชราหรือปราศจากการดำรงชีพอื่นในสภาวะแวดล้อมนอกเหนือการ +ควบคุมของตน

+

2.มารดาและเด็กย่อมมีสิทธิที่จะรับการดูแลรักษาและการช่วยเหลือเป็นพิเศษ +เด็กทั้งปวงไม่ว่าจะเกิดในหรือนอกสมรสจะต้องได้รับการคุ้มครองทางสังคมเช่น +เดียวกัน

+

ข้อ26

+

1.ทุกคนมีสิทธิในการศึกษาการศึกษาจะต้องให้เปล่าอย่างน้อยในขั้นประถม +ศึกษาและขั้นพื้นฐานการศึกษาระดับประถมจะต้องเป็นภาคบังคับการศึกษาด้าน +วิชาการและวิชาชีพจะต้องเปิดเป็นการทั่วไปและการศึกษาระดับสูงขึ้นไปจะต้อง +เข้าถึงได้อย่างเสมอภาคสำหรับทุกคนบนพื้นฐานของคุณสมบัติความเหมาะสม

+

2.การศึกษาจะต้องมุ่งไปสู่การพัฒนาบุคลิกภาพของมนุษย์อย่างเต็มที่และการ +เสริมสร้างความเคารพต่อสิทธิมนุษยชนและอิสรภาพขั้นพื้นฐานการศึกษาจะต้องส่ง +เสริมความเข้าใจขันติธรรมและมิตรภาพระหว่างประชาชาติกลุ่มเชื้อชาติหรือ +ศาสนาทั้งมวลและจะต้องส่งเสริมกิจกรรมของสหประชาชาติเพื่อการธำรงไว้ซึ่ง +สันติภาพ

+

3.ผู้ปกครองมีสิทธิเบื้องแรกที่จะเลือกประเภทการศึกษาที่จะให้แก่บุตรของตน

+

ข้อ27

+

1.ทุกคนมีสิทธิที่จะเข้าร่วมโดยอิสระในชีวิตทางวัฒนธรรมของชุมชนที่จะ +เพลิดเพลินกับศิลปะและมีส่วนในความรุดหน้าและคุณประโยชน์ทางวิทยาศาสตร์

+

2.ทุกคนมีสิทธิที่จะได้รับการคุ้มครองผลประโยชน์ทางจิตใจและทางวัตถุอัน +เป็นผลจากประดิษฐกรรมใดทางวิทยาศาสตร์วรรณกรรมและศิลปกรรมซึ่งตนเป็นผู้ +สร้าง

+

ข้อ28

+

ทุกคนย่อมมีสิทธิในระเบียบทางสังคมและระหว่างประเทศซึ่งจะเป็นกรอบให้บรรลุสิทธิและอิสรภาพที่กำหนดไว้ในปฏิญญานี้อย่างเต็มที่

+

ข้อ29

+

1.ทุกคนมีหน้าที่ต่อชุมชนซึ่งการพัฒนาบุคลิกภาพของตนโดยอิสระและเต็มที่จะกระทำได้ก็แต่ในชุมชนเท่านั้น

+

2.ในการใช้สิทธิและอิสรภาพของตนทุกคนจะต้องอยู่ภายใต้ข้อจำกัดเพียงเท่า +ที่มีกำหนดไว้ตามกฎหมายเท่านั้นเพื่อวัตถุประสงค์ของการได้มาซึ่งการยอมรับ +และการเคารพสิทธิและอิสรภาพอันควรของผู้อื่นและเพื่อให้สอดรับกับความต้อง +การอันสมควรทางด้านศีลธรรมความสงบเรียบร้อยของประชาชนและสวัสดิการทั่วไปใน +สังคมประชาธิปไตย

+

3.สิทธิและอิสรภาพเหล่านี้ไม่อาจใช้ขัดต่อวัตถุประสงค์และหลักการของสหประชาชาติไม่ว่าในกรณีใด

+

ข้อ30

+

ไม่มีบทใดในปฏิญญานี้ที่อาจตีความได้ว่าเป็นการให้สิทธิใดแก่รัฐกลุ่มคน +หรือบุคคลใดในการดำเนินกิจกรรมใดหรือกระทำการใดอันมุ่งต่อการทำลายสิทธิและ +อิสรภาพใดที่กำหนดไว้ณที่นี้

+

+ + diff --git a/intl/lwbrk/crashtests/crashtests.list b/intl/lwbrk/crashtests/crashtests.list new file mode 100644 index 0000000000..a7cb7a173b --- /dev/null +++ b/intl/lwbrk/crashtests/crashtests.list @@ -0,0 +1 @@ +load 416721.html diff --git a/intl/lwbrk/crashtests/crashtests_manual.list b/intl/lwbrk/crashtests/crashtests_manual.list new file mode 100644 index 0000000000..c58041a076 --- /dev/null +++ b/intl/lwbrk/crashtests/crashtests_manual.list @@ -0,0 +1,6 @@ +# Tests need to be run with --setpref security.sandbox.content.win32k-disable=false +# This is because the pref is not dynamic and is also the reason that these tests +# can only be run manually. They are also DEBUG only. +defaults pref(intl.compare_against_brokered_complex_line_breaks,true) +load Lo_test_page_no_uniscribe_breaks.html +load UDHR_Thai_test_page_long_sequences.html diff --git a/intl/lwbrk/gtest/TestBreak.cpp b/intl/lwbrk/gtest/TestBreak.cpp new file mode 100644 index 0000000000..4e6622dffd --- /dev/null +++ b/intl/lwbrk/gtest/TestBreak.cpp @@ -0,0 +1,376 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include + +#include "gtest/gtest.h" +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/WordBreaker.h" +#include "mozilla/Preferences.h" +#include "mozilla/Span.h" +#include "nsISupports.h" +#include "nsServiceManagerUtils.h" +#include "nsString.h" +#include "nsTArray.h" +#include "nsXPCOM.h" + +using mozilla::intl::LineBreaker; +using mozilla::intl::WordBreaker; + +// Turn off clang-format to align the ruler comments to the test strings. 
+ +// clang-format off +static char teng0[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "hello world"; +// clang-format on + +static uint32_t lexp0[] = {5, 11}; + +static uint32_t wexp0[] = {5, 6, 11}; + +// clang-format off +static char teng1[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "This is a test to test(reasonable) line break. This 0.01123 = 45 x 48."; +// clang-format on + +static uint32_t lexp1[] = {4, 7, 9, 14, 17, 34, 39, 40, 41, + 42, 49, 54, 62, 64, 67, 69, 73}; + +static uint32_t wexp1[] = {4, 5, 7, 8, 9, 10, 14, 15, 17, 18, 22, 23, + 33, 34, 35, 39, 43, 48, 49, 50, 54, 55, 56, 57, + 62, 63, 64, 65, 67, 68, 69, 70, 72, 73}; + +// clang-format off +static char teng2[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "()((reasonab(l)e) line break. .01123=45x48."; +// clang-format on + +static uint32_t lexp2[] = {17, 22, 23, 30, 44}; + +static uint32_t wexp2[] = {4, 12, 13, 14, 15, 16, 17, 18, 22, + 24, 29, 30, 31, 32, 37, 38, 43, 44}; + +// clang-format off +static char teng3[] = + // 1 2 3 4 5 6 7 + // 01234567890123456789012345678901234567890123456789012345678901234567890123456789 + "It's a test to test(ronae ) line break...."; +// clang-format on + +static uint32_t lexp3[] = {4, 6, 11, 14, 25, 27, 32, 42}; + +static uint32_t wexp3[] = {2, 3, 4, 5, 6, 7, 11, 12, 14, 15, + 19, 20, 25, 26, 27, 28, 32, 33, 38, 42}; + +static char ruler1[] = + " 1 2 3 4 5 6 7 "; +static char ruler2[] = + "0123456789012345678901234567890123456789012345678901234567890123456789012"; + +bool Check(const char* in, mozilla::Span out, + mozilla::Span res) { + const uint32_t outlen = out.Length(); + const uint32_t i = res.Length(); + bool ok = true; + + if (i != outlen) { + ok = false; + printf("WARNING!!! 
return size wrong, expect %d but got %d \n", outlen, i); + } + + for (uint32_t j = 0; j < i; j++) { + if (j < outlen) { + if (res[j] != out[j]) { + ok = false; + printf("[%d] expect %d but got %d\n", j, out[j], res[j]); + } + } else { + ok = false; + printf("[%d] additional %d\n", j, res[j]); + } + } + + if (!ok) { + printf("string = \n%s\n", in); + printf("%s\n", ruler1); + printf("%s\n", ruler2); + + printf("Expect = \n"); + for (uint32_t j = 0; j < outlen; j++) { + printf("%d,", out[j]); + } + + printf("\nResult = \n"); + for (uint32_t j = 0; j < i; j++) { + printf("%d,", res[j]); + } + printf("\n"); + } + + return ok; +} + +bool TestASCIILB(const char* in, mozilla::Span out) { + NS_ConvertASCIItoUTF16 input(in); + EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; + + nsTArray result; + int32_t curr = 0; + while (true) { + curr = LineBreaker::Next(input.get(), input.Length(), curr); + if (curr == NS_LINEBREAKER_NEED_MORE_TEXT) { + break; + } + result.AppendElement(curr); + } + + return Check(in, out, result); +} + +bool TestASCIIWB(const char* in, mozilla::Span out) { + NS_ConvertASCIItoUTF16 input(in); + EXPECT_GT(input.Length(), 0u) << "Expect a non-empty input!"; + + nsTArray result; + int32_t curr = 0; + while (true) { + curr = WordBreaker::Next(input.get(), input.Length(), curr); + if (curr == NS_WORDBREAKER_NEED_MORE_TEXT) { + break; + } + result.AppendElement(curr); + } + + return Check(in, out, result); +} + +TEST(LineBreak, LineBreaker) +{ + ASSERT_TRUE(TestASCIILB(teng0, lexp0)); + ASSERT_TRUE(TestASCIILB(teng1, lexp1)); + ASSERT_TRUE(TestASCIILB(teng2, lexp2)); + ASSERT_TRUE(TestASCIILB(teng3, lexp3)); +} + +TEST(WordBreak, WordBreaker) +{ + ASSERT_TRUE(TestASCIIWB(teng0, wexp0)); + ASSERT_TRUE(TestASCIIWB(teng1, wexp1)); + ASSERT_TRUE(TestASCIIWB(teng2, wexp2)); + ASSERT_TRUE(TestASCIIWB(teng3, wexp3)); +} + +// 012345678901234 +static const char wb0[] = "T"; +static const char wb1[] = "h"; +static const char wb2[] = ""; +static const 
char wb3[] = "is is a int"; +static const char wb4[] = ""; +static const char wb5[] = ""; +static const char wb6[] = "ernationali"; +static const char wb7[] = "zation work."; + +static const char* wb[] = {wb0, wb1, wb2, wb3, wb4, wb5, wb6, wb7}; + +TEST(WordBreak, TestPrintWordWithBreak) +{ + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + + // This test generate the result string by appending '^' at every word break + // opportunity except the one at end of the text. + nsAutoString result; + + for (uint32_t i = 0; i < numOfFragment; i++) { + NS_ConvertASCIItoUTF16 fragText(wb[i]); + + int32_t cur = 0; + cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); + uint32_t start = 0; + while (cur != NS_WORDBREAKER_NEED_MORE_TEXT) { + result.Append(Substring(fragText, start, cur - start)); + + // Append '^' only if cur is within the fragText. We'll check the word + // break opportunity between fragText and nextFragText using + // BreakInBetween() below. + if (cur < static_cast(fragText.Length())) { + result.Append('^'); + } + start = (cur >= 0 ? cur : cur - start); + cur = WordBreaker::Next(fragText.get(), fragText.Length(), cur); + } + + if (i != numOfFragment - 1) { + NS_ConvertASCIItoUTF16 nextFragText(wb[i + 1]); + if (nextFragText.IsEmpty()) { + // If nextFragText is empty, there's no new possible word break + // opportunity. + continue; + } + + const auto origFragLen = static_cast(fragText.Length()); + fragText.Append(nextFragText); + + bool canBreak = + origFragLen == + WordBreaker::Next(fragText.get(), fragText.Length(), origFragLen - 1); + if (canBreak) { + result.Append('^'); + } + } + } + ASSERT_STREQ("This^ ^is^ ^a^ ^internationalization^ ^work^.", + NS_ConvertUTF16toUTF8(result).get()); +} + +// This function searches a complete word starting from |offset| in wb[fragN]. 
+// If it reaches the end of wb[fragN], and there is no word break opportunity +// between wb[fragN] and wb[fragN+1], it will continue the search in wb[fragN+1] +// until a word break. +void TestFindWordBreakFromPosition(uint32_t fragN, uint32_t offset, + const char* expected) { + uint32_t numOfFragment = sizeof(wb) / sizeof(char*); + + NS_ConvertASCIItoUTF16 fragText(wb[fragN]); + + mozilla::intl::WordRange res = WordBreaker::FindWord(fragText, offset); + + nsAutoString result(Substring(fragText, res.mBegin, res.mEnd - res.mBegin)); + + if ((uint32_t)fragText.Length() <= res.mEnd) { + // if we hit the end of the fragment + nsAutoString curFragText = fragText; + for (uint32_t p = fragN + 1; p < numOfFragment; p++) { + NS_ConvertASCIItoUTF16 nextFragText(wb[p]); + if (nextFragText.IsEmpty()) { + // If nextFragText is empty, there's no new possible word break + // opportunity between curFragText and nextFragText. + continue; + } + + const auto origFragLen = static_cast(curFragText.Length()); + curFragText.Append(nextFragText); + bool canBreak = origFragLen == WordBreaker::Next(curFragText.get(), + curFragText.Length(), + origFragLen - 1); + if (canBreak) { + break; + } + mozilla::intl::WordRange r = WordBreaker::FindWord(nextFragText, 0); + + result.Append(Substring(nextFragText, r.mBegin, r.mEnd - r.mBegin)); + + if ((uint32_t)nextFragText.Length() != r.mEnd) { + break; + } + } + } + + ASSERT_STREQ(expected, NS_ConvertUTF16toUTF8(result).get()) + << "FindWordBreakFromPosition(" << fragN << ", " << offset << ")"; +} + +TEST(WordBreak, TestNextWordBreakWithComplexLanguage) +{ + nsString fragText(u"\u0e40\u0e1b\u0e47\u0e19\u0e19\u0e31\u0e01"); + + int32_t offset = 0; + while (offset != NS_WORDBREAKER_NEED_MORE_TEXT) { + int32_t newOffset = + WordBreaker::Next(fragText.get(), fragText.Length(), offset); + ASSERT_NE(offset, newOffset); + offset = newOffset; + } + ASSERT_TRUE(true); +} + +TEST(WordBreak, TestFindWordWithEmptyString) +{ + mozilla::intl::WordRange expect{0, 
0}; + mozilla::intl::WordRange result = WordBreaker::FindWord(EmptyString(), 0); + ASSERT_EQ(expect.mBegin, result.mBegin); + ASSERT_EQ(expect.mEnd, result.mEnd); +} + +TEST(WordBreak, TestNextWordBreakWithEmptyString) +{ + char16_t empty[] = {}; + ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 0)); + ASSERT_EQ(NS_WORDBREAKER_NEED_MORE_TEXT, WordBreaker::Next(empty, 0, 1)); +} + +TEST(WordBreak, TestFindWordBreakFromPosition) +{ + TestFindWordBreakFromPosition(0, 0, "This"); + TestFindWordBreakFromPosition(1, 0, "his"); + TestFindWordBreakFromPosition(2, 0, "is"); + TestFindWordBreakFromPosition(3, 0, "is"); + TestFindWordBreakFromPosition(3, 1, "is"); + TestFindWordBreakFromPosition(3, 9, " "); + TestFindWordBreakFromPosition(3, 10, "internationalization"); + TestFindWordBreakFromPosition(4, 0, "ernationalization"); + TestFindWordBreakFromPosition(5, 0, "ernationalization"); + TestFindWordBreakFromPosition(6, 4, "ernationalization"); + TestFindWordBreakFromPosition(6, 8, "ernationalization"); + TestFindWordBreakFromPosition(7, 6, " "); + TestFindWordBreakFromPosition(7, 7, "work"); +} + +// Test for StopAtPunctuation option. 
+TEST(WordBreak, TestFindBreakWithStopAtPunctuation) +{ + bool original = + mozilla::Preferences::GetBool("intl.icu4x.segmenter.enabled", true); + + // Not UAX#29 rule + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", false); + + nsString fragText(u"one.two"); + + mozilla::intl::WordRange result1 = WordBreaker::FindWord(fragText, 0); + ASSERT_EQ(0u, result1.mBegin); + ASSERT_EQ(3u, result1.mEnd); + mozilla::intl::WordRange result2 = WordBreaker::FindWord(fragText, 3); + ASSERT_EQ(3u, result2.mBegin); + ASSERT_EQ(4u, result2.mEnd); + mozilla::intl::WordRange result3 = WordBreaker::FindWord(fragText, 4); + ASSERT_EQ(4u, result3.mBegin); + ASSERT_EQ(7u, result3.mEnd); + + // UAX#29 rule + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + + mozilla::intl::WordRange result4 = WordBreaker::FindWord( + fragText, 0, WordBreaker::FindWordOptions::StopAtPunctuation); + ASSERT_EQ(0u, result4.mBegin); + ASSERT_EQ(3u, result4.mEnd); + mozilla::intl::WordRange result5 = WordBreaker::FindWord( + fragText, 3, WordBreaker::FindWordOptions::StopAtPunctuation); + ASSERT_EQ(3u, result5.mBegin); + ASSERT_EQ(4u, result5.mEnd); + mozilla::intl::WordRange result6 = WordBreaker::FindWord( + fragText, 4, WordBreaker::FindWordOptions::StopAtPunctuation); + ASSERT_EQ(4u, result6.mBegin); + ASSERT_EQ(7u, result6.mEnd); + + // Default (without StopAtPunctuation) + mozilla::intl::WordRange result7 = WordBreaker::FindWord(fragText, 0); + ASSERT_EQ(0u, result7.mBegin); + ASSERT_EQ(7u, result7.mEnd); + mozilla::intl::WordRange result8 = WordBreaker::FindWord(fragText, 3); + ASSERT_EQ(0u, result8.mBegin); + ASSERT_EQ(7u, result8.mEnd); + mozilla::intl::WordRange result9 = WordBreaker::FindWord(fragText, 4); + ASSERT_EQ(0u, result9.mBegin); + ASSERT_EQ(7u, result9.mEnd); + + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", original); +} diff --git a/intl/lwbrk/gtest/TestSegmenter.cpp b/intl/lwbrk/gtest/TestSegmenter.cpp new file mode 100644 index 
0000000000..42d04b8e03 --- /dev/null +++ b/intl/lwbrk/gtest/TestSegmenter.cpp @@ -0,0 +1,209 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this file, + * You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "gtest/gtest.h" + +#include "mozilla/intl/Segmenter.h" +#include "mozilla/Preferences.h" + +namespace mozilla::intl { + +TEST(IntlSegmenter, TestLineBreakIteratorUtf16SeekOld) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", false); + EXPECT_TRUE(rv == NS_OK); + + const SegmenterOptions options{SegmenterGranularity::Line}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto lineSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + lineSegmenter->Segment(MakeStringSpan(text)); + + // Seek to space between "hello" and "world". + ASSERT_EQ(segIter->Seek(5u), Some(11u)); + + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestLineBreakIteratorUtf16Seek) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + const SegmenterOptions options{SegmenterGranularity::Line}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto lineSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + lineSegmenter->Segment(MakeStringSpan(text)); + + // Seek to space between "hello" and "world". + // UAX#14 rule returns before "w". + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(11u)); + + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). 
+ ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestWordBreakIteratorUtf16Simple) +{ + const SegmenterOptions options{SegmenterGranularity::Word}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto wordSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + wordSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(5u)); + ASSERT_EQ(segIter->Next(), Some(6u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); +} + +TEST(IntlSegmenter, TestWordBreakIteratorUtf16Seek) +{ + const SegmenterOptions options{SegmenterGranularity::Word}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto wordSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + wordSegmenter->Segment(MakeStringSpan(text)); + + // Seek to the space between "hello" and "world" + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). 
+ ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Simple) +{ + SegmenterOptions options{SegmenterGranularity::Grapheme}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto graphemeClusterSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + graphemeClusterSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(1u)); + ASSERT_EQ(segIter->Next(), Some(2u)); + ASSERT_EQ(segIter->Next(), Some(3u)); + ASSERT_EQ(segIter->Next(), Some(4u)); + ASSERT_EQ(segIter->Next(), Some(5u)); + ASSERT_EQ(segIter->Next(), Some(6u)); + ASSERT_EQ(segIter->Next(), Some(7u)); + ASSERT_EQ(segIter->Next(), Some(8u)); + ASSERT_EQ(segIter->Next(), Some(9u)); + ASSERT_EQ(segIter->Next(), Some(10u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakIteratorUtf16Seek) +{ + SegmenterOptions options{SegmenterGranularity::Grapheme}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto graphemeClusterSegmenter = result.unwrap(); + + const char16_t text[] = u"hello world"; + UniquePtr segIter = + graphemeClusterSegmenter->Segment(MakeStringSpan(text)); + + // Seek to the space between "hello" and "world" + ASSERT_EQ(segIter->Seek(5u), Some(6u)); + + ASSERT_EQ(segIter->Next(), Some(7u)); + ASSERT_EQ(segIter->Next(), Some(8u)); + ASSERT_EQ(segIter->Next(), Some(9u)); + ASSERT_EQ(segIter->Next(), Some(10u)); + ASSERT_EQ(segIter->Next(), Some(11u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). 
+ ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestGraphemeClusterBreakReverseIteratorUtf16) +{ + const char16_t text[] = u"hello world"; + GraphemeClusterBreakReverseIteratorUtf16 segIter(MakeStringSpan(text)); + + // Seek to the space between "hello" and "world" + ASSERT_EQ(segIter.Seek(6u), Some(5u)); + + ASSERT_EQ(segIter.Next(), Some(4u)); + ASSERT_EQ(segIter.Next(), Some(3u)); + ASSERT_EQ(segIter.Next(), Some(2u)); + ASSERT_EQ(segIter.Next(), Some(1u)); + ASSERT_EQ(segIter.Next(), Some(0u)); + ASSERT_EQ(segIter.Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter.Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + SegmenterOptions options{SegmenterGranularity::Sentence}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto sentenceSegmenter = result.unwrap(); + + const char16_t text[] = u"Hello world. Hello world."; + UniquePtr segIter = + sentenceSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Next(), Some(13u)); + ASSERT_EQ(segIter->Next(), Some(25u)); + ASSERT_EQ(segIter->Next(), Nothing()); + + // Same as calling Next(). + ASSERT_EQ(segIter->Seek(0u), Nothing()); +} + +TEST(IntlSegmenter, TestSentenceBreakIteratorUtf16Seek) +{ + nsresult rv = Preferences::SetBool("intl.icu4x.segmenter.enabled", true); + EXPECT_TRUE(rv == NS_OK); + + SegmenterOptions options{SegmenterGranularity::Sentence}; + auto result = Segmenter::TryCreate("en", options); + ASSERT_TRUE(result.isOk()); + auto sentenceSegmenter = result.unwrap(); + + const char16_t text[] = u"Hello world. 
Hello world."; + UniquePtr segIter = + sentenceSegmenter->Segment(MakeStringSpan(text)); + + ASSERT_EQ(segIter->Seek(5u), Some(13u)); +} + +} // namespace mozilla::intl diff --git a/intl/lwbrk/gtest/TestSegmenterPerf.cpp b/intl/lwbrk/gtest/TestSegmenterPerf.cpp new file mode 100644 index 0000000000..772e284fa8 --- /dev/null +++ b/intl/lwbrk/gtest/TestSegmenterPerf.cpp @@ -0,0 +1,276 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim: set ts=8 sts=2 et sw=2 tw=80: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include + +#include "gtest/gtest.h" +#include "gtest/MozGTestBench.h" // For MOZ_GTEST_BENCH +#include "mozilla/intl/LineBreaker.h" +#include "mozilla/intl/Segmenter.h" +#include "mozilla/Preferences.h" +#include "nsAtom.h" +#include "nsLineBreaker.h" +#include "nsString.h" +#include "nsTArray.h" + +namespace mozilla::intl { + +using mozilla::intl::LineBreakRule; +using mozilla::intl::WordBreakRule; + +constexpr size_t kIterations = 100; + +static std::string ReadFileIntoString(const char* aPath) { + std::ifstream file(aPath); + std::stringstream sstr; + sstr << file.rdbuf(); + return sstr.str(); +} + +class SegmenterPerf : public ::testing::Test { + protected: + void SetUp() override { + // Test files are into xpcom/tests/gtest/wikipedia + mArUtf8 = ReadFileIntoString("ar.txt"); + mDeUtf8 = ReadFileIntoString("de.txt"); + mJaUtf8 = ReadFileIntoString("ja.txt"); + mRuUtf8 = ReadFileIntoString("ru.txt"); + mThUtf8 = ReadFileIntoString("th.txt"); + mTrUtf8 = ReadFileIntoString("tr.txt"); + mViUtf8 = ReadFileIntoString("vi.txt"); + + CopyUTF8toUTF16(mArUtf8, mArUtf16); + CopyUTF8toUTF16(mDeUtf8, mDeUtf16); + CopyUTF8toUTF16(mJaUtf8, mJaUtf16); + CopyUTF8toUTF16(mRuUtf8, mRuUtf16); + CopyUTF8toUTF16(mThUtf8, mThUtf16); + CopyUTF8toUTF16(mTrUtf8, mTrUtf16); + 
CopyUTF8toUTF16(mViUtf8, mViUtf16); + + mAr = NS_Atomize(u"ar"); + mDe = NS_Atomize(u"de"); + mJa = NS_Atomize(u"ja"); + mRu = NS_Atomize(u"ru"); + mTh = NS_Atomize(u"th"); + mTr = NS_Atomize(u"tr"); + mVi = NS_Atomize(u"vi"); + } + + public: + std::string mArUtf8; + std::string mDeUtf8; + std::string mJaUtf8; + std::string mRuUtf8; + std::string mThUtf8; + std::string mTrUtf8; + std::string mViUtf8; + + nsString mArUtf16; + nsString mDeUtf16; + nsString mJaUtf16; + nsString mRuUtf16; + nsString mThUtf16; + nsString mTrUtf16; + nsString mViUtf16; + + RefPtr mAr; + RefPtr mDe; + RefPtr mJa; + RefPtr mRu; + RefPtr mTh; + RefPtr mTr; + RefPtr mVi; +}; + +class AutoSetSegmenter final { + public: + explicit AutoSetSegmenter(bool aValue) { + nsresult rv = + mozilla::Preferences::SetBool("intl.icu4x.segmenter.enabled", aValue); + EXPECT_TRUE(rv == NS_OK); + } + + ~AutoSetSegmenter() { + mozilla::Preferences::ClearUser("intl.icu4x.segmenter.enabled"); + } +}; + +static void TestSegmenterBench(const nsString& aStr, bool aIsJaOrZh, + size_t aCount = kIterations) { + nsTArray breakState; + breakState.SetLength(aStr.Length()); + + for (size_t i = 0; i < aCount; i++) { + LineBreaker::ComputeBreakPositions( + aStr.get(), aStr.Length(), WordBreakRule::Normal, LineBreakRule::Strict, + aIsJaOrZh, breakState.Elements()); + } +} + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakAROld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mArUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakDEOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mDeUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakJAOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mJaUtf16, true); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakRUOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mRuUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTHOld, [this] { + AutoSetSegmenter set(false); + 
TestSegmenterBench(mThUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTROld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mTrUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakVIOld, [this] { + AutoSetSegmenter set(false); + TestSegmenterBench(mViUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakAR, [this] { + AutoSetSegmenter set(true); + TestSegmenterBench(mArUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakDE, [this] { + AutoSetSegmenter set(true); + TestSegmenterBench(mDeUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakJA, [this] { + AutoSetSegmenter set(true); + TestSegmenterBench(mJaUtf16, true); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakRU, [this] { + AutoSetSegmenter set(true); + TestSegmenterBench(mRuUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTH, [this] { + AutoSetSegmenter set(true); + // LSTM segmenter is too slow + TestSegmenterBench(mThUtf16, false, 3); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakTR, [this] { + AutoSetSegmenter set(true); + TestSegmenterBench(mTrUtf16, false); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfLineBreakVI, [this] { + AutoSetSegmenter set(true); + TestSegmenterBench(mViUtf16, false); +}); + +class LBSink final : public nsILineBreakSink { + public: + LBSink() = default; + ~LBSink() = default; + + virtual void SetBreaks(uint32_t, uint32_t, uint8_t*) override {} + virtual void SetCapitalization(uint32_t, uint32_t, bool*) override {} +}; + +static void TestDOMSegmenterBench(const nsString& aStr, nsAtom* aLang, + size_t aCount = kIterations) { + LBSink sink; + bool trailingBreak; + + for (size_t i = 0; i < aCount; i++) { + nsLineBreaker breaker; + breaker.AppendText(aLang, aStr.get(), aStr.Length(), 0, &sink); + breaker.Reset(&trailingBreak); + } +} + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakAROld, [this] { + AutoSetSegmenter set(false); + 
TestDOMSegmenterBench(mArUtf16, mAr); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakDEOld, [this] { + AutoSetSegmenter set(false); + TestDOMSegmenterBench(mDeUtf16, mDe); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakJAOld, [this] { + AutoSetSegmenter set(false); + TestDOMSegmenterBench(mJaUtf16, mJa); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakRUOld, [this] { + AutoSetSegmenter set(false); + TestDOMSegmenterBench(mRuUtf16, mRu); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTHOld, [this] { + AutoSetSegmenter set(false); + TestDOMSegmenterBench(mThUtf16, mTh); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTROld, [this] { + AutoSetSegmenter set(false); + TestDOMSegmenterBench(mTrUtf16, mTr); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakVIOld, [this] { + AutoSetSegmenter set(false); + TestDOMSegmenterBench(mViUtf16, mVi); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakAR, [this] { + AutoSetSegmenter set(true); + TestDOMSegmenterBench(mArUtf16, mAr); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakDE, [this] { + AutoSetSegmenter set(true); + TestDOMSegmenterBench(mDeUtf16, mDe); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakJA, [this] { + AutoSetSegmenter set(true); + TestDOMSegmenterBench(mJaUtf16, mJa); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakRU, [this] { + AutoSetSegmenter set(true); + TestDOMSegmenterBench(mRuUtf16, mRu); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTH, [this] { + AutoSetSegmenter set(true); + // LSTM segmenter is too slow + TestDOMSegmenterBench(mThUtf16, mTh, 3); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakTR, [this] { + AutoSetSegmenter set(true); + TestDOMSegmenterBench(mTrUtf16, mTr); +}); + +MOZ_GTEST_BENCH_F(SegmenterPerf, PerfDOMLineBreakVI, [this] { + AutoSetSegmenter set(true); + TestDOMSegmenterBench(mViUtf16, mVi); +}); + +} // namespace mozilla::intl diff --git a/intl/lwbrk/gtest/moz.build 
b/intl/lwbrk/gtest/moz.build new file mode 100644 index 0000000000..092a0f0a86 --- /dev/null +++ b/intl/lwbrk/gtest/moz.build @@ -0,0 +1,13 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +UNIFIED_SOURCES += [ + "TestBreak.cpp", + "TestSegmenter.cpp", + "TestSegmenterPerf.cpp", +] + +FINAL_LIBRARY = "xul-gtest" diff --git a/intl/lwbrk/jisx4051class.h b/intl/lwbrk/jisx4051class.h new file mode 100644 index 0000000000..3140cf63a7 --- /dev/null +++ b/intl/lwbrk/jisx4051class.h @@ -0,0 +1,217 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +/* + DO NOT EDIT THIS DOCUMENT !!! 
THIS DOCUMENT IS GENERATED BY + mozilla/intl/lwbrk/tools/anzx4051.pl + */ +static const uint32_t gLBClass00[32] = { + 0x55555555, // U+0000 - U+0007 + 0x55555555, // U+0008 - U+000F + 0x55555555, // U+0010 - U+0017 + 0x55555555, // U+0018 - U+001F + 0x7AABAAA5, // U+0020 - U+0027 + 0x7A7AAAA9, // U+0028 - U+002F + 0x66666666, // U+0030 - U+0037 + 0xAAA9AA66, // U+0038 - U+003F + 0x77777777, // U+0040 - U+0047 + 0x77777777, // U+0048 - U+004F + 0x77777777, // U+0050 - U+0057 + 0x77AA9777, // U+0058 - U+005F + 0x77777777, // U+0060 - U+0067 + 0x77777777, // U+0068 - U+006F + 0x77777777, // U+0070 - U+0077 + 0x7AAA9777, // U+0078 - U+007F + 0x77777777, // U+0080 - U+0087 + 0x77777777, // U+0088 - U+008F + 0x77777777, // U+0090 - U+0097 + 0x77777777, // U+0098 - U+009F + 0xAA9A9AAB, // U+00A0 - U+00A7 + 0x77A9777A, // U+00A8 - U+00AF + 0xAAAAAAAA, // U+00B0 - U+00B7 + 0xAAAAAAAA, // U+00B8 - U+00BF + 0x77777777, // U+00C0 - U+00C7 + 0x77777777, // U+00C8 - U+00CF + 0x77777777, // U+00D0 - U+00D7 + 0x77777777, // U+00D8 - U+00DF + 0x77777777, // U+00E0 - U+00E7 + 0x77777777, // U+00E8 - U+00EF + 0xA7777777, // U+00F0 - U+00F7 + 0x77777777, // U+00F8 - U+00FF +}; + +static const uint32_t gLBClass20[32] = { + 0xB5555555, // U+2000 - U+2007 + 0x77775555, // U+2008 - U+200F + 0x777277B7, // U+2010 - U+2017 + 0x77A777A7, // U+2018 - U+201F + 0xA1117777, // U+2020 - U+2027 + 0xB7777777, // U+2028 - U+202F + 0x77744444, // U+2030 - U+2037 + 0x7A115107, // U+2038 - U+203F + 0x11017777, // U+2040 - U+2047 + 0x77777711, // U+2048 - U+204F + 0x77777777, // U+2050 - U+2057 + 0x57777777, // U+2058 - U+205F + 0x7777777B, // U+2060 - U+2067 + 0x77777777, // U+2068 - U+206F + 0x77777777, // U+2070 - U+2077 + 0x77777777, // U+2078 - U+207F + 0x77777777, // U+2080 - U+2087 + 0x77777777, // U+2088 - U+208F + 0x77777777, // U+2090 - U+2097 + 0x77777777, // U+2098 - U+209F + 0x77777777, // U+20A0 - U+20A7 + 0x77777777, // U+20A8 - U+20AF + 0x77777777, // U+20B0 - U+20B7 + 0x77777777, // 
U+20B8 - U+20BF + 0x77777777, // U+20C0 - U+20C7 + 0x77777777, // U+20C8 - U+20CF + 0x77777777, // U+20D0 - U+20D7 + 0x77777777, // U+20D8 - U+20DF + 0x77777777, // U+20E0 - U+20E7 + 0x77777777, // U+20E8 - U+20EF + 0x77777777, // U+20F0 - U+20F7 + 0x77777777, // U+20F8 - U+20FF +}; + +static const uint32_t gLBClass21[32] = { + 0x77777777, // U+2100 - U+2107 + 0x77777777, // U+2108 - U+210F + 0x73777777, // U+2110 - U+2117 + 0x77777777, // U+2118 - U+211F + 0x77777777, // U+2120 - U+2127 + 0x77777777, // U+2128 - U+212F + 0x77777777, // U+2130 - U+2137 + 0x77777777, // U+2138 - U+213F + 0x77777777, // U+2140 - U+2147 + 0x77777777, // U+2148 - U+214F + 0x77777777, // U+2150 - U+2157 + 0x77777777, // U+2158 - U+215F + 0x55555555, // U+2160 - U+2167 + 0x55555555, // U+2168 - U+216F + 0x55555555, // U+2170 - U+2177 + 0x55555555, // U+2178 - U+217F + 0x77777777, // U+2180 - U+2187 + 0x77777777, // U+2188 - U+218F + 0x77777777, // U+2190 - U+2197 + 0x77777777, // U+2198 - U+219F + 0x77777777, // U+21A0 - U+21A7 + 0x77777777, // U+21A8 - U+21AF + 0x77777777, // U+21B0 - U+21B7 + 0x77777777, // U+21B8 - U+21BF + 0x77777777, // U+21C0 - U+21C7 + 0x77777777, // U+21C8 - U+21CF + 0x77777777, // U+21D0 - U+21D7 + 0x77777777, // U+21D8 - U+21DF + 0x77777777, // U+21E0 - U+21E7 + 0x77777777, // U+21E8 - U+21EF + 0x77777777, // U+21F0 - U+21F7 + 0x77777777, // U+21F8 - U+21FF +}; + +static const uint32_t gLBClass30[32] = { + 0x55155115, // U+3000 - U+3007 + 0x10101010, // U+3008 - U+300F + 0x10105510, // U+3010 - U+3017 + 0x11011010, // U+3018 - U+301F + 0x55555555, // U+3020 - U+3027 + 0x55555555, // U+3028 - U+302F + 0x55555555, // U+3030 - U+3037 + 0x55555555, // U+3038 - U+303F + 0x15151515, // U+3040 - U+3047 + 0x55555515, // U+3048 - U+304F + 0x55555555, // U+3050 - U+3057 + 0x55555555, // U+3058 - U+305F + 0x55551555, // U+3060 - U+3067 + 0x55555555, // U+3068 - U+306F + 0x55555555, // U+3070 - U+3077 + 0x55555555, // U+3078 - U+307F + 0x15151555, // U+3080 - U+3087 + 
0x51555555, // U+3088 - U+308F + 0x55555555, // U+3090 - U+3097 + 0x51111115, // U+3098 - U+309F + 0x15151515, // U+30A0 - U+30A7 + 0x55555515, // U+30A8 - U+30AF + 0x55555555, // U+30B0 - U+30B7 + 0x55555555, // U+30B8 - U+30BF + 0x55551555, // U+30C0 - U+30C7 + 0x55555555, // U+30C8 - U+30CF + 0x55555555, // U+30D0 - U+30D7 + 0x55555555, // U+30D8 - U+30DF + 0x15151555, // U+30E0 - U+30E7 + 0x51555555, // U+30E8 - U+30EF + 0x51155555, // U+30F0 - U+30F7 + 0x51111555, // U+30F8 - U+30FF +}; + +static const uint32_t gLBClass0E[32] = { + 0x88888888, // U+0E00 - U+0E07 + 0x88888888, // U+0E08 - U+0E0F + 0x88888888, // U+0E10 - U+0E17 + 0x88888888, // U+0E18 - U+0E1F + 0x88888888, // U+0E20 - U+0E27 + 0x18888888, // U+0E28 - U+0E2F + 0x88888888, // U+0E30 - U+0E37 + 0x08888888, // U+0E38 - U+0E3F + 0x81888888, // U+0E40 - U+0E47 + 0x78888888, // U+0E48 - U+0E4F + 0x66666666, // U+0E50 - U+0E57 + 0x88881166, // U+0E58 - U+0E5F + 0x88888888, // U+0E60 - U+0E67 + 0x88888888, // U+0E68 - U+0E6F + 0x88888888, // U+0E70 - U+0E77 + 0x88888888, // U+0E78 - U+0E7F + 0x88888888, // U+0E80 - U+0E87 + 0x88888888, // U+0E88 - U+0E8F + 0x88888888, // U+0E90 - U+0E97 + 0x88888888, // U+0E98 - U+0E9F + 0x88888888, // U+0EA0 - U+0EA7 + 0x18888888, // U+0EA8 - U+0EAF + 0x88888888, // U+0EB0 - U+0EB7 + 0x88888888, // U+0EB8 - U+0EBF + 0x81888888, // U+0EC0 - U+0EC7 + 0x88888888, // U+0EC8 - U+0ECF + 0x66666666, // U+0ED0 - U+0ED7 + 0x88888866, // U+0ED8 - U+0EDF + 0x88888888, // U+0EE0 - U+0EE7 + 0x88888888, // U+0EE8 - U+0EEF + 0x88888888, // U+0EF0 - U+0EF7 + 0x88888888, // U+0EF8 - U+0EFF +}; + +static const uint32_t gLBClass17[32] = { + 0x77777777, // U+1700 - U+1707 + 0x77777777, // U+1708 - U+170F + 0x77777777, // U+1710 - U+1717 + 0x77777777, // U+1718 - U+171F + 0x77777777, // U+1720 - U+1727 + 0x77777777, // U+1728 - U+172F + 0x70077777, // U+1730 - U+1737 + 0x77777777, // U+1738 - U+173F + 0x77777777, // U+1740 - U+1747 + 0x77777777, // U+1748 - U+174F + 0x77777777, // U+1750 
- U+1757 + 0x77777777, // U+1758 - U+175F + 0x77777777, // U+1760 - U+1767 + 0x77777777, // U+1768 - U+176F + 0x77777777, // U+1770 - U+1777 + 0x77777777, // U+1778 - U+177F + 0x88888888, // U+1780 - U+1787 + 0x88888888, // U+1788 - U+178F + 0x88888888, // U+1790 - U+1797 + 0x88888888, // U+1798 - U+179F + 0x88888888, // U+17A0 - U+17A7 + 0x88888888, // U+17A8 - U+17AF + 0x88888888, // U+17B0 - U+17B7 + 0x88888888, // U+17B8 - U+17BF + 0x88888888, // U+17C0 - U+17C7 + 0x88888888, // U+17C8 - U+17CF + 0x88118888, // U+17D0 - U+17D7 + 0x77888181, // U+17D8 - U+17DF + 0x88888888, // U+17E0 - U+17E7 + 0x77777788, // U+17E8 - U+17EF + 0x88888888, // U+17F0 - U+17F7 + 0x77777788, // U+17F8 - U+17FF +}; diff --git a/intl/lwbrk/jisx4051pairtable.txt b/intl/lwbrk/jisx4051pairtable.txt new file mode 100644 index 0000000000..2bae1b18fe --- /dev/null +++ b/intl/lwbrk/jisx4051pairtable.txt @@ -0,0 +1,286 @@ + + + +/* + + Simplification of Pair Table in JIS X 4051 + + 1. The Original Table - in 4.1.3 + + In JIS X 4051, the pair table is defined as below + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 14 15 16 17 18 19 20 + * # * # + 1 X X X X X X X X X X X X X X X X X X X X X E + 2 X X X X X X + 3 X X X X X X + 4 X X X X X X + 5 X X X X X X + 6 X X X X X X + 7 X X X X X X X + 8 X X X X X X E + 9 X X X X X X + 10 X X X X X X + 11 X X X X X X + 12 X X X X X X + 13 X X X X X X X + 14 X X X X X X X + 15 X X X X X X X X X + 16 X X X X X X X X + 17 X X X X X E + 18 X X X X X X X X X + 19 X E E E E E X X X X X X X X X X X X E X E E + 20 X X X X X E + + * Same Char + # Other Char + + 2. 
Simplified by removing the classes which we do not care about + + However, since we do not care about class 13(Subscript), 14(Ruby), + 19(split line note begin quote), and 20(split line note end quote) + we can simplify this pair table into the following + + Class of + Leading Class of Trailing Char Class + Char + + 1 2 3 4 5 6 7 8 9 10 11 12 15 16 17 18 + + 1 X X X X X X X X X X X X X X X X + 2 X X X X X + 3 X X X X X + 4 X X X X X + 5 X X X X X + 6 X X X X X + 7 X X X X X X + 8 X X X X X X + 9 X X X X X + 10 X X X X X + 11 X X X X X + 12 X X X X X + 15 X X X X X X X X + 16 X X X X X X X + 17 X X X X X + 18 X X X X X X X X + + 3. Simplified by merging classes + + After the second simplification, the pair table has some duplication + a. class 2, 3, 4, 5, 6, are the same- we can merge them + b. class 10, 11, 12, 17 are the same- we can merge them + + + Class of + Leading Class of Trailing Char Class + Char + + 1 [a] 7 8 9 [b]15 16 18 + + 1 X X X X X X X X X + [a] X + 7 X X + 8 X X + 9 X + [b] X + 15 X X X X + 16 X X X + 18 X X X X + + + 4. 
Now we use one bit to encode whether it is breakable, and use 2 bytes + for one row, then the bit table will look like: + + 18 <- 1 + + 1 0000 0001 1111 1111 = 0x01FF + [a] 0000 0000 0000 0010 = 0x0002 + 7 0000 0000 0000 0110 = 0x0006 + 8 0000 0000 0100 0010 = 0x0042 + 9 0000 0000 0000 0010 = 0x0002 + [b] 0000 0000 0000 0010 = 0x0042 + 15 0000 0001 0101 0010 = 0x0152 + 16 0000 0001 1000 0010 = 0x0182 + 18 0000 0001 1100 0010 = 0x01C2 + +*/ + +static uint16_t gJISx4051SimplifiedPair[9] = { + 0x01FF, 0x0002, 0x0006, 0x0042, 0x0002, 0x0042, 0x0152, 0x0182, 0x01C2 +}; + +PRBool XXXX::ClassesToPair(nsJISx4051Cls aCls1, nsJISx4051Cls aCls2) +{ + NS_ASSERTION( (aCls1 < 9), "invalid class"); + NS_ASSERTION( (aCls2 < 9), "invalid class"); + return ( 0 != (gJISx4051SimplifiedPair[aCls1] & (1L << aCls2) )); +} + + +#define X4051_IS_DIGIT(u) ((0x0030 <= (u)) && ((u) <= 0x0039)) + +nsJISx4051Cls XXXX::GetClass( + PRUnichar aChar, PRUnichar aBefore = 0, PRUnichar aAfter = 0) +{ + // take care the special case in cls 15 + if( ((0x2C == aChar) || (0x2E == aChar)) && + (X4051_IS_DIGIT(aBefore)) && (X4051_IS_DIGIT(aAfter))) + { + return kJISx4051Cls_15; + } + + nsJISx4051Cls cls; + if(gSingle->Lookup(aChar, &cls)) + return cls; + + if(gRange->Lookup(aChar, &cls)) + return cls; + + return kJISx4051Cls_15; +} + + +typedef enum { + kJISx4051Cls_1 = 0, + kJISx4051Cls_2 = 1, + kJISx4051Cls_3 = 1, + kJISx4051Cls_4 = 1, + kJISx4051Cls_5 = 1, + kJISx4051Cls_6 = 1, + kJISx4051Cls_7 = 2, + kJISx4051Cls_8 = 3, + kJISx4051Cls_9 = 4, + kJISx4051Cls_10 = 5, + kJISx4051Cls_11 = 5, + kJISx4051Cls_12 = 5, + // kJISx4051Cls_13 = 0, + // kJISx4051Cls_14 = 0, + kJISx4051Cls_15 = 6, + kJISx4051Cls_16 = 7, + kJISx4051Cls_17 = 5, + kJISx4051Cls_18 = 8, + // kJISx4051Cls_19 = 0, + // kJISx4051Cls_20 = 0 +} nsJISx4051Cls; + + + // Table 2 + YYYY(kJISx4051Cls_1 , 0x0028), + YYYY(kJISx4051Cls_1 , 0x005B), + YYYY(kJISx4051Cls_1 , 0x007B), + YYYY(kJISx4051Cls_1 , 0x2018), + YYYY(kJISx4051Cls_1 , 0x201B), + 
YYYY(kJISx4051Cls_1 , 0x201C), + YYYY(kJISx4051Cls_1 , 0x201F), + YYYY(kJISx4051Cls_1 , 0x3008), + YYYY(kJISx4051Cls_1 , 0x300A), + YYYY(kJISx4051Cls_1 , 0x300C), + YYYY(kJISx4051Cls_1 , 0x300E), + YYYY(kJISx4051Cls_1 , 0x3010), + YYYY(kJISx4051Cls_1 , 0x3014), + YYYY(kJISx4051Cls_1 , 0x3016), + YYYY(kJISx4051Cls_1 , 0x3018), + YYYY(kJISx4051Cls_1 , 0x301A), + YYYY(kJISx4051Cls_1 , 0x301D), + + // Table 3 + YYYY(kJISx4051Cls_2 , 0x0029), + YYYY(kJISx4051Cls_2 , 0x002C), + YYYY(kJISx4051Cls_2 , 0x005D), + YYYY(kJISx4051Cls_2 , 0x007D), + YYYY(kJISx4051Cls_2 , 0x2019), + YYYY(kJISx4051Cls_2 , 0x201A), + YYYY(kJISx4051Cls_2 , 0x201D), + YYYY(kJISx4051Cls_2 , 0x201E), + YYYY(kJISx4051Cls_2 , 0x3001), + YYYY(kJISx4051Cls_2 , 0x3009), + YYYY(kJISx4051Cls_2 , 0x300B), + YYYY(kJISx4051Cls_2 , 0x300D), + YYYY(kJISx4051Cls_2 , 0x300F), + YYYY(kJISx4051Cls_2 , 0x3011), + YYYY(kJISx4051Cls_2 , 0x3015), + YYYY(kJISx4051Cls_2 , 0x3017), + YYYY(kJISx4051Cls_2 , 0x3019), + YYYY(kJISx4051Cls_2 , 0x301B), + YYYY(kJISx4051Cls_2 , 0x301E), + YYYY(kJISx4051Cls_2 , 0x301F), + + // Table 4 + YYYY(kJISx4051Cls_3 , 0x203C), + YYYY(kJISx4051Cls_3 , 0x2044), + YYYY(kJISx4051Cls_3 , 0x301C), + YYYY(kJISx4051Cls_3 , 0x3041), + YYYY(kJISx4051Cls_3 , 0x3043), + YYYY(kJISx4051Cls_3 , 0x3045), + YYYY(kJISx4051Cls_3 , 0x3047), + YYYY(kJISx4051Cls_3 , 0x3049), + YYYY(kJISx4051Cls_3 , 0x3063), + YYYY(kJISx4051Cls_3 , 0x3083), + YYYY(kJISx4051Cls_3 , 0x3085), + YYYY(kJISx4051Cls_3 , 0x3087), + YYYY(kJISx4051Cls_3 , 0x308E), + YYYY(kJISx4051Cls_3 , 0x309D), + YYYY(kJISx4051Cls_3 , 0x309E), + YYYY(kJISx4051Cls_3 , 0x30A1), + YYYY(kJISx4051Cls_3 , 0x30A3), + YYYY(kJISx4051Cls_3 , 0x30A5), + YYYY(kJISx4051Cls_3 , 0x30A7), + YYYY(kJISx4051Cls_3 , 0x30A9), + YYYY(kJISx4051Cls_3 , 0x30C3), + YYYY(kJISx4051Cls_3 , 0x30E3), + YYYY(kJISx4051Cls_3 , 0x30E5), + YYYY(kJISx4051Cls_3 , 0x30E7), + YYYY(kJISx4051Cls_3 , 0x30EE), + YYYY(kJISx4051Cls_3 , 0x30F5), + YYYY(kJISx4051Cls_3 , 0x30F6), + YYYY(kJISx4051Cls_3 , 
0x30FC), + YYYY(kJISx4051Cls_3 , 0x30FD), + YYYY(kJISx4051Cls_3 , 0x30FE), + + // Table 5 + YYYY(kJISx4051Cls_4 , 0x0021), + YYYY(kJISx4051Cls_4 , 0x003F), + + // Table 6 + YYYY(kJISx4051Cls_5 , 0x003A), + YYYY(kJISx4051Cls_5 , 0x003B), + YYYY(kJISx4051Cls_5 , 0x30FB), + + // Table 7 + YYYY(kJISx4051Cls_6 , 0x002E), + YYYY(kJISx4051Cls_6 , 0x3002), + + // Table 8 + YYYY(kJISx4051Cls_7 , 0x2014), + YYYY(kJISx4051Cls_7 , 0x2024), + YYYY(kJISx4051Cls_7 , 0x2025), + YYYY(kJISx4051Cls_7 , 0x2026), + + // Table 9 + YYYY(kJISx4051Cls_8 , 0x0024), + YYYY(kJISx4051Cls_8 , 0x00A3), + YYYY(kJISx4051Cls_8 , 0x00A5), + YYYY(kJISx4051Cls_8 , 0x2116), + + // Table 10 + YYYY(kJISx4051Cls_9 , 0x0025), + YYYY(kJISx4051Cls_9 , 0x00A2), + YYYY(kJISx4051Cls_9 , 0x00B0), + YYYY(kJISx4051Cls_9 , 0x2030), + YYYY(kJISx4051Cls_9 , 0x2031), + YYYY(kJISx4051Cls_9 , 0x2032), + YYYY(kJISx4051Cls_9 , 0x2033), + + // Table 1 + YYYY(kJISx4051Cls_10, 0x3000), + + // Table 1 + ZZZZ(kJISx4051Cls_11, 0x3000), + + + + diff --git a/intl/lwbrk/moz.build b/intl/lwbrk/moz.build new file mode 100644 index 0000000000..359efe8f5b --- /dev/null +++ b/intl/lwbrk/moz.build @@ -0,0 +1,55 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# vim: set filetype=python: +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +TEST_DIRS += ["gtest"] + +EXPORTS.mozilla.intl += [ + "LineBreaker.h", + "nsComplexBreaker.h", + "Segmenter.h", + "WordBreaker.h", +] + +UNIFIED_SOURCES += [ + "LineBreaker.cpp", + "Segmenter.cpp", + "WordBreaker.cpp", +] + +SOURCES += [ + "nsComplexBreaker.cpp", +] + +if CONFIG["MOZ_WIDGET_TOOLKIT"] == "gtk": + SOURCES += [ + "nsPangoBreaker.cpp", + ] + CXXFLAGS += CONFIG["MOZ_PANGO_CFLAGS"] +elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "windows": + SOURCES += [ + "nsUniscribeBreaker.cpp", + ] +elif CONFIG["MOZ_WIDGET_TOOLKIT"] == "cocoa": + UNIFIED_SOURCES += [ + "nsCarbonBreaker.cpp", + ] +else: + SOURCES += [ + "nsRuleBreaker.cpp", + "rulebrk.c", + ] + +if CONFIG["JS_HAS_INTL_API"] and CONFIG["MOZ_ICU4X"]: + LOCAL_INCLUDES += [ + "/intl/icu_capi/cpp/include", + ] + # Disable warnings when including C++ headers of ICU4X. + # - https://github.com/rust-diplomat/diplomat/issues/277 + CXXFLAGS += [ + "-Wno-mismatched-tags", + ] + +FINAL_LIBRARY = "xul" diff --git a/intl/lwbrk/nsCarbonBreaker.cpp b/intl/lwbrk/nsCarbonBreaker.cpp new file mode 100644 index 0000000000..d1d81b2578 --- /dev/null +++ b/intl/lwbrk/nsCarbonBreaker.cpp @@ -0,0 +1,43 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include +#include +#include "nsDebug.h" +#include "nscore.h" + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + + memset(aBreakBefore, 0, aLength * sizeof(uint8_t)); + + CFStringRef str = ::CFStringCreateWithCharactersNoCopy( + kCFAllocatorDefault, reinterpret_cast(aText), aLength, + kCFAllocatorNull); + if (!str) { + return; + } + + CFStringTokenizerRef st = ::CFStringTokenizerCreate( + kCFAllocatorDefault, str, ::CFRangeMake(0, aLength), + kCFStringTokenizerUnitLineBreak, nullptr); + if (!st) { + ::CFRelease(str); + return; + } + + CFStringTokenizerTokenType tt = ::CFStringTokenizerAdvanceToNextToken(st); + while (tt != kCFStringTokenizerTokenNone) { + CFRange r = ::CFStringTokenizerGetCurrentTokenRange(st); + if (r.location != 0) { // Ignore leading edge + aBreakBefore[r.location] = true; + } + tt = CFStringTokenizerAdvanceToNextToken(st); + } + + ::CFRelease(st); + ::CFRelease(str); +} diff --git a/intl/lwbrk/nsComplexBreaker.cpp b/intl/lwbrk/nsComplexBreaker.cpp new file mode 100644 index 0000000000..eac44aab6d --- /dev/null +++ b/intl/lwbrk/nsComplexBreaker.cpp @@ -0,0 +1,174 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsComplexBreaker.h" + +#include + +#include "MainThreadUtils.h" +#include "mozilla/Assertions.h" +#include "mozilla/Services.h" +#include "mozilla/StaticPtr.h" +#include "mozilla/UniquePtr.h" +#include "nsTHashMap.h" +#include "nsIObserver.h" +#include "nsIObserverService.h" +#include "nsString.h" +#include "nsTArray.h" +#include "nsThreadUtils.h" + +using namespace mozilla; + +using CacheMap = nsTHashMap>; + +static StaticAutoPtr sBreakCache; + +// The underlying hash table extends capacity, when it hits .75 full and uses +// powers of 2 for sizing. This cache limit will hopefully mean most pages fit +// within the cache, while keeping it to a reasonable size. Also by holding the +// previous cache even if pages are bigger than the cache the most commonly used +// should still remain fast. +static const int kCacheLimit = 3072; + +static StaticAutoPtr sOldBreakCache; + +// Simple runnable to delete caches off the main thread. +class CacheDeleter final : public Runnable { + public: + explicit CacheDeleter(CacheMap* aCacheToDelete) + : Runnable("ComplexBreaker CacheDeleter"), + mCacheToDelete(aCacheToDelete) {} + + NS_IMETHOD Run() override { + MOZ_ASSERT(!NS_IsMainThread()); + mCacheToDelete = nullptr; + return NS_OK; + } + + private: + UniquePtr mCacheToDelete; +}; + +class ComplexBreakObserver final : public nsIObserver { + ~ComplexBreakObserver() = default; + + public: + NS_DECL_ISUPPORTS + NS_DECL_NSIOBSERVER +}; + +NS_IMPL_ISUPPORTS(ComplexBreakObserver, nsIObserver) + +NS_IMETHODIMP ComplexBreakObserver::Observe(nsISupports* aSubject, + const char* aTopic, + const char16_t* aData) { + MOZ_ASSERT(NS_IsMainThread()); + + if (strcmp(aTopic, "memory-pressure") == 0) { + if (sOldBreakCache) { + // We have an old cache, so delete that one first. 
+ NS_DispatchBackgroundTask( + MakeAndAddRef(sOldBreakCache.forget())); + } else if (sBreakCache) { + NS_DispatchBackgroundTask( + MakeAndAddRef(sBreakCache.forget())); + } + } + + return NS_OK; +} + +void ComplexBreaker::Initialize() { + MOZ_ASSERT(NS_IsMainThread()); + + nsCOMPtr obs = services::GetObserverService(); + if (obs) { + obs->AddObserver(new ComplexBreakObserver(), "memory-pressure", false); + } +} + +void ComplexBreaker::Shutdown() { + MOZ_ASSERT(NS_IsMainThread()); + + sBreakCache = nullptr; + sOldBreakCache = nullptr; +} + +static void AddToCache(const char16_t* aText, uint32_t aLength, + nsTArray aBreakBefore) { + if (NS_WARN_IF(!sBreakCache->InsertOrUpdate( + nsString(aText, aLength), std::move(aBreakBefore), fallible))) { + return; + } + + if (sBreakCache->Count() <= kCacheLimit) { + return; + } + + if (sOldBreakCache) { + NS_DispatchBackgroundTask( + MakeAndAddRef(sOldBreakCache.forget())); + } + + sOldBreakCache = sBreakCache.forget(); +} + +static void CopyAndFill(const nsTArray& aCachedBreakBefore, + uint8_t* aBreakBefore, uint8_t* aEndBreakBefore) { + auto* startFill = std::copy(aCachedBreakBefore.begin(), + aCachedBreakBefore.end(), aBreakBefore); + std::fill(startFill, aEndBreakBefore, false); +} + +void ComplexBreaker::GetBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + // It is believed that this is only called on the main thread, so we don't + // need to lock the caching structures. A diagnostic assert is used in case + // our tests don't exercise all code paths. + MOZ_DIAGNOSTIC_ASSERT(NS_IsMainThread()); + + MOZ_ASSERT(aText, "aText shouldn't be null"); + MOZ_ASSERT(aLength, "aLength shouldn't be zero"); + MOZ_ASSERT(aBreakBefore, "aBreakBefore shouldn't be null"); + + // If we have a cache then check it, if not then create it. 
+ if (sBreakCache) { + if (auto entry = + sBreakCache->Lookup(nsDependentSubstring(aText, aLength))) { + auto& breakBefore = entry.Data(); + CopyAndFill(breakBefore, aBreakBefore, aBreakBefore + aLength); + return; + } + } else { + sBreakCache = new CacheMap(); + } + + // We keep the previous cache when we hit our limit, so that the most recently + // used fragments remain fast. + if (sOldBreakCache) { + auto breakBefore = + sOldBreakCache->Extract(nsDependentSubstring(aText, aLength)); + if (breakBefore) { + CopyAndFill(*breakBefore, aBreakBefore, aBreakBefore + aLength); + // Move the entry to the latest cache. + AddToCache(aText, aLength, std::move(*breakBefore)); + return; + } + } + + NS_GetComplexLineBreaks(aText, aLength, aBreakBefore); + + // As a very simple memory saving measure we trim off trailing elements that + // are false before caching. + auto* afterLastTrue = aBreakBefore + aLength; + while (!*(afterLastTrue - 1)) { + if (--afterLastTrue == aBreakBefore) { + break; + } + } + + AddToCache(aText, aLength, + nsTArray(aBreakBefore, afterLastTrue - aBreakBefore)); +} diff --git a/intl/lwbrk/nsComplexBreaker.h b/intl/lwbrk/nsComplexBreaker.h new file mode 100644 index 0000000000..4120217a6e --- /dev/null +++ b/intl/lwbrk/nsComplexBreaker.h @@ -0,0 +1,36 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsComplexBreaker_h__ +#define nsComplexBreaker_h__ + +#include +/** + * Find line break opportunities in aText[] of aLength characters, + * filling boolean values indicating line break opportunities for + * corresponding charactersin aBreakBefore[] on return. 
+ */ +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore); + +class ComplexBreaker { + public: + static void Initialize(); + + static void Shutdown(); + + /** + * A wrapper around the platform specific NS_GetComplexLineBreaks, which adds + * caching of the results to mitigate sometimes expensive implementation. + * @param aText - pointer to the text to process for possible line breaks + * @param aLength - the length to process + * @param aBreakBefore - result array correlated to aText, where element is + * set to true if line can be broken before + * corresponding character in aText and false otherwise + */ + static void GetBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore); +}; + +#endif /* nsComplexBreaker_h__ */ diff --git a/intl/lwbrk/nsLWBrkCIID.h b/intl/lwbrk/nsLWBrkCIID.h new file mode 100644 index 0000000000..b612155ef0 --- /dev/null +++ b/intl/lwbrk/nsLWBrkCIID.h @@ -0,0 +1,28 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/
+#ifndef nsLWBrkCIID_h__
+#define nsLWBrkCIID_h__
+
+// {2BF64764-997F-450D-AF96-3028D1A902B0}
+#define NS_LBRK_CID                                  \
+  {                                                  \
+    0x2bf64764, 0x997f, 0x450d, {                    \
+      0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0  \
+    }                                                \
+  }
+
+#define NS_LBRK_CONTRACTID "@mozilla.org/intl/lbrk;1"
+
+// {2BF64765-997F-450D-AF96-3028D1A902B0}
+#define NS_WBRK_CID                                  \
+  {                                                  \
+    0x2bf64765, 0x997f, 0x450d, {                    \
+      0xaf, 0x96, 0x30, 0x28, 0xd1, 0xa9, 0x2, 0xb0  \
+    }                                                \
+  }
+
+#define NS_WBRK_CONTRACTID "@mozilla.org/intl/wbrk;1"
+
+#endif
diff --git a/intl/lwbrk/nsPangoBreaker.cpp b/intl/lwbrk/nsPangoBreaker.cpp
new file mode 100644
index 0000000000..e098a11e58
--- /dev/null
+++ b/intl/lwbrk/nsPangoBreaker.cpp
@@ -0,0 +1,69 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsComplexBreaker.h"
+
+#include
+#include "nsUTF8Utils.h"
+#include "nsString.h"
+#include "nsTArray.h"
+
+// Pango implementation of NS_GetComplexLineBreaks(): for each character of
+// aText, records Pango's is_line_break attribute in aBreakBefore[] at the
+// UTF-16 offset where that character starts. All other entries are left 0.
+void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
+                             uint8_t* aBreakBefore) {
+  NS_ASSERTION(aText, "aText shouldn't be null");
+
+  memset(aBreakBefore, uint8_t(false), aLength * sizeof(uint8_t));
+
+  AutoTArray attrBuffer;
+  // XXX(Bug 1631371) Check if this should use a fallible operation as it
+  // pretended earlier.
+  attrBuffer.AppendElements(aLength + 1);
+  // `PangoLogAttr` doesn't have a default constructor (it is a C struct), so
+  // we need to manually initialize the new elements. See bug 1808182.
+  memset(attrBuffer.Elements(), 0, attrBuffer.Length() * sizeof(PangoLogAttr));
+
+  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);
+
+  const gchar* p = aUTF8.Data();
+  const gchar* end = p + aUTF8.Length();
+  uint32_t u16Offset = 0;
+
+  static PangoLanguage* language = pango_language_from_string("en");
+
+  while (p < end) {
+    PangoLogAttr* attr = attrBuffer.Elements();
+    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());
+
+    while (p < end) {
+      aBreakBefore[u16Offset++] = attr->is_line_break;
+      ++attr;
+
+      // We're iterating over text obtained from NS_ConvertUTF16toUTF8,
+      // so we know we have valid UTF-8 and don't need to check for
+      // errors.
+      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);
+
+      // A code point above U+FFFF is a surrogate pair in aText, i.e. two
+      // UTF-16 code units for a single Pango attr. The break flag was stored
+      // at the high surrogate above; there is no break opportunity inside the
+      // pair, so clear the low surrogate's entry and step over it to stay in
+      // sync with p. The bounds check keeps the write inside aBreakBefore.
+      if (ch > 0xFFFF && u16Offset < aLength) {
+        aBreakBefore[u16Offset++] = false;  // Skip low surrogate
+      }
+
+      if (!ch) {
+        // pango_break (pango 1.16.2) only analyses text before the
+        // first NUL (but sets one extra attr). Workaround loop to call
+        // pango_break again to analyse after the NUL is done somewhere else
+        // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
+        // So, we do the same here for pango_get_log_attrs.
+        break;
+      }
+    }
+  }
+}
diff --git a/intl/lwbrk/nsRuleBreaker.cpp b/intl/lwbrk/nsRuleBreaker.cpp
new file mode 100644
index 0000000000..641f094360
--- /dev/null
+++ b/intl/lwbrk/nsRuleBreaker.cpp
@@ -0,0 +1,18 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "nsComplexBreaker.h" +#include "nsDebug.h" + +#define TH_UNICODE +#include "rulebrk.h" + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + + for (uint32_t i = 0; i < aLength; i++) + aBreakBefore[i] = (0 == TrbWordBreakPos(aText, i, aText + i, aLength - i)); +} diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp new file mode 100644 index 0000000000..9e7a759537 --- /dev/null +++ b/intl/lwbrk/nsUniscribeBreaker.cpp @@ -0,0 +1,146 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsComplexBreaker.h" + +#include + +#include + +#include "nsUTF8Utils.h" +#include "nsString.h" +#include "nsTArray.h" + +#if defined(MOZ_SANDBOX) +# include "mozilla/WindowsProcessMitigations.h" +# include "mozilla/SandboxSettings.h" +# include "mozilla/sandboxTarget.h" +# include "nsXULAppAPI.h" + +# if defined(MOZ_DEBUG) +# include "mozilla/StaticPrefs_intl.h" +# endif +#endif + +using namespace mozilla; + +#if defined(MOZ_SANDBOX) +static bool UseBrokeredLineBreaking() { + // If win32k lockdown is enabled we can't use Uniscribe in this process. Also + // if the sandbox is above a certain level we can't load the required DLLs + // without other intervention. Given that it looks like we are likely to have + // win32k lockdown enabled first, using the brokered call for people testing + // this case also makes most sense. 
+ static bool sUseBrokeredLineBreaking = + IsWin32kLockedDown() || + (XRE_IsContentProcess() && GetEffectiveContentSandboxLevel() >= 20); + + return sUseBrokeredLineBreaking; +} +#endif + +void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength, + uint8_t* aBreakBefore) { + NS_ASSERTION(aText, "aText shouldn't be null"); + +#if defined(MOZ_SANDBOX) + if (UseBrokeredLineBreaking()) { + // We can't use Uniscribe, so use a brokered call. Use of Uniscribe will be + // replaced in bug 1684927. + char16ptr_t text = aText; + if (!SandboxTarget::Instance()->GetComplexLineBreaks(text, aLength, + aBreakBefore)) { + NS_WARNING("Brokered line break failed, breaks might be incorrect."); + } + + return; + } +#endif + + int outItems = 0; + HRESULT result; + AutoTArray items; + char16ptr_t text = aText; + + memset(aBreakBefore, false, aLength); + + items.AppendElements(64); + + do { + result = ScriptItemize(text, aLength, items.Length(), nullptr, nullptr, + items.Elements(), &outItems); + + if (result == E_OUTOFMEMORY) { + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + items.AppendElements(items.Length()); + } + } while (result == E_OUTOFMEMORY); + + for (int iItem = 0; iItem < outItems; ++iItem) { + uint32_t endOffset = + (iItem + 1 == outItems ? aLength : items[iItem + 1].iCharPos); + uint32_t startOffset = items[iItem].iCharPos; + AutoTArray sla; + + // XXX(Bug 1631371) Check if this should use a fallible operation as it + // pretended earlier. + sla.AppendElements(endOffset - startOffset); + + if (ScriptBreak(text + startOffset, endOffset - startOffset, + &items[iItem].a, sla.Elements()) < 0) + return; + + // We don't want to set a potential break position at the start of text; + // that's the responsibility of a higher level. + for (uint32_t j = startOffset ? 
0 : 1; j + startOffset < endOffset; ++j) { + aBreakBefore[j + startOffset] = sla[j].fSoftBreak; + } + } + +#if defined(MOZ_DEBUG) && defined(MOZ_SANDBOX) + // When tests are enabled and pref is set, we compare the line breaks returned + // from the Uniscribe breaker in the content process, with the ones returned + // from the brokered call to the parent. If they differ we crash so we can + // test using a crashtest. + if (!StaticPrefs::intl_compare_against_brokered_complex_line_breaks() || + !XRE_IsContentProcess()) { + return; + } + + nsTArray brokeredBreaks(aLength); + brokeredBreaks.AppendElements(aLength); + if (!SandboxTarget::Instance()->GetComplexLineBreaks( + text, aLength, brokeredBreaks.Elements())) { + MOZ_CRASH("Brokered GetComplexLineBreaks failed."); + } + + bool mismatch = false; + for (uint32_t i = 0; i < aLength; ++i) { + if (aBreakBefore[i] != brokeredBreaks[i]) { + mismatch = true; + break; + } + } + + if (mismatch) { + nsCString line("uniscribe: "); + // The logging here doesn't handle surrogates, but we only have tests using + // Thai currently, which is BMP-only. + for (uint32_t i = 0; i < aLength; ++i) { + if (aBreakBefore[i]) line.Append('#'); + line.Append(NS_ConvertUTF16toUTF8(aText + i, 1).get()); + } + printf_stderr("%s\n", line.get()); + line.Assign("brokered : "); + for (uint32_t i = 0; i < aLength; ++i) { + if (brokeredBreaks[i]) line.Append('#'); + line.Append(NS_ConvertUTF16toUTF8(aText + i, 1).get()); + } + printf_stderr("%s\n", line.get()); + MOZ_CRASH("Brokered breaks did not match."); + } +#endif +} diff --git a/intl/lwbrk/rulebrk.c b/intl/lwbrk/rulebrk.c new file mode 100644 index 0000000000..d7574b929f --- /dev/null +++ b/intl/lwbrk/rulebrk.c @@ -0,0 +1,388 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#define TH_UNICODE + +#include +#include +#include +#include "th_char.h" +#define th_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) +#define th_isspace(c) ((c) == ' ' || (c) == '\t') + +/* +///////////////////////////////////////////////// +// Thai character type array +*/ + +typedef unsigned short twb_t; +extern const twb_t _TwbType[0x100 - 0xa0]; + +/* +// bit definition +*/ + +#define VRS 0x0001 +#define VRE 0x0002 +#define VRX 0x0004 + +#define VRA 0x0008 + +#define VLA 0x0010 +#define VLO 0x0020 +#define VLI 0x0040 + +#define VC 0x0080 + +#define CC 0x0100 +#define CS 0x0200 + +#define C2 0x0400 +#define CHB 0x0800 +#define CHE 0x1000 + +#define MT 0x2000 +/* +//_#define me 0x2000 +*/ +#define M 0x4000 + +#define T 0x8000 + +#define VL (VLA | VLO | VLI) +#define VR (VRS | VRE | VRX) +#define NE (VL | VRS) +#define NB (VR | M) +#define V (VL | VR) +#define CX (CC | CS) +#define C (CX | VC) +#define A (C | V | M) + +#define twbtype(c) (_TwbType[th_zcode(c)]) + +#ifndef TRUE +# define TRUE 1 +# define FALSE 0 +#endif +#define RETURN(b) return (b) + +/* +///////////////////////////////////////////////// +*/ + +int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, + int right) +/* const ThBreakIterator *it, const th_char **p)*/ +{ + /* + //int left, right; + //const th_char *s = *p; + */ + const th_char* lstr = pstr + left; + th_char _c[6]; + twb_t _t[6]; +#define c(i) (_c[(i) + 3]) +#define t(i) (_t[(i) + 3]) + int i, j; + + /* + //left = s - it->begin; + */ + if (left < 0) return -1; + /* + //right = (it->end == NULL) ? 
4 : it->begin - s; + */ + if (right < 1) return -1; + + /* + // get c(0), t(0) + */ + c(0) = rstr[0]; /* may be '\0' */ + if (!th_isthai(c(0))) return -1; + t(0) = twbtype(c(0)); + if (!(t(0) & A)) return -1; + + /* + // get c(-1), t(-1) + */ + if (left >= 1) { + c(-1) = lstr[-1]; + if (!th_isthai(c(-1))) return 0; + t(-1) = twbtype(c(-1)); + if (!(t(-1) & A)) return 0; /* handle punctuation marks here */ + } else { + c(-1) = 0; + t(-1) = 0; + } + + /* + // get c(1..2), t(1..2) + */ + for (i = 1; i <= 2; i++) { + if (i >= right) { + c(i) = 0; + t(i) = 0; + } else { + c(i) = rstr[i]; /* may be '\0'; */ + if (!th_isthai(c(i))) + right = i--; + else { + t(i) = twbtype(c(i)); + if (!(t(i) & A)) right = i--; + } + } + } + /* + // get c(-2..-3), t(-2..-3) + */ + for (i = -2, j = -2; i >= -3; j--) { + if (j < -left) { + c(i) = 0; + t(i) = 0; + i--; + } else { + c(i) = lstr[j]; + if (!th_isthai(c(i))) + left = 0; + else { + t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0); + if (!(t(i) & A)) + left = 0; + else { + if ((t(i + 1) & MT) && ((t(i) & VR) || (t(i + 2) & VR))) { + c(i + 1) = c(i); + t(i + 1) = t(i); + } else + i--; + } + } + } + } + + /* + // prohibit the unlikely + */ + if ((t(-1) & C) && (t(0) & C)) { + if ((t(-1) & CHE) || (t(0) & CHB)) return -1; + } + /* + // special case : vlao, C/ sara_a|aa, !sara_a + */ + if ((t(-3) & (VLA | VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) && + (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) + return 0; + + /* + // prohibit break + */ + if (t(0) & NB) return -1; + if (t(-1) & NE) return -1; + + /* + // apply 100% rules + */ + if (t(-1) & VRE) { + if (c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0; + return -1; /* usually too short syllable, part of word */ + } + + if (t(-2) & VRE) return -1; + + if ((t(0) & C) && (t(1) & (VR | MT)) && + (c(2) != TH_THANTHAKHAT)) { /*?C, NB */ + if ((t(-1) & (VRS | VRX)) && c(1) == TH_SARA_I) return -1; /* exception */ + if (t(-1) & (V | M)) return 0; /* !C/ C, NB */ + if (t(-2) & VRS) 
return 0; /* VRS, C / C, NB */ + if (!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */ + if (t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */ + if (t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */ + } + } + if ((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */ + if ((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V | M))) + return 0; /* VRS, C/ !C */ + + if ((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) { + if ((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */ + if ((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */ + } + /* + // apply 90% rules + */ + if (t(0) & VL) return 0; + if (t(1) & VL) return -1; + if (c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) + return 0; + + /* + //return -1; + // apply 80% rules + */ + if (t(0) & CHE) { + if ((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */ + /*if(t(-1) & VRX) return 0; // VRX/ CHE */ + if (t(-1) & VC) return 0; /* VC/ CHE */ + } + if (t(-1) & CHB) { + if ((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */ + if (t(0) & VC) return 0; /* CHB/ VC */ + } + + if ((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */ + if (t(-2) & VLI) + return 0; /* VLI,C/C,VR .*/ + else { /* vlao, C ? 
C , VR */ + if (c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */ + if (t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */ + if (!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */ + } + } + /* C,MT,C */ + if ((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1; + + return -1; +} + +int TrbFollowing(const th_char* begin, int length, int offset) +/* +//(ThBreakIterator *this, int offset) +*/ +{ + const th_char* w = begin + offset; + const th_char* end = begin + length; + while (w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++; + + if (w < end && *w && !th_isthai(*w)) { + int english = FALSE; + while (w < end && *w && !th_isthai(*w) && !th_isspace(*w)) { + if (th_isalpha(*w)) english = TRUE; + w++; + } + if (english || w == end || (!th_isthai(*w) && th_isspace(*w))) + return w - begin; + } + if (w == end || *w == 0 || !th_isthai(*w)) return w - begin; + w++; + if (w < end && *w && th_isthai(*w)) { + int brk = TrbWordBreakPos(begin, w - begin, w, end - w); + while (brk < 0) { + w++; + if (w == end || *w == 0 || !th_isthai(*w)) break; + brk = TrbWordBreakPos(begin, w - begin, w, end - w); + } + if (brk > 0) w += brk; + } + if (w < end && *w && !th_isthai(*w)) { + while (w < end && *w && !th_isthai(*w) && !th_isalpha(*w) && + !th_isspace(*w)) + w++; + } + return w - begin; +} + +/* +///////////////////////////////////////////////// +*/ +const twb_t _TwbType[0x100 - 0xa0] = { +#if 0 +/* 80 € */ T, +/* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +/* 90  */ T, +/* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +#endif + /* a0   */ 0, + /* a1 ¡ */ CS, + /* a2 ¢ */ CS | CHE, + /* a3 £ */ CC | CHE, + /* a4 € */ CS | CHE, + /* a5 ¥ */ CC | CHE, + /* a6 Š */ CS, + /* a7 § */ CS | CHB, + /* a8 š */ CS, + /* a9 © */ CC | CHE, + /* aa ª */ CS, + /* ab « */ CC | CHE, + /* ac ¬ */ CC | CHB | CHE, + /* ad ­ */ CS | CHB, + /* ae ® */ CS | CHB, + /* af ¯ */ CS | CHB, + /* b0 ° */ CS, + /* b1 ± */ CS | CHB | CHE, + /* b2 ² */ CS | CHB | CHE, + /* b3 ³ */ CS | CHB, + /* b4 Ž */ CS, + /* b5 µ 
*/ CS, + /* b6 ¶ */ CS, + /* b7 · */ CS, + /* b8 ž */ CS, + /* b9 ¹ */ CS, + /* ba º */ CS, + /* bb » */ CS, + /* bc Œ */ CC | CHE, + /* bd œ */ CC | CHE, + /* be Ÿ */ CS, + /* bf ¿ */ CS, + /* c0 À */ CS | CHE, + /* c1 Á */ CS, + /* c2  */ CS, + /* c3 à */ CS | C2 | CHE, /* ? add CHE */ + /* c4 Ä */ VC | CHE, + /* c5 Å */ CS | C2, + /* c6 Æ */ VC | CHE, + /* c7 Ç */ VC | C2, + /* c8 È */ CS, + /* c9 É */ CS | CHB, + /* ca Ê */ CS | CHE, + /* cb Ë */ CC | CHE, + /* CC Ì */ CS | CHB | CHE, + /* cd Í */ VC, + /* ce Î */ CC | CHE, + /* cf Ï */ T, + /* d0 Ð */ VRE | VRA, + /* d1 Ñ */ VRS, + /* d2 Ò */ VRX | VRA, + /* d3 Ó */ VRE, + /* d4 Ô */ VRX | VRA, + /* d5 Õ */ VRX | VRA, + /* d6 Ö */ VRS, + /* d7 × */ VRS | VRA, + /* d8 Ø */ VRX, + /* d9 Ù */ VRX, + /* da Ú */ T, + /* db Û */ 0, + /* dc Ü */ 0, + /* dd Ý */ 0, + /* de Þ */ 0, + /* df ß */ T, + /* e0 à */ VLA, + /* e1 á */ VLO, + /* e2 â */ VLO, + /* e3 ã */ VLI, + /* e4 ä */ VLI, + /* e5 å */ VRE, + /* e6 æ */ M, + /* e7 ç */ M, + /* e8 è */ M | MT, + /* e9 é */ M | MT, + /* ea ê */ M | MT, + /* eb ë */ M | MT, + /* ec ì */ M, + /* ed í */ T, + /* ee î */ T, + /* ef ï */ T, + /* f0 ð */ T, + /* f1 ñ */ T, + /* f2 ò */ T, + /* f3 ó */ T, + /* f4 ô */ T, + /* f5 õ */ T, + /* f6 ö */ T, + /* f7 ÷ */ T, + /* f8 ø */ T, + /* f9 ù */ T, + /* fa ú */ T, + /* fb û */ T, + /* fc ü */ 0, + /* fd ý */ 0, + /* fe þ */ 0, + /* ff ’ */ 0}; diff --git a/intl/lwbrk/rulebrk.h b/intl/lwbrk/rulebrk.h new file mode 100644 index 0000000000..c1f2e0957b --- /dev/null +++ b/intl/lwbrk/rulebrk.h @@ -0,0 +1,26 @@ +/* +Copyright (c) 1999 Samphan Raruenrom +Permission to use, copy, modify, distribute and sell this software +and its documentation for any purpose is hereby granted without fee, +provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear +in supporting documentation. 
Samphan Raruenrom makes no +representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. +*/ +#ifndef __RULEBRK_H__ +#define __RULEBRK_H__ +#include "th_char.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int TrbWordBreakPos(const th_char* pstr, int left, const th_char* rstr, + int right); +int TrbFollowing(const th_char* begin, int length, int offset); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/intl/lwbrk/th_char.h b/intl/lwbrk/th_char.h new file mode 100644 index 0000000000..a088228fff --- /dev/null +++ b/intl/lwbrk/th_char.h @@ -0,0 +1,133 @@ +/* +Copyright (c) 1999 Samphan Raruenrom +Permission to use, copy, modify, distribute and sell this software +and its documentation for any purpose is hereby granted without fee, +provided that the above copyright notice appear in all copies and +that both that copyright notice and this permission notice appear +in supporting documentation. Samphan Raruenrom makes no +representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. +*/ +#ifndef __TH_CHAR_H__ +#define __TH_CHAR_H__ + +typedef unsigned char tis_char; + +#ifdef TH_UNICODE +/* + * The char16_t type is only usable in C++ code, so we need this ugly hack to + * select a binary compatible C type for the expat C code to use. 
+ */ +# ifdef __cplusplus +typedef char16_t th_char; +# else +typedef uint16_t th_char; +# endif +# define TH_THAIBEGIN_ 0x0e00 +# define th_isthai(c) (0x0e00 <= (c) && (c) <= 0x0e5f) +#else +typedef tis_char th_char; +# define TH_THAIBEGIN_ 0xa0 +# define th_isthai(c) ((c) >= 0xa0) +#endif +#define th_zcode(c) ((c)-TH_THAIBEGIN_) + +enum TH_CHARNAME { + TH_THAIBEGIN = TH_THAIBEGIN_, + TH_KOKAI, + TH_KHOKHAI, + TH_KHOKHUAT, + TH_KHOKHWAI, + TH_KHOKHON, + TH_KHORAKHANG, + TH_NGONGU, + TH_CHOCHAN, + TH_CHOCHING, + TH_CHOCHANG, + TH_SOSO, + TH_CHOCHOE, + TH_YOYING, + TH_DOCHADA, + TH_TOPATAK, + TH_THOTHAN, + TH_THONANGMONTHO, + TH_THOPHUTHAO, + TH_NONEN, + TH_DODEK, + TH_TOTAO, + TH_THOTHUNG, + TH_THOTHAHAN, + TH_THOTHONG, + TH_NONU, + TH_BOBAIMAI, + TH_POPLA, + TH_PHOPHUNG, + TH_FOFA, + TH_PHOPHAN, + TH_FOFAN, + TH_PHOSAMPHAO, + TH_MOMA, + TH_YOYAK, + TH_RORUA, + TH_RU, + TH_LOLING, + TH_LU, + TH_WOWAEN, + TH_SOSALA, + TH_SORUSI, + TH_SOSUA, + TH_HOHIP, + TH_LOCHULA, + TH_OANG, + TH_HONOKHUK, + TH_PAIYANNOI, + TH_SARA_A, + TH_MAIHANAKAT, + TH_SARA_AA, + TH_SARA_AM, + TH_SARA_I, + TH_SARA_II, + TH_SARA_UE, + TH_SARA_UEE, + TH_SARA_U, + TH_SARA_UU, + TH_PHINTHU, + TH_REM_CHERNG_, + TH_TAC_WBRK_, + TH_UNDEF_DD, + TH_UNDEF_DE, + TH_BAHT, + TH_SARA_E, + TH_SARA_AE, + TH_SARA_O, + TH_MAIMUAN, + TH_MAIMALAI, + TH_LAKKHANGYAO, + TH_MAIYAMOK, + TH_MAITAIKHU, + TH_MAIEK, + TH_MAITHO, + TH_MAITRI, + TH_MAICHATTAWA, + TH_THANTHAKHAT, + TH_NIKHAHIT, + TH_YAMAKKAN, + TH_FONGMAN, + TH_THAIZERO, + TH_THAIONE, + TH_THAITWO, + TH_THAITHREE, + TH_THAIFOUR, + TH_THAIFIVE, + TH_THAISIX, + TH_THAISEVEN, + TH_THAIEIGHT, + TH_THAININE, + TH_ANGKHANKHU, + TH_KHOMUT, + TH_UNDEF_FC, + TH_UNDEF_FD, + TH_UNDEF_FE, + TH_THAIEND +}; +#endif diff --git a/intl/lwbrk/tools/anzx4051.html b/intl/lwbrk/tools/anzx4051.html new file mode 100644 index 0000000000..9f3461a285 --- /dev/null +++ b/intl/lwbrk/tools/anzx4051.html @@ -0,0 +1,709 @@ + + + + + Analysis of JIS X 4051 to Unicode General Category 
Mapping + + +

Analysis of JIS X 4051 to Unicode General Category Mapping

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CLMNPSZTotalCcCfCoCsLlLmLoLtLuMcMeMnNdNlNoPcPdPePfPiPoPsScSkSmSoZlZpZs
00_11411512111
01_[a]32231368824211211721
02_7111
03_8111
04_9555
05_[b]33153332513239321153332513
06_15303030
07_1818157335612523911864758133045253643249811
08_COMPLEX543320211101531122101021
09_[c]347322
0A_[d]126251448111633192372
0B_[e]111361113
X0
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
00_101_[a]02_703_804_905_[b]06_1507_1808_COMPLEX09_[c]0A_[d]0B_[e]X
0033101277442
0E16201
1724110
20211151310044
21132163
301047161
+ + diff --git a/intl/lwbrk/tools/anzx4051.pl b/intl/lwbrk/tools/anzx4051.pl new file mode 100644 index 0000000000..e76eac6207 --- /dev/null +++ b/intl/lwbrk/tools/anzx4051.pl @@ -0,0 +1,356 @@ +#!/usr/bin/perl +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +###################################################################### +# +# Initial global variable +# +###################################################################### +%utot = (); +$ui=0; +$li=0; + +###################################################################### +# +# Open the unicode database file +# +###################################################################### +open ( UNICODATA , "< ../../unicharutil/tools/UnicodeData-Latest.txt") + || die "cannot find UnicodeData-Latest.txt"; + +###################################################################### +# +# Open the JIS X 4051 Class file +# +###################################################################### +open ( CLASS , "< jisx4051class.txt") + || die "cannot find jisx4051class.txt"; + +###################################################################### +# +# Open the JIS X 4051 Class simplified mapping +# +###################################################################### +open ( SIMP , "< jisx4051simp.txt") + || die "cannot find jisx4051simp.txt"; + +###################################################################### +# +# Open the output file +# +###################################################################### +open ( OUT , "> anzx4051.html") + || die "cannot open output anzx4051.html file"; + +###################################################################### +# +# Open the output file +# +###################################################################### +open ( HEADER , "> ../jisx4051class.h") + || die "cannot open output ../jisx4051class.h 
file"; + +###################################################################### +# +# Generate license and header +# +###################################################################### +$hthmlheader = < + + + + +Analysis of JIS X 4051 to Unicode General Category Mapping + + + +

+Analysis of JIS X 4051 to Unicode General Category Mapping +

+END_OF_HTML +print OUT $hthmlheader; + +###################################################################### +# +# Generate license and header +# +###################################################################### +$npl = <) { + chop; + ###################################################################### + # + # Get value from fields + # + ###################################################################### + @f = split(/;/ , $_); + $c = $f[0]; # The unicode value + $g = $f[2]; + $d = substr($g, 0, 1); + + $gcat{$c} = $g; + $dcat{$c} = $d; + $gcount{$g}++; + $dcount{$d}++; +} +close(UNIDATA); + +while() { + chop; + ###################################################################### + # + # Get value from fields + # + ###################################################################### + @f = split(/;/ , $_); + + $simp{$f[0]} = $f[1]; + $sccount{$f[1]}++; +} +close(SIMP); + +sub GetClass{ + my ($u) = @_; + my $hex = DecToHex($u); + $g = $gcat{$hex}; + if($g ne "") { + return $g; + } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) { + return "Han"; + } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) { + return "Lo"; + } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) { + return "Cs"; + } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) { + return "Cs"; + } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) { + return "Cs"; + } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) { + return "Co"; + } else { + printf "WARNING !!!! 
Cannot find General Category for U+%s \n" , $hex; + } +} +sub GetDClass{ + my ($u) = @_; + my $hex = DecToHex($u); + $g = $dcat{$hex}; + if($g ne "") { + return $g; + } elsif (( 0x3400 <= $u) && ( $u <= 0x9fa5 ) ) { + return "Han"; + } elsif (( 0xac00 <= $u) && ( $u <= 0xd7a3 ) ) { + return "L"; + } elsif (( 0xd800 <= $u) && ( $u <= 0xdb7f ) ) { + return "C"; + } elsif (( 0xdb80 <= $u) && ( $u <= 0xdbff ) ) { + return "C"; + } elsif (( 0xdc00 <= $u) && ( $u <= 0xdfff ) ) { + return "C"; + } elsif (( 0xe000 <= $u) && ( $u <= 0xf8ff ) ) { + return "C"; + } else { + printf "WARNING !!!! Cannot find Detailed General Category for U+%s \n" , $hex; + } +} +sub DecToHex{ + my ($d) = @_; + return sprintf("%04X", $d); +} +%gtotal = (); +%dtotal = (); +while() { + chop; + ###################################################################### + # + # Get value from fields + # + ###################################################################### + @f = split(/;/ , $_); + + if( substr($f[2], 0, 1) ne "a") + { + $sc = $simp{$f[2]}; + $l = hex($f[0]); + if($f[1] eq "") + { + $h = $l; + } else { + $h = hex($f[1]); + } + for($k = $l; $k <= $h ; $k++) + { + if( exists($occ{$k})) + { + # printf "WARNING !! Conflict defination!!! U+%s -> [%s] [%s | %s]\n", + # DecToHex($k), $occ{$k} , $f[2] , $sc; + } + else + { + $occ{$k} = $sc . " | " . $f[2]; + $gclass = GetClass($k); + $dclass = GetDClass($k); + $gtotal{$sc . $gclass}++; + $dtotal{$sc . $dclass}++; + $u = DecToHex($k); + $rk = " " . substr($u,0,2) . ":" . $sc; + $rangecount{$rk}++; + } + } + } +} + +#print %gtotal; +#print %dtotal; + +sub printreport +{ + print OUT "\n"; + print OUT "\n"; + } + + print OUT "\n"; + foreach $g (sort(keys %gcount)) { + print OUT "\n"; + } + print OUT "\n"; + foreach $sc (sort(keys %sccount)) { + + print OUT "\n"; + } + + print OUT "\n"; + + foreach $g (sort(keys %gcount)) { + $count = $gtotal{$sc . $g}; + print OUT "\n"; + } + + + print OUT "\n"; + } + print OUT "
\n"; + + foreach $d (sort(keys %dcount)) { + print OUT "$dTotal$g
$sc\n"; + + $total = 0; + foreach $d (sort (keys %dcount)) { + $count = $dtotal{$sc . $d}; + $total += $count; + print OUT "$count$total$count
\n"; + + + print OUT "\n"; + print OUT "\n"; + } + + print OUT "\n"; + + + for($rr = 0; $rr < 0x4f; $rr++) + { + $empty = 0; + $r = sprintf("%02X" , $rr) ; + $tmp = "\n", $count); + $empty += $count; + } + + $tmp .= "\n"; + + if($empty ne 0) + { + print OUT $tmp; + } + } + print OUT "
\n"; + + foreach $sc (sort(keys %sccount)) + { + print OUT "$sc
" . $r . "\n"; + + foreach $sc (sort(keys %sccount)) { + $count = $rangecount{ " " .$r . ":" .$sc}; + $tmp .= sprintf("%s
\n"; + +} +printreport(); + +sub printarray +{ + my($r, $def) = @_; +printf "[%s || %s]\n", $r, $def; + $k = hex($r) * 256; + printf HEADER "static const uint32_t gLBClass%s[32] = {\n", $r; + for($i = 0 ; $i < 256; $i+= 8) + { + for($j = 7 ; $j >= 0; $j-- ) + { + $v = $k + $i + $j; + if( exists($occ{$v})) + { + $p = substr($occ{$v}, 1,1); + } else { + $p = $def; + } + + if($j eq 7 ) + { + printf HEADER "0x%s" , $p; + } else { + printf HEADER "%s", $p ; + } + } + printf HEADER ", // U+%04X - U+%04X\n", $k + $i ,( $k + $i + 7); + } + print HEADER "};\n\n"; +} +printarray("00", "7"); +printarray("20", "7"); +printarray("21", "7"); +printarray("30", "5"); +printarray("0E", "8"); +printarray("17", "7"); + +#print %rangecount; + +###################################################################### +# +# Close files +# +###################################################################### +close(HEADER); +close(CLASS); +close(OUT); + diff --git a/intl/lwbrk/tools/jisx4051class.txt b/intl/lwbrk/tools/jisx4051class.txt new file mode 100644 index 0000000000..c435c1ae55 --- /dev/null +++ b/intl/lwbrk/tools/jisx4051class.txt @@ -0,0 +1,159 @@ +0000;001f;17 +0020;;17 +0024;;24 +0027;;18 +0028;;22 +002D;;18 +002F;;18 +0021;002F;23 +0030;0039;15 +003C;;22 +003A;003F;23 +0040;;18 +0041;005A;18 +005B;;22 +005E;;18 +005F;;18 +005B;005F;23 +0060;;18 +0061;007A;18 +007B;;22 +007B;007E;23 +00A0;;24 +00A3;;22 +00A5;;22 +00A9;;18 +00AA;;18 +00AB;;18 +00AC;;22 +00AE;;18 +00AF;;18 +00A1;00BF;23 +00B0;;18 +00F7;;23 +00C0;00FF;18 +0E3F;;1 +0E2F;;4 +0E46;;4 +0E5A;0E5B;4 +0E50;0E59;15 +0E4F;;18 +0EAF;;4 +0EC6;;4 +0ED0;0ED9;15 +1735;1736;1 +17D4;17D5;4 +17D8;;4 +17DA;;4 +1780;17DD;21 +17E0;17E9;21 +17F0;17F9;21 +2007;;24 +2000;200B;17 +200C;200F;18 +2010;;18 +2011;;24 +2012;2013;18 +2014;;7 +2015;;18 +2016;2017;18 +2019;;23 +201D;;23 +2018;201F;18 +2020;2023;18 +2024;2026;2 +2027;;23 +2028;202E;18 +202F;;24 +2030;2034;9 +2035;2038;18 +2039;;1 +203A;;2 +203B;;12 +203C;203D;3 +203E;;23 
+203F;2043;18 +2044;;3 +2045;;1 +2046;;2 +2047;2049;3 +204A;205E;18 +205F;;17 +2060;;24 +2061;2063;18 +206A;206F;18 +2070;2071;18 +2074;208E;18 +2090;2094;18 +2116;;8 +2160;217F;12 +2190;21EA;a12 +2126;;18 +2100;2138;18 +2153;2182;18 +2190;21EA;18 +3008;;1 +300A;;1 +300C;;1 +300E;;1 +3010;;1 +3014;;1 +3016;;1 +3018;;1 +301A;;1 +301D;;1 +3001;;2 +3009;;2 +300B;;2 +300D;;2 +300F;;2 +3011;;2 +3015;;2 +3017;;2 +3019;;2 +301B;;2 +301E;;2 +301F;;2 +3005;;3 +301C;;3 +3041;;3 +3043;;3 +3045;;3 +3047;;3 +3049;;3 +3063;;3 +3083;;3 +3085;;3 +3087;;3 +308E;;3 +309D;;3 +309E;;3 +30A1;;3 +30A3;;3 +30A5;;3 +30A7;;3 +30A9;;3 +30C3;;3 +30E3;;3 +30E5;;3 +30E7;;3 +30EE;;3 +30F5;;3 +30F6;;3 +30FC;;3 +30FD;;3 +30FE;;3 +30FB;;5 +3002;;6 +3000;;10 +3042;3094;11 +3099;309E;3 +3003;;12 +3004;;12 +3006;;12 +3007;;12 +3012;;12 +3013;;12 +3020;;12 +3036;;12 +30A2;30FA;12 diff --git a/intl/lwbrk/tools/jisx4051simp.txt b/intl/lwbrk/tools/jisx4051simp.txt new file mode 100644 index 0000000000..e12a7fd805 --- /dev/null +++ b/intl/lwbrk/tools/jisx4051simp.txt @@ -0,0 +1,24 @@ +1;00_1 +2;01_[a] +3;01_[a] +4;01_[a] +5;01_[a] +6;01_[a] +7;02_7 +8;03_8 +9;04_9 +10;05_[b] +11;05_[b] +12;05_[b] +13;X +14;X +15;06_15 +16;X +17;05_[b] +18;07_18 +19;X +20;X +21;08_COMPLEX +22;09_[c] +23;0A_[d] +24;0B_[e] diff --git a/intl/lwbrk/tools/spec_table.html b/intl/lwbrk/tools/spec_table.html new file mode 100644 index 0000000000..b7a642a332 --- /dev/null +++ b/intl/lwbrk/tools/spec_table.html @@ -0,0 +1,664 @@ + + + + + + + + + + +

This is a specification table for line breaking.

+

+ In the IE7 and Opera9 columns, 'A' means that the line is breakable After + the character, and 'B' means Before. 'BA' means both Before and After. +

+

+ The (C) suffix on the IE7 and Opera9 column names means Character, and (N) + means Numeric. It indicates what surrounds the character under test. E.g., + "a$a" is a testcase for (C), "0$0" is a testcase for (N). +

+

+ Gecko does not break lines in most Western-language contexts. However, in + file paths, URLs and very long words joined by hyphens, some + characters may be breakable. These are marked 'breakable' in the table. + They are not always breakable, though; it depends on the context within the + word. +


characterGeckoIE7(C)IE7(N)Opera9.2(C)Opera9.2(N)
characterGeckoIE7(C)IE7(N)Opera9.2(C)Opera9.2(N)
0x21!AA
0x22"
0x23#
0x24$B
0x25%breakableAA
0x26&breakable
0x27'
0x28(BB
0x29)AA
0x2A*
0x2B+
0x2C,
0x2D-breakableBABAAA
0x2E.
0x2F/breakableAA
0x3A:
0x3B;breakable
0x3C<
0x3D=
0x3E>
0x3F?AA
0x40@
0x5B[BB
0x5C\breakableB
0x5D]AA
0x5E^
0x5F_
0x60`
0x7B{BB
0x7C|AA
0x7D}AA
0x7E~
0xA1¡
0xA2¢AA
0xA3£B
0xA4¤
0xA5¥B
0xA6¦
0xA7§
0xA8¨
0xA9©
0xAAª
0xAB«
0xAC¬
0xAE®
0xAF¯
0xB0°AA
0xB1±
0xB2²
0xB3³
0xB4´BB
0xB5µ
0xB6
0xB7·
0xB8¸
0xB9¹
0xBAº
0xBB»
0xBC¼
0xBD½
0xBE¾
0xBF¿
0xD7×
0xF7÷
+ + -- cgit v1.2.3